  1. {
  2. "cells": [
  3. {
  4. "cell_type": "markdown",
  5. "id": "cdc25fcd",
  6. "metadata": {},
  7. "source": [
  8. "# T1. dataset 和 vocabulary 的基本使用\n",
  9. "\n",
  10. "  1   dataset 的使用与结构\n",
  11. " \n",
  12. "    1.1   dataset 的结构与创建\n",
  13. "\n",
  14. "    1.2   dataset 的数据预处理\n",
  15. "\n",
  16. "    1.3   延伸:instance 和 field\n",
  17. "\n",
  18. "  2   vocabulary 的结构与使用\n",
  19. "\n",
  20. "    2.1   vocabulary 的创建与修改\n",
  21. "\n",
  22. "    2.2   vocabulary 与 OOV 问题\n",
  23. "\n",
  24. "  3   dataset 和 vocabulary 的组合使用\n",
  25. " \n",
  26. "    3.1   从 dataframe 中加载 dataset\n",
  27. "\n",
  28. "    3.2   从 dataset 中获取 vocabulary"
  29. ]
  30. },
  31. {
  32. "cell_type": "markdown",
  33. "id": "0eb18a22",
  34. "metadata": {},
  35. "source": [
  36. "## 1. dataset 的基本使用\n",
  37. "\n",
  38. "### 1.1 dataset 的结构与创建\n",
  39. "\n",
  40. "在`fastNLP 0.8`中,使用`DataSet`模块表示数据集,**`dataset`类似于关系型数据库中的数据表**(下文统一为小写`dataset`)\n",
  41. "\n",
  42. "  **主要包含`field`字段和`instance`实例两个元素**,对应`table`中的`field`字段和`record`记录\n",
  43. "\n",
  44. "在`fastNLP 0.8`中,`DataSet`模块被定义在`fastNLP.core.dataset`路径下,导入该模块后,最简单的\n",
  45. "\n",
  46. "  初始化方法,即将字典形式的表格 **`{'field1': column1, 'field2': column2, ...}`** 传入构造函数"
  47. ]
  48. },
  49. {
  50. "cell_type": "code",
  51. "execution_count": 1,
  52. "id": "a1d69ad2",
  53. "metadata": {},
  54. "outputs": [
  55. {
  56. "data": {
  57. "text/html": [
  58. "<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\">\n",
  59. "</pre>\n"
  60. ],
  61. "text/plain": [
  62. "\n"
  63. ]
  64. },
  65. "metadata": {},
  66. "output_type": "display_data"
  67. },
  68. {
  69. "name": "stdout",
  70. "output_type": "stream",
  71. "text": [
  72. "+-----+------------------------+------------------------+-----+\n",
  73. "| idx | sentence | words | num |\n",
  74. "+-----+------------------------+------------------------+-----+\n",
  75. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  76. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  77. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  78. "+-----+------------------------+------------------------+-----+\n"
  79. ]
  80. }
  81. ],
  82. "source": [
  83. "from fastNLP import DataSet\n",
  84. "\n",
  85. "data = {'idx': [0, 1, 2], \n",
  86. " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"],\n",
  87. " 'words': [['This', 'is', 'an', 'apple', '.'], \n",
  88. " ['I', 'like', 'apples', '.'], \n",
  89. " ['Apples', 'are', 'good', 'for', 'our', 'health', '.']],\n",
  90. " 'num': [5, 4, 7]}\n",
  91. "\n",
  92. "dataset = DataSet(data)\n",
  93. "print(dataset)"
  94. ]
  95. },
  96. {
  97. "cell_type": "markdown",
  98. "id": "9260fdc6",
  99. "metadata": {},
  100. "source": [
  101. "&emsp; 在`dataset`的实例中,字段`field`的名称和实例`instance`中的字符串也可以中文"
  102. ]
  103. },
  104. {
  105. "cell_type": "code",
  106. "execution_count": 2,
  107. "id": "3d72ef00",
  108. "metadata": {},
  109. "outputs": [
  110. {
  111. "name": "stdout",
  112. "output_type": "stream",
  113. "text": [
  114. "+------+--------------------+------------------------+------+\n",
  115. "| 序号 | 句子 | 字符 | 长度 |\n",
  116. "+------+--------------------+------------------------+------+\n",
  117. "| 0 | 生活就像海洋, | ['生', '活', '就', ... | 7 |\n",
  118. "| 1 | 只有意志坚强的人, | ['只', '有', '意', ... | 9 |\n",
  119. "| 2 | 才能到达彼岸。 | ['才', '能', '到', ... | 7 |\n",
  120. "+------+--------------------+------------------------+------+\n"
  121. ]
  122. }
  123. ],
  124. "source": [
  125. "temp = {'序号': [0, 1, 2], \n",
  126. " '句子':[\"生活就像海洋,\", \"只有意志坚强的人,\", \"才能到达彼岸。\"],\n",
  127. " '字符': [['生', '活', '就', '像', '海', '洋', ','], \n",
  128. " ['只', '有', '意', '志', '坚', '强', '的', '人', ','], \n",
  129. " ['才', '能', '到', '达', '彼', '岸', '。']],\n",
  130. " '长度': [7, 9, 7]}\n",
  131. "\n",
  132. "chinese = DataSet(temp)\n",
  133. "print(chinese)"
  134. ]
  135. },
  136. {
  137. "cell_type": "markdown",
  138. "id": "202e5490",
  139. "metadata": {},
  140. "source": [
  141. "在`dataset`中,使用`drop`方法可以删除满足条件的实例,这里使用了python中的`lambda`表达式\n",
  142. "\n",
  143. "&emsp; 注一:在`drop`方法中,通过设置`inplace`参数将删除对应实例后的`dataset`作为一个新的实例生成"
  144. ]
  145. },
  146. {
  147. "cell_type": "code",
  148. "execution_count": 3,
  149. "id": "09b478f8",
  150. "metadata": {},
  151. "outputs": [
  152. {
  153. "name": "stdout",
  154. "output_type": "stream",
  155. "text": [
  156. "2492313174344 2491986424200\n",
  157. "+-----+------------------------+------------------------+-----+\n",
  158. "| idx | sentence | words | num |\n",
  159. "+-----+------------------------+------------------------+-----+\n",
  160. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  161. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  162. "+-----+------------------------+------------------------+-----+\n",
  163. "+-----+------------------------+------------------------+-----+\n",
  164. "| idx | sentence | words | num |\n",
  165. "+-----+------------------------+------------------------+-----+\n",
  166. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  167. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  168. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  169. "+-----+------------------------+------------------------+-----+\n"
  170. ]
  171. }
  172. ],
  173. "source": [
  174. "dropped = dataset\n",
  175. "dropped = dropped.drop(lambda ins:ins['num'] < 5, inplace=False)\n",
  176. "print(id(dropped), id(dataset))\n",
  177. "print(dropped)\n",
  178. "print(dataset)"
  179. ]
  180. },
  181. {
  182. "cell_type": "markdown",
  183. "id": "aa277674",
  184. "metadata": {},
  185. "source": [
  186. "&emsp; 注二:**对对象使用等号一般表示传引用**,所以对`dataset`使用等号,是传引用而不是赋值\n",
  187. "\n",
  188. "&emsp; &emsp; 如下所示,**`dropped`和`dataset`具有相同`id`**,**对`dropped`执行删除操作`dataset`同时会被修改**"
  189. ]
  190. },
  191. {
  192. "cell_type": "code",
  193. "execution_count": 4,
  194. "id": "77c8583a",
  195. "metadata": {},
  196. "outputs": [
  197. {
  198. "name": "stdout",
  199. "output_type": "stream",
  200. "text": [
  201. "2491986424200 2491986424200\n",
  202. "+-----+------------------------+------------------------+-----+\n",
  203. "| idx | sentence | words | num |\n",
  204. "+-----+------------------------+------------------------+-----+\n",
  205. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  206. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  207. "+-----+------------------------+------------------------+-----+\n",
  208. "+-----+------------------------+------------------------+-----+\n",
  209. "| idx | sentence | words | num |\n",
  210. "+-----+------------------------+------------------------+-----+\n",
  211. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  212. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  213. "+-----+------------------------+------------------------+-----+\n"
  214. ]
  215. }
  216. ],
  217. "source": [
  218. "dropped = dataset\n",
  219. "dropped.drop(lambda ins:ins['num'] < 5)\n",
  220. "print(id(dropped), id(dataset))\n",
  221. "print(dropped)\n",
  222. "print(dataset)"
  223. ]
  224. },
  225. {
  226. "cell_type": "markdown",
  227. "id": "a76199dc",
  228. "metadata": {},
  229. "source": [
  230. "在`dataset`中,使用`delet_instance`方法可以删除对应序号的`instance`实例,序号从0开始"
  231. ]
  232. },
  233. {
  234. "cell_type": "code",
  235. "execution_count": 5,
  236. "id": "d8824b40",
  237. "metadata": {},
  238. "outputs": [
  239. {
  240. "name": "stdout",
  241. "output_type": "stream",
  242. "text": [
  243. "+-----+--------------------+------------------------+-----+\n",
  244. "| idx | sentence | words | num |\n",
  245. "+-----+--------------------+------------------------+-----+\n",
  246. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  247. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  248. "+-----+--------------------+------------------------+-----+\n"
  249. ]
  250. }
  251. ],
  252. "source": [
  253. "dataset = DataSet(data)\n",
  254. "dataset.delete_instance(2)\n",
  255. "print(dataset)"
  256. ]
  257. },
  258. {
  259. "cell_type": "markdown",
  260. "id": "f4fa9f33",
  261. "metadata": {},
  262. "source": [
  263. "在`dataset`中,使用`delet_field`方法可以删除对应名称的`field`字段"
  264. ]
  265. },
  266. {
  267. "cell_type": "code",
  268. "execution_count": 6,
  269. "id": "f68ddb40",
  270. "metadata": {},
  271. "outputs": [
  272. {
  273. "name": "stdout",
  274. "output_type": "stream",
  275. "text": [
  276. "+-----+--------------------+------------------------------+\n",
  277. "| idx | sentence | words |\n",
  278. "+-----+--------------------+------------------------------+\n",
  279. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  280. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  281. "+-----+--------------------+------------------------------+\n"
  282. ]
  283. }
  284. ],
  285. "source": [
  286. "dataset.delete_field('num')\n",
  287. "print(dataset)"
  288. ]
  289. },
  290. {
  291. "cell_type": "markdown",
  292. "id": "b1e9d42c",
  293. "metadata": {},
  294. "source": [
  295. "### 1.2 dataset 的数据预处理\n",
  296. "\n",
  297. "在`dataset`模块中,`apply`、`apply_field`、`apply_more`和`apply_field_more`函数可以进行简单的数据预处理\n",
  298. "\n",
  299. "&emsp; **`apply`和`apply_more`输入整条实例**,**`apply_field`和`apply_field_more`仅输入实例的部分字段**\n",
  300. "\n",
  301. "&emsp; **`apply`和`apply_field`仅输出单个字段**,**`apply_more`和`apply_field_more`则是输出多个字段**\n",
  302. "\n",
  303. "&emsp; **`apply`和`apply_field`返回的是个列表**,**`apply_more`和`apply_field_more`返回的是个字典**\n",
  304. "\n",
  305. "&emsp; &emsp; 预处理过程中,通过`progress_bar`参数设置显示进度条类型,通过`num_proc`设置多进程\n",
  306. "***\n",
  307. "\n",
  308. "`apply`的参数包括一个函数`func`和一个新字段名`new_field_name`,函数`func`的处理对象是`dataset`模块中\n",
  309. "\n",
  310. "&emsp; 的每个`instance`实例,函数`func`的处理结果存放在`new_field_name`对应的新建字段内"
  311. ]
  312. },
  313. {
  314. "cell_type": "code",
  315. "execution_count": 7,
  316. "id": "72a0b5f9",
  317. "metadata": {},
  318. "outputs": [
  319. {
  320. "data": {
  321. "application/vnd.jupyter.widget-view+json": {
  322. "model_id": "",
  323. "version_major": 2,
  324. "version_minor": 0
  325. },
  326. "text/plain": [
  327. "Processing: 0%| | 0/3 [00:00<?, ?it/s]"
  328. ]
  329. },
  330. "metadata": {},
  331. "output_type": "display_data"
  332. },
  333. {
  334. "name": "stdout",
  335. "output_type": "stream",
  336. "text": [
  337. "+-----+------------------------------+------------------------------+\n",
  338. "| idx | sentence | words |\n",
  339. "+-----+------------------------------+------------------------------+\n",
  340. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  341. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  342. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  343. "+-----+------------------------------+------------------------------+\n"
  344. ]
  345. }
  346. ],
  347. "source": [
  348. "from fastNLP import DataSet\n",
  349. "\n",
  350. "data = {'idx': [0, 1, 2], \n",
  351. " 'sentence':[\"This is an apple .\", \"I like apples .\", \"Apples are good for our health .\"], }\n",
  352. "dataset = DataSet(data)\n",
  353. "dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words', progress_bar=\"tqdm\") #\n",
  354. "print(dataset)"
  355. ]
  356. },
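{
"cell_type": "markdown",
"id": "7a3c1e90",
"metadata": {},
"source": [
"&emsp; As an unexecuted sketch of the `num_proc` parameter mentioned above: the same call can fan the work out over worker processes; on a toy three-instance `dataset` this brings no gain, it only shows the call shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a3c1e91",
"metadata": {},
"outputs": [],
"source": [
"# sketch only: identical tokenization, distributed over 2 worker processes\n",
"# (a named, picklable function may be required instead of a lambda on some platforms)\n",
"dataset.apply(lambda ins: ins['sentence'].split(), new_field_name='words',\n",
"              progress_bar=\"tqdm\", num_proc=2)"
]
},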
  357. {
  358. "cell_type": "markdown",
  359. "id": "c10275ee",
  360. "metadata": {},
  361. "source": [
  362. "&emsp; **`apply`使用的函数可以是一个基于`lambda`表达式的匿名函数**,**也可以是一个自定义的函数**"
  363. ]
  364. },
  365. {
  366. "cell_type": "code",
  367. "execution_count": 8,
  368. "id": "b1a8631f",
  369. "metadata": {},
  370. "outputs": [
  371. {
  372. "data": {
  373. "application/vnd.jupyter.widget-view+json": {
  374. "model_id": "",
  375. "version_major": 2,
  376. "version_minor": 0
  377. },
  378. "text/plain": [
  379. "Processing: 0%| | 0/3 [00:00<?, ?it/s]"
  380. ]
  381. },
  382. "metadata": {},
  383. "output_type": "display_data"
  384. },
  385. {
  386. "name": "stdout",
  387. "output_type": "stream",
  388. "text": [
  389. "+-----+------------------------------+------------------------------+\n",
  390. "| idx | sentence | words |\n",
  391. "+-----+------------------------------+------------------------------+\n",
  392. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  393. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  394. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  395. "+-----+------------------------------+------------------------------+\n"
  396. ]
  397. }
  398. ],
  399. "source": [
  400. "dataset = DataSet(data)\n",
  401. "\n",
  402. "def get_words(instance):\n",
  403. " sentence = instance['sentence']\n",
  404. " words = sentence.split()\n",
  405. " return words\n",
  406. "\n",
  407. "dataset.apply(get_words, new_field_name='words', progress_bar=\"tqdm\")\n",
  408. "print(dataset)"
  409. ]
  410. },
  411. {
  412. "cell_type": "markdown",
  413. "id": "64abf745",
  414. "metadata": {},
  415. "source": [
  416. "`apply_field`的参数,除了函数`func`外还有`field_name`和`new_field_name`,该函数`func`的处理对象仅\n",
  417. "\n",
  418. "&emsp; 是`dataset`模块中的每个`field_name`对应的字段内容,处理结果存放在`new_field_name`对应的新建字段内"
  419. ]
  420. },
  421. {
  422. "cell_type": "code",
  423. "execution_count": 9,
  424. "id": "057c1d2c",
  425. "metadata": {},
  426. "outputs": [
  427. {
  428. "data": {
  429. "application/vnd.jupyter.widget-view+json": {
  430. "model_id": "",
  431. "version_major": 2,
  432. "version_minor": 0
  433. },
  434. "text/plain": [
  435. "Processing: 0%| | 0/3 [00:00<?, ?it/s]"
  436. ]
  437. },
  438. "metadata": {},
  439. "output_type": "display_data"
  440. },
  441. {
  442. "name": "stdout",
  443. "output_type": "stream",
  444. "text": [
  445. "+-----+------------------------------+------------------------------+\n",
  446. "| idx | sentence | words |\n",
  447. "+-----+------------------------------+------------------------------+\n",
  448. "| 0 | This is an apple . | ['This', 'is', 'an', 'app... |\n",
  449. "| 1 | I like apples . | ['I', 'like', 'apples', '... |\n",
  450. "| 2 | Apples are good for our h... | ['Apples', 'are', 'good',... |\n",
  451. "+-----+------------------------------+------------------------------+\n"
  452. ]
  453. }
  454. ],
  455. "source": [
  456. "dataset = DataSet(data)\n",
  457. "dataset.apply_field(lambda sent:sent.split(), field_name='sentence', new_field_name='words', \n",
  458. " progress_bar=\"tqdm\")\n",
  459. "print(dataset)"
  460. ]
  461. },
  462. {
  463. "cell_type": "markdown",
  464. "id": "5a9cc8b2",
  465. "metadata": {},
  466. "source": [
  467. "`apply_more`的参数只有函数`func`,函数`func`的处理对象是`dataset`模块中的每个`instance`实例\n",
  468. "\n",
  469. "&emsp; 要求函数`func`返回一个字典,根据字典的`key-value`确定存储在`dataset`中的字段名称与内容"
  470. ]
  471. },
  472. {
  473. "cell_type": "code",
  474. "execution_count": 10,
  475. "id": "51e2f02c",
  476. "metadata": {},
  477. "outputs": [
  478. {
  479. "data": {
  480. "application/vnd.jupyter.widget-view+json": {
  481. "model_id": "",
  482. "version_major": 2,
  483. "version_minor": 0
  484. },
  485. "text/plain": [
  486. "Processing: 0%| | 0/3 [00:00<?, ?it/s]"
  487. ]
  488. },
  489. "metadata": {},
  490. "output_type": "display_data"
  491. },
  492. {
  493. "name": "stdout",
  494. "output_type": "stream",
  495. "text": [
  496. "+-----+------------------------+------------------------+-----+\n",
  497. "| idx | sentence | words | num |\n",
  498. "+-----+------------------------+------------------------+-----+\n",
  499. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  500. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  501. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  502. "+-----+------------------------+------------------------+-----+\n"
  503. ]
  504. }
  505. ],
  506. "source": [
  507. "dataset = DataSet(data)\n",
  508. "dataset.apply_more(lambda ins:{'words': ins['sentence'].split(), 'num': len(ins['sentence'].split())}, \n",
  509. " progress_bar=\"tqdm\")\n",
  510. "print(dataset)"
  511. ]
  512. },
  513. {
  514. "cell_type": "markdown",
  515. "id": "02d2b7ef",
  516. "metadata": {},
  517. "source": [
  518. "`apply_more`的参数只有函数`func`,函数`func`的处理对象是`dataset`模块中的每个`instance`实例\n",
  519. "\n",
  520. "&emsp; 要求函数`func`返回一个字典,根据字典的`key-value`确定存储在`dataset`中的字段名称与内容"
  521. ]
  522. },
  523. {
  524. "cell_type": "code",
  525. "execution_count": 11,
  526. "id": "db4295d5",
  527. "metadata": {},
  528. "outputs": [
  529. {
  530. "data": {
  531. "application/vnd.jupyter.widget-view+json": {
  532. "model_id": "",
  533. "version_major": 2,
  534. "version_minor": 0
  535. },
  536. "text/plain": [
  537. "Processing: 0%| | 0/3 [00:00<?, ?it/s]"
  538. ]
  539. },
  540. "metadata": {},
  541. "output_type": "display_data"
  542. },
  543. {
  544. "name": "stdout",
  545. "output_type": "stream",
  546. "text": [
  547. "+-----+------------------------+------------------------+-----+\n",
  548. "| idx | sentence | words | num |\n",
  549. "+-----+------------------------+------------------------+-----+\n",
  550. "| 0 | This is an apple . | ['This', 'is', 'an'... | 5 |\n",
  551. "| 1 | I like apples . | ['I', 'like', 'appl... | 4 |\n",
  552. "| 2 | Apples are good for... | ['Apples', 'are', '... | 7 |\n",
  553. "+-----+------------------------+------------------------+-----+\n"
  554. ]
  555. }
  556. ],
  557. "source": [
  558. "dataset = DataSet(data)\n",
  559. "dataset.apply_field_more(lambda sent:{'words': sent.split(), 'num': len(sent.split())}, \n",
  560. " field_name='sentence', progress_bar=\"tqdm\")\n",
  561. "print(dataset)"
  562. ]
  563. },
  564. {
  565. "cell_type": "markdown",
  566. "id": "9c09e592",
  567. "metadata": {},
  568. "source": [
  569. "### 1.3 延伸:instance 和 field\n",
  570. "\n",
  571. "在`fastNLP 0.8`中,使用`Instance`模块表示数据集`dataset`中的每条数据,被称为实例\n",
  572. "\n",
  573. "&emsp; 构造方式类似于构造一个字典,通过键值相同的`Instance`列表,也可以初始化一个`dataset`,代码如下"
  574. ]
  575. },
  576. {
  577. "cell_type": "code",
  578. "execution_count": 12,
  579. "id": "012f537c",
  580. "metadata": {},
  581. "outputs": [],
  582. "source": [
  583. "from fastNLP import DataSet\n",
  584. "from fastNLP import Instance\n",
  585. "\n",
  586. "dataset = DataSet([\n",
  587. " Instance(sentence=\"This is an apple .\",\n",
  588. " words=['This', 'is', 'an', 'apple', '.'],\n",
  589. " num=5),\n",
  590. " Instance(sentence=\"I like apples .\",\n",
  591. " words=['I', 'like', 'apples', '.'],\n",
  592. " num=4),\n",
  593. " Instance(sentence=\"Apples are good for our health .\",\n",
  594. " words=['Apples', 'are', 'good', 'for', 'our', 'health', '.'],\n",
  595. " num=7),\n",
  596. " ])"
  597. ]
  598. },
  599. {
  600. "cell_type": "markdown",
  601. "id": "2fafb1ef",
  602. "metadata": {},
  603. "source": [
  604. "&emsp; 通过`items`、`keys`和`values`方法,可以分别获得`dataset`的`item`列表、`key`列表、`value`列表"
  605. ]
  606. },
  607. {
  608. "cell_type": "code",
  609. "execution_count": 13,
  610. "id": "a4c1c10d",
  611. "metadata": {},
  612. "outputs": [
  613. {
  614. "name": "stdout",
  615. "output_type": "stream",
  616. "text": [
  617. "dict_items([('sentence', 'This is an apple .'), ('words', ['This', 'is', 'an', 'apple', '.']), ('num', 5)])\n",
  618. "dict_keys(['sentence', 'words', 'num'])\n",
  619. "dict_values(['This is an apple .', ['This', 'is', 'an', 'apple', '.'], 5])\n"
  620. ]
  621. }
  622. ],
  623. "source": [
  624. "ins = Instance(sentence=\"This is an apple .\", words=['This', 'is', 'an', 'apple', '.'], num=5)\n",
  625. "\n",
  626. "print(ins.items())\n",
  627. "print(ins.keys())\n",
  628. "print(ins.values())"
  629. ]
  630. },
  631. {
  632. "cell_type": "markdown",
  633. "id": "b5459a2d",
  634. "metadata": {},
  635. "source": [
  636. "&emsp; 通过`add_field`方法,可以在`Instance`实例中,通过参数`field_name`添加字段,通过参数`field`赋值"
  637. ]
  638. },
  639. {
  640. "cell_type": "code",
  641. "execution_count": 14,
  642. "id": "55376402",
  643. "metadata": {},
  644. "outputs": [
  645. {
  646. "name": "stdout",
  647. "output_type": "stream",
  648. "text": [
  649. "+--------------------+------------------------+-----+-----+\n",
  650. "| sentence | words | num | idx |\n",
  651. "+--------------------+------------------------+-----+-----+\n",
  652. "| This is an apple . | ['This', 'is', 'an'... | 5 | 0 |\n",
  653. "+--------------------+------------------------+-----+-----+\n"
  654. ]
  655. }
  656. ],
  657. "source": [
  658. "ins.add_field(field_name='idx', field=0)\n",
  659. "print(ins)"
  660. ]
  661. },
  662. {
  663. "cell_type": "markdown",
  664. "id": "49caaa9c",
  665. "metadata": {},
  666. "source": [
  667. "在`fastNLP 0.8`中,使用`FieldArray`模块表示数据集`dataset`中的每条字段名(注:没有`field`类)\n",
  668. "\n",
  669. "&emsp; 通过`get_all_fields`方法可以获取`dataset`的字段列表\n",
  670. "\n",
  671. "&emsp; 通过`get_field_names`方法可以获取`dataset`的字段名称列表,代码如下"
  672. ]
  673. },
  674. {
  675. "cell_type": "code",
  676. "execution_count": 15,
  677. "id": "fe15f4c1",
  678. "metadata": {},
  679. "outputs": [
  680. {
  681. "data": {
  682. "text/plain": [
  683. "{'sentence': <fastNLP.core.dataset.field.FieldArray at 0x2444977fe88>,\n",
  684. " 'words': <fastNLP.core.dataset.field.FieldArray at 0x2444977ff08>,\n",
  685. " 'num': <fastNLP.core.dataset.field.FieldArray at 0x2444977ff88>}"
  686. ]
  687. },
  688. "execution_count": 15,
  689. "metadata": {},
  690. "output_type": "execute_result"
  691. }
  692. ],
  693. "source": [
  694. "dataset.get_all_fields()"
  695. ]
  696. },
  697. {
  698. "cell_type": "code",
  699. "execution_count": 16,
  700. "id": "5433815c",
  701. "metadata": {},
  702. "outputs": [
  703. {
  704. "data": {
  705. "text/plain": [
  706. "['num', 'sentence', 'words']"
  707. ]
  708. },
  709. "execution_count": 16,
  710. "metadata": {},
  711. "output_type": "execute_result"
  712. }
  713. ],
  714. "source": [
  715. "dataset.get_field_names()"
  716. ]
  717. },
  718. {
  719. "cell_type": "markdown",
  720. "id": "4964eeed",
  721. "metadata": {},
  722. "source": [
  723. "其他`dataset`的基本使用:通过`in`或者`has_field`方法可以判断`dataset`的是否包含某种字段\n",
  724. "\n",
  725. "&emsp; 通过`rename_field`方法可以更改`dataset`中的字段名称;通过`concat`方法可以实现两个`dataset`中的拼接\n",
  726. "\n",
  727. "&emsp; 通过`len`可以统计`dataset`中的实例数目;`dataset`的全部变量与函数可以通过`dir(dataset)`查询"
  728. ]
  729. },
  730. {
  731. "cell_type": "code",
  732. "execution_count": 17,
  733. "id": "25ce5488",
  734. "metadata": {},
  735. "outputs": [
  736. {
  737. "name": "stdout",
  738. "output_type": "stream",
  739. "text": [
  740. "3 False\n",
  741. "6 True\n",
  742. "+------------------------------+------------------------------+--------+\n",
  743. "| sentence | words | length |\n",
  744. "+------------------------------+------------------------------+--------+\n",
  745. "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
  746. "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
  747. "| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
  748. "| This is an apple . | ['This', 'is', 'an', 'app... | 5 |\n",
  749. "| I like apples . | ['I', 'like', 'apples', '... | 4 |\n",
  750. "| Apples are good for our h... | ['Apples', 'are', 'good',... | 7 |\n",
  751. "+------------------------------+------------------------------+--------+\n"
  752. ]
  753. }
  754. ],
  755. "source": [
  756. "print(len(dataset), dataset.has_field('length')) \n",
  757. "if 'num' in dataset:\n",
  758. " dataset.rename_field('num', 'length')\n",
  759. "elif 'length' in dataset:\n",
  760. " dataset.rename_field('length', 'num')\n",
  761. "dataset.concat(dataset)\n",
  762. "print(len(dataset), dataset.has_field('length')) \n",
  763. "print(dataset) "
  764. ]
  765. },
  766. {
  767. "cell_type": "markdown",
  768. "id": "e30a6cd7",
  769. "metadata": {},
  770. "source": [
  771. "## 2. vocabulary 的结构与使用\n",
  772. "\n",
  773. "### 2.1 vocabulary 的创建与修改\n",
  774. "\n",
  775. "在`fastNLP 0.8`中,使用`Vocabulary`模块表示词汇表,**`vocabulary`的核心是从单词到序号的映射**\n",
  776. "\n",
  777. "&emsp; 可以直接通过构造函数实例化,通过查找`word2idx`属性,可以找到`vocabulary`映射对应的字典实现\n",
  778. "\n",
  779. "&emsp; **默认补零`padding`用`<pad>`表示**,**对应序号为0**;**未知单词`unknown`用`<unk>`表示**,**对应序号1**\n",
  780. "\n",
  781. "&emsp; 通过打印`vocabulary`可以看到词汇表中的单词列表,其中,`padding`和`unknown`不会显示"
  782. ]
  783. },
  784. {
  785. "cell_type": "code",
  786. "execution_count": 18,
  787. "id": "3515e096",
  788. "metadata": {},
  789. "outputs": [
  790. {
  791. "name": "stdout",
  792. "output_type": "stream",
  793. "text": [
  794. "Vocabulary([]...)\n",
  795. "{'<pad>': 0, '<unk>': 1}\n",
  796. "<pad> 0\n",
  797. "<unk> 1\n"
  798. ]
  799. }
  800. ],
  801. "source": [
  802. "from fastNLP import Vocabulary\n",
  803. "\n",
  804. "vocab = Vocabulary()\n",
  805. "print(vocab)\n",
  806. "print(vocab.word2idx)\n",
  807. "print(vocab.padding, vocab.padding_idx)\n",
  808. "print(vocab.unknown, vocab.unknown_idx)"
  809. ]
  810. },
  811. {
  812. "cell_type": "markdown",
  813. "id": "640be126",
  814. "metadata": {},
  815. "source": [
  816. "在`vocabulary`中,通过`add_word`方法或`add_word_lst`方法,可以单独或批量添加单词\n",
  817. "\n",
  818. "&emsp; 通过`len`或`word_count`属性,可以显示`vocabulary`的单词量和每个单词添加的次数"
  819. ]
  820. },
  821. {
  822. "cell_type": "code",
  823. "execution_count": 19,
  824. "id": "88c7472a",
  825. "metadata": {},
  826. "outputs": [
  827. {
  828. "name": "stdout",
  829. "output_type": "stream",
  830. "text": [
  831. "5 Counter({'生活': 1, '就像': 1, '海洋': 1})\n",
  832. "6 Counter({'生活': 1, '就像': 1, '海洋': 1, '只有': 1})\n",
  833. "6 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5}\n"
  834. ]
  835. }
  836. ],
  837. "source": [
  838. "vocab.add_word_lst(['生活', '就像', '海洋'])\n",
  839. "print(len(vocab), vocab.word_count)\n",
  840. "vocab.add_word('只有')\n",
  841. "print(len(vocab), vocab.word_count)\n",
  842. "print(len(vocab), vocab.word2idx)"
  843. ]
  844. },
  845. {
  846. "cell_type": "markdown",
  847. "id": "f9ec8b28",
  848. "metadata": {},
  849. "source": [
  850. "&emsp; **通过`to_word`方法可以找到单词对应的序号**,**通过`to_index`方法可以找到序号对应的单词**\n",
  851. "\n",
  852. "&emsp; &emsp; 由于序号0和序号1已经被占用,所以**新加入的词的序号从2开始计数**,如`'生活'`对应2\n",
  853. "\n",
  854. "&emsp; &emsp; 通过`has_word`方法可以判断单词是否在词汇表中,没有的单词被判做`<unk>`"
  855. ]
  856. },
  857. {
  858. "cell_type": "code",
  859. "execution_count": 20,
  860. "id": "3447acde",
  861. "metadata": {},
  862. "outputs": [
  863. {
  864. "name": "stdout",
  865. "output_type": "stream",
  866. "text": [
  867. "<pad> 0\n",
  868. "<unk> 1\n",
  869. "生活 2\n",
  870. "彼岸 1 False\n"
  871. ]
  872. }
  873. ],
  874. "source": [
  875. "print(vocab.to_word(0), vocab.to_index('<pad>'))\n",
  876. "print(vocab.to_word(1), vocab.to_index('<unk>'))\n",
  877. "print(vocab.to_word(2), vocab.to_index('生活'))\n",
  878. "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))"
  879. ]
  880. },
  881. {
  882. "cell_type": "markdown",
  883. "id": "b4e36850",
  884. "metadata": {},
  885. "source": [
  886. "**`vocabulary`允许反复添加相同单词**,**可以通过`word_count`方法看到相应单词被添加的次数**\n",
  887. "\n",
  888. "&emsp; 但其中没有`<unk>`和`<pad>`,`vocabulary`的全部变量与函数可以通过`dir(vocabulary)`查询\n",
  889. "\n",
  890. "&emsp; 注:**使用`add_word_lst`添加单词**,**单词对应序号不会动态调整**,**使用`dataset`添加单词的情况不同**"
  891. ]
  892. },
  893. {
  894. "cell_type": "code",
  895. "execution_count": 21,
  896. "id": "490b101c",
  897. "metadata": {},
  898. "outputs": [
  899. {
  900. "name": "stdout",
  901. "output_type": "stream",
  902. "text": [
  903. "生活 2\n",
  904. "彼岸 12 True\n",
  905. "13 Counter({'人': 4, '生活': 2, '就像': 2, '海洋': 2, '只有': 2, '意志': 1, '坚强的': 1, '才': 1, '能': 1, '到达': 1, '彼岸': 1})\n",
  906. "13 {'<pad>': 0, '<unk>': 1, '生活': 2, '就像': 3, '海洋': 4, '只有': 5, '人': 6, '意志': 7, '坚强的': 8, '才': 9, '能': 10, '到达': 11, '彼岸': 12}\n"
  907. ]
  908. }
  909. ],
  910. "source": [
  911. "vocab.add_word_lst(['生活', '就像', '海洋', '只有', '意志', '坚强的', '人', '人', '人', '人', '才', '能', '到达', '彼岸'])\n",
  912. "print(vocab.to_word(2), vocab.to_index('生活'))\n",
  913. "print('彼岸', vocab.to_index('彼岸'), vocab.has_word('彼岸'))\n",
  914. "print(len(vocab), vocab.word_count)\n",
  915. "print(len(vocab), vocab.word2idx)"
  916. ]
  917. },
  918. {
  919. "cell_type": "markdown",
  920. "id": "23e32a63",
  921. "metadata": {},
  922. "source": [
  923. "### 2.2 vocabulary 与 OOV 问题\n",
  924. "\n",
  925. "在`vocabulary`模块初始化的时候,可以通过指定`unknown`和`padding`为`None`,限制其存在\n",
  926. "\n",
  927. "&emsp; 此时添加单词直接从0开始标号,如果遇到未知单词会直接报错,即 out of vocabulary"
  928. ]
  929. },
  930. {
  931. "cell_type": "code",
  932. "execution_count": 22,
  933. "id": "a99ff909",
  934. "metadata": {},
  935. "outputs": [
  936. {
  937. "name": "stdout",
  938. "output_type": "stream",
  939. "text": [
  940. "{'positive': 0, 'negative': 1}\n",
  941. "ValueError: word `neutral` not in vocabulary\n"
  942. ]
  943. }
  944. ],
  945. "source": [
  946. "vocab = Vocabulary(unknown=None, padding=None)\n",
  947. "\n",
  948. "vocab.add_word_lst(['positive', 'negative'])\n",
  949. "print(vocab.word2idx)\n",
  950. "\n",
  951. "try:\n",
  952. " print(vocab.to_index('neutral'))\n",
  953. "except ValueError:\n",
  954. " print(\"ValueError: word `neutral` not in vocabulary\")"
  955. ]
  956. },
  957. {
  958. "cell_type": "markdown",
  959. "id": "618da6bd",
  960. "metadata": {},
  961. "source": [
  962. "&emsp; 相应的,如果只指定其中的`unknown`,则编号会后移一个,同时遇到未知单词全部当做`<unk>`"
  963. ]
  964. },
  965. {
  966. "cell_type": "code",
  967. "execution_count": 23,
  968. "id": "432f74c1",
  969. "metadata": {},
  970. "outputs": [
  971. {
  972. "name": "stdout",
  973. "output_type": "stream",
  974. "text": [
  975. "{'<unk>': 0, 'positive': 1, 'negative': 2}\n",
  976. "0 <unk>\n"
  977. ]
  978. }
  979. ],
  980. "source": [
  981. "vocab = Vocabulary(unknown='<unk>', padding=None)\n",
  982. "\n",
  983. "vocab.add_word_lst(['positive', 'negative'])\n",
  984. "print(vocab.word2idx)\n",
  985. "\n",
  986. "print(vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral')))"
  987. ]
  988. },
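{
"cell_type": "markdown",
"id": "8b2d4f10",
"metadata": {},
"source": [
"&emsp; As an unexecuted sketch of the symmetric case: keeping only `padding` should give `<pad>` index 0, start word numbering at 1, and still raise on unseen words, since there is no `<unk>` to fall back on"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b2d4f11",
"metadata": {},
"outputs": [],
"source": [
"# sketch only: padding kept, unknown suppressed\n",
"vocab = Vocabulary(unknown=None, padding='<pad>')\n",
"\n",
"vocab.add_word_lst(['positive', 'negative'])\n",
"print(vocab.word2idx)  # expected: {'<pad>': 0, 'positive': 1, 'negative': 2}\n",
"\n",
"try:\n",
"    vocab.to_index('neutral')  # no <unk>, so this should raise\n",
"except ValueError as e:\n",
"    print('ValueError:', e)"
]
},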
  989. {
  990. "cell_type": "markdown",
  991. "id": "b6263f73",
  992. "metadata": {},
  993. "source": [
  994. "## 3 dataset 和 vocabulary 的组合使用\n",
  995. " \n",
  996. "### 3.1 从 dataframe 中加载 dataset\n",
  997. "\n",
  998. "以下通过 [NLP-beginner](https://github.com/FudanNLP/nlp-beginner) 实践一中 [Rotten Tomatoes 影评数据集](https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews) 的部分训练数据组成`test4dataset.tsv`文件\n",
  999. "\n",
  1000. "&emsp; 介绍如何使用`dataset`、`vocabulary`简单加载并处理数据集,首先使用`pandas`模块,读取原始数据的`dataframe`"
  1001. ]
  1002. },
  1003. {
  1004. "cell_type": "code",
  1005. "execution_count": 24,
  1006. "id": "3dbd985d",
  1007. "metadata": {},
  1008. "outputs": [
  1009. {
  1010. "data": {
  1011. "text/html": [
  1012. "<div>\n",
  1013. "<style scoped>\n",
  1014. " .dataframe tbody tr th:only-of-type {\n",
  1015. " vertical-align: middle;\n",
  1016. " }\n",
  1017. "\n",
  1018. " .dataframe tbody tr th {\n",
  1019. " vertical-align: top;\n",
  1020. " }\n",
  1021. "\n",
  1022. " .dataframe thead th {\n",
  1023. " text-align: right;\n",
  1024. " }\n",
  1025. "</style>\n",
  1026. "<table border=\"1\" class=\"dataframe\">\n",
  1027. " <thead>\n",
  1028. " <tr style=\"text-align: right;\">\n",
  1029. " <th></th>\n",
  1030. " <th>SentenceId</th>\n",
  1031. " <th>Sentence</th>\n",
  1032. " <th>Sentiment</th>\n",
  1033. " </tr>\n",
  1034. " </thead>\n",
  1035. " <tbody>\n",
  1036. " <tr>\n",
  1037. " <th>0</th>\n",
  1038. " <td>1</td>\n",
  1039. " <td>A series of escapades demonstrating the adage ...</td>\n",
  1040. " <td>negative</td>\n",
  1041. " </tr>\n",
  1042. " <tr>\n",
  1043. " <th>1</th>\n",
  1044. " <td>2</td>\n",
  1045. " <td>This quiet , introspective and entertaining in...</td>\n",
  1046. " <td>positive</td>\n",
  1047. " </tr>\n",
  1048. " <tr>\n",
  1049. " <th>2</th>\n",
  1050. " <td>3</td>\n",
  1051. " <td>Even fans of Ismail Merchant 's work , I suspe...</td>\n",
  1052. " <td>negative</td>\n",
  1053. " </tr>\n",
  1054. " <tr>\n",
  1055. " <th>3</th>\n",
  1056. " <td>4</td>\n",
  1057. " <td>A positively thrilling combination of ethnogra...</td>\n",
  1058. " <td>neutral</td>\n",
  1059. " </tr>\n",
  1060. " <tr>\n",
  1061. " <th>4</th>\n",
  1062. " <td>5</td>\n",
  1063. " <td>A comedy-drama of nearly epic proportions root...</td>\n",
  1064. " <td>positive</td>\n",
  1065. " </tr>\n",
  1066. " <tr>\n",
  1067. " <th>5</th>\n",
  1068. " <td>6</td>\n",
  1069. " <td>The Importance of Being Earnest , so thick wit...</td>\n",
  1070. " <td>neutral</td>\n",
  1071. " </tr>\n",
  1072. " </tbody>\n",
  1073. "</table>\n",
  1074. "</div>"
  1075. ],
  1076. "text/plain": [
  1077. " SentenceId Sentence Sentiment\n",
  1078. "0 1 A series of escapades demonstrating the adage ... negative\n",
  1079. "1 2 This quiet , introspective and entertaining in... positive\n",
  1080. "2 3 Even fans of Ismail Merchant 's work , I suspe... negative\n",
  1081. "3 4 A positively thrilling combination of ethnogra... neutral\n",
  1082. "4 5 A comedy-drama of nearly epic proportions root... positive\n",
  1083. "5 6 The Importance of Being Earnest , so thick wit... neutral"
  1084. ]
  1085. },
  1086. "execution_count": 24,
  1087. "metadata": {},
  1088. "output_type": "execute_result"
  1089. }
  1090. ],
  1091. "source": [
  1092. "import pandas as pd\n",
  1093. "\n",
  1094. "df = pd.read_csv('./data/test4dataset.tsv', sep='\\t')\n",
  1095. "df"
  1096. ]
  1097. },
  1098. {
  1099. "cell_type": "markdown",
  1100. "id": "919ab350",
  1101. "metadata": {},
  1102. "source": [
  1103. "接着,通过`dataset`中的`from_pandas`方法填充数据集,并使用`apply_more`方法对文本进行分词操作"
  1104. ]
  1105. },
  1106. {
  1107. "cell_type": "code",
  1108. "execution_count": 25,
  1109. "id": "4f634586",
  1110. "metadata": {},
  1111. "outputs": [
  1112. {
  1113. "data": {
  1114. "application/vnd.jupyter.widget-view+json": {
  1115. "model_id": "",
  1116. "version_major": 2,
  1117. "version_minor": 0
  1118. },
  1119. "text/plain": [
  1120. "Processing: 0%| | 0/6 [00:00<?, ?it/s]"
  1121. ]
  1122. },
  1123. "metadata": {},
  1124. "output_type": "display_data"
  1125. },
  1126. {
  1127. "name": "stdout",
  1128. "output_type": "stream",
  1129. "text": [
  1130. "+------------+------------------------------+-----------+\n",
  1131. "| SentenceId | Sentence | Sentiment |\n",
  1132. "+------------+------------------------------+-----------+\n",
  1133. "| 1 | ['a', 'series', 'of', 'es... | negative |\n",
  1134. "| 2 | ['this', 'quiet', ',', 'i... | positive |\n",
  1135. "| 3 | ['even', 'fans', 'of', 'i... | negative |\n",
  1136. "| 4 | ['a', 'positively', 'thri... | neutral |\n",
  1137. "| 5 | ['a', 'comedy-drama', 'of... | positive |\n",
  1138. "| 6 | ['the', 'importance', 'of... | neutral |\n",
  1139. "+------------+------------------------------+-----------+\n"
  1140. ]
  1141. }
  1142. ],
  1143. "source": [
  1144. "from fastNLP import DataSet\n",
  1145. "\n",
  1146. "dataset = DataSet()\n",
  1147. "dataset = dataset.from_pandas(df)\n",
  1148. "dataset.apply_more(lambda ins:{'SentenceId': ins['SentenceId'], \n",
  1149. " 'Sentence': ins['Sentence'].lower().split(), 'Sentiment': ins['Sentiment']}, \n",
  1150. " progress_bar=\"tqdm\")\n",
  1151. "print(dataset)"
  1152. ]
  1153. },
  1154. {
  1155. "cell_type": "markdown",
  1156. "id": "5c1ae192",
  1157. "metadata": {},
  1158. "source": [
  1159. "&emsp; 如果需要保存中间结果,也可以使用`dataset`的`to_csv`方法,生成`.csv`或`.tsv`文件"
  1160. ]
  1161. },
  1162. {
  1163. "cell_type": "code",
  1164. "execution_count": 26,
  1165. "id": "46722efc",
  1166. "metadata": {},
  1167. "outputs": [],
  1168. "source": [
  1169. "dataset.to_csv('./data/test4dataset.csv')"
  1170. ]
  1171. },
  1172. {
  1173. "cell_type": "markdown",
  1174. "id": "5ba13989",
  1175. "metadata": {},
  1176. "source": [
  1177. "### 3.2 从 dataset 中获取 vocabulary\n",
  1178. "\n",
  1179. "然后,初始化`vocabulary`,使用`vocabulary`中的`from_dataset`方法,从`dataset`的指定字段中\n",
  1180. "\n",
  1181. "&emsp; 获取字段中的所有元素,然后编号;如果指定字段是个列表,则针对字段中所有列表包含的元素编号\n",
  1182. "\n",
  1183. "&emsp; 注:**使用`dataset`添加单词**,**不同于`add_word_list`**,**单词被添加次数越多**,**序号越靠前**,例如案例中的`a`"
  1184. ]
  1185. },
  1186. {
  1187. "cell_type": "code",
  1188. "execution_count": 27,
  1189. "id": "a2de615b",
  1190. "metadata": {},
  1191. "outputs": [
  1192. {
  1193. "name": "stdout",
  1194. "output_type": "stream",
  1195. "text": [
  1196. "Counter({'a': 9, 'of': 9, ',': 7, 'the': 6, '.': 5, 'is': 3, 'and': 3, 'good': 2, 'for': 2, 'which': 2, 'this': 2, \"'s\": 2, 'series': 1, 'escapades': 1, 'demonstrating': 1, 'adage': 1, 'that': 1, 'what': 1, 'goose': 1, 'also': 1, 'gander': 1, 'some': 1, 'occasionally': 1, 'amuses': 1, 'but': 1, 'none': 1, 'amounts': 1, 'to': 1, 'much': 1, 'story': 1, 'quiet': 1, 'introspective': 1, 'entertaining': 1, 'independent': 1, 'worth': 1, 'seeking': 1, 'even': 1, 'fans': 1, 'ismail': 1, 'merchant': 1, 'work': 1, 'i': 1, 'suspect': 1, 'would': 1, 'have': 1, 'hard': 1, 'time': 1, 'sitting': 1, 'through': 1, 'one': 1, 'positively': 1, 'thrilling': 1, 'combination': 1, 'ethnography': 1, 'all': 1, 'intrigue': 1, 'betrayal': 1, 'deceit': 1, 'murder': 1, 'shakespearean': 1, 'tragedy': 1, 'or': 1, 'juicy': 1, 'soap': 1, 'opera': 1, 'comedy-drama': 1, 'nearly': 1, 'epic': 1, 'proportions': 1, 'rooted': 1, 'in': 1, 'sincere': 1, 'performance': 1, 'by': 1, 'title': 1, 'character': 1, 'undergoing': 1, 'midlife': 1, 'crisis': 1, 'importance': 1, 'being': 1, 'earnest': 1, 'so': 1, 'thick': 1, 'with': 1, 'wit': 1, 'it': 1, 'plays': 1, 'like': 1, 'reading': 1, 'from': 1, 'bartlett': 1, 'familiar': 1, 'quotations': 1}) \n",
  1197. "\n",
  1198. "{'<pad>': 0, '<unk>': 1, 'a': 2, 'of': 3, ',': 4, 'the': 5, '.': 6, 'is': 7, 'and': 8, 'good': 9, 'for': 10, 'which': 11, 'this': 12, \"'s\": 13, 'series': 14, 'escapades': 15, 'demonstrating': 16, 'adage': 17, 'that': 18, 'what': 19, 'goose': 20, 'also': 21, 'gander': 22, 'some': 23, 'occasionally': 24, 'amuses': 25, 'but': 26, 'none': 27, 'amounts': 28, 'to': 29, 'much': 30, 'story': 31, 'quiet': 32, 'introspective': 33, 'entertaining': 34, 'independent': 35, 'worth': 36, 'seeking': 37, 'even': 38, 'fans': 39, 'ismail': 40, 'merchant': 41, 'work': 42, 'i': 43, 'suspect': 44, 'would': 45, 'have': 46, 'hard': 47, 'time': 48, 'sitting': 49, 'through': 50, 'one': 51, 'positively': 52, 'thrilling': 53, 'combination': 54, 'ethnography': 55, 'all': 56, 'intrigue': 57, 'betrayal': 58, 'deceit': 59, 'murder': 60, 'shakespearean': 61, 'tragedy': 62, 'or': 63, 'juicy': 64, 'soap': 65, 'opera': 66, 'comedy-drama': 67, 'nearly': 68, 'epic': 69, 'proportions': 70, 'rooted': 71, 'in': 72, 'sincere': 73, 'performance': 74, 'by': 75, 'title': 76, 'character': 77, 'undergoing': 78, 'midlife': 79, 'crisis': 80, 'importance': 81, 'being': 82, 'earnest': 83, 'so': 84, 'thick': 85, 'with': 86, 'wit': 87, 'it': 88, 'plays': 89, 'like': 90, 'reading': 91, 'from': 92, 'bartlett': 93, 'familiar': 94, 'quotations': 95} \n",
  1199. "\n",
  1200. "Vocabulary(['a', 'series', 'of', 'escapades', 'demonstrating']...)\n"
  1201. ]
  1202. }
  1203. ],
  1204. "source": [
  1205. "from fastNLP import Vocabulary\n",
  1206. "\n",
  1207. "vocab = Vocabulary()\n",
  1208. "vocab = vocab.from_dataset(dataset, field_name='Sentence')\n",
  1209. "print(vocab.word_count, '\\n')\n",
  1210. "print(vocab.word2idx, '\\n')\n",
  1211. "print(vocab)"
  1212. ]
  1213. },
  1214. {
  1215. "cell_type": "markdown",
  1216. "id": "f0857ccb",
  1217. "metadata": {},
  1218. "source": [
  1219. "之后,**通过`vocabulary`的`index_dataset`方法**,**调整`dataset`中指定字段的元素**,**使用编号将之代替**\n",
  1220. "\n",
  1221. "&emsp; 使用上述方法,可以将影评数据集中的单词序列转化为词编号序列,为接下来转化为词嵌入序列做准备"
  1222. ]
  1223. },
  1224. {
  1225. "cell_type": "code",
  1226. "execution_count": 28,
  1227. "id": "2f9a04b2",
  1228. "metadata": {},
  1229. "outputs": [
  1230. {
  1231. "name": "stdout",
  1232. "output_type": "stream",
  1233. "text": [
  1234. "+------------+------------------------------+-----------+\n",
  1235. "| SentenceId | Sentence | Sentiment |\n",
  1236. "+------------+------------------------------+-----------+\n",
  1237. "| 1 | [2, 14, 3, 15, 16, 5, 17,... | negative |\n",
  1238. "| 2 | [12, 32, 4, 33, 8, 34, 35... | positive |\n",
  1239. "| 3 | [38, 39, 3, 40, 41, 13, 4... | negative |\n",
  1240. "| 4 | [2, 52, 53, 54, 3, 55, 8,... | neutral |\n",
  1241. "| 5 | [2, 67, 3, 68, 69, 70, 71... | positive |\n",
  1242. "| 6 | [5, 81, 3, 82, 83, 4, 84,... | neutral |\n",
  1243. "+------------+------------------------------+-----------+\n"
  1244. ]
  1245. }
  1246. ],
  1247. "source": [
  1248. "vocab.index_dataset(dataset, field_name='Sentence')\n",
  1249. "print(dataset)"
  1250. ]
  1251. },
  1252. {
  1253. "cell_type": "markdown",
  1254. "id": "6b26b707",
  1255. "metadata": {},
  1256. "source": [
  1257. "最后,使用相同方法,再将`dataset`中`Sentiment`字段中的`negative`、`neutral`、`positive`转化为数字编号"
  1258. ]
  1259. },
  1260. {
  1261. "cell_type": "code",
  1262. "execution_count": 29,
  1263. "id": "5f5eed18",
  1264. "metadata": {},
  1265. "outputs": [
  1266. {
  1267. "name": "stdout",
  1268. "output_type": "stream",
  1269. "text": [
  1270. "{'negative': 0, 'positive': 1, 'neutral': 2}\n",
  1271. "+------------+------------------------------+-----------+\n",
  1272. "| SentenceId | Sentence | Sentiment |\n",
  1273. "+------------+------------------------------+-----------+\n",
  1274. "| 1 | [2, 14, 3, 15, 16, 5, 17,... | 0 |\n",
  1275. "| 2 | [12, 32, 4, 33, 8, 34, 35... | 1 |\n",
  1276. "| 3 | [38, 39, 3, 40, 41, 13, 4... | 0 |\n",
  1277. "| 4 | [2, 52, 53, 54, 3, 55, 8,... | 2 |\n",
  1278. "| 5 | [2, 67, 3, 68, 69, 70, 71... | 1 |\n",
  1279. "| 6 | [5, 81, 3, 82, 83, 4, 84,... | 2 |\n",
  1280. "+------------+------------------------------+-----------+\n"
  1281. ]
  1282. }
  1283. ],
  1284. "source": [
  1285. "target_vocab = Vocabulary(padding=None, unknown=None)\n",
  1286. "\n",
  1287. "target_vocab.from_dataset(dataset, field_name='Sentiment')\n",
  1288. "print(target_vocab.word2idx)\n",
  1289. "target_vocab.index_dataset(dataset, field_name='Sentiment')\n",
  1290. "print(dataset)"
  1291. ]
  1292. },
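{
"cell_type": "markdown",
"id": "9c3e5a20",
"metadata": {},
"source": [
"As an unexecuted sketch of the step anticipated above (assuming `torch` is installed; the embedding dimension 8 is arbitrary), the index sequences produced by `index_dataset` can be fed straight into an embedding layer"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9c3e5a21",
"metadata": {},
"outputs": [],
"source": [
"# sketch only: turn one indexed sentence into a word-embedding sequence\n",
"import torch\n",
"\n",
"embed = torch.nn.Embedding(num_embeddings=len(vocab), embedding_dim=8)\n",
"indices = torch.tensor(dataset[0]['Sentence'])  # e.g. [2, 14, 3, 15, 16, ...]\n",
"vectors = embed(indices)\n",
"print(vectors.shape)  # (sentence_length, 8)"
]
},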
  1293. {
  1294. "cell_type": "markdown",
  1295. "id": "eed7ea64",
  1296. "metadata": {},
  1297. "source": [
  1298. "在最后的最后,通过以下的一张图,来总结本章关于`dataset`和`vocabulary`主要知识点的讲解,以及两者的联系\n",
  1299. "\n",
  1300. "<img src=\"./figures/T1-fig-dataset-and-vocabulary.png\" width=\"80%\" height=\"80%\" align=\"center\"></img>"
  1301. ]
  1302. },
  1303. {
  1304. "cell_type": "code",
  1305. "execution_count": null,
  1306. "id": "35b4f0f7",
  1307. "metadata": {},
  1308. "outputs": [],
  1309. "source": []
  1310. }
  1311. ],
  1312. "metadata": {
  1313. "kernelspec": {
  1314. "display_name": "Python 3 (ipykernel)",
  1315. "language": "python",
  1316. "name": "python3"
  1317. },
  1318. "language_info": {
  1319. "codemirror_mode": {
  1320. "name": "ipython",
  1321. "version": 3
  1322. },
  1323. "file_extension": ".py",
  1324. "mimetype": "text/x-python",
  1325. "name": "python",
  1326. "nbconvert_exporter": "python",
  1327. "pygments_lexer": "ipython3",
  1328. "version": "3.7.13"
  1329. }
  1330. },
  1331. "nbformat": 4,
  1332. "nbformat_minor": 5
  1333. }