tutorial_2_vocabulary.ipynb

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Vocabulary in fastNLP\n",
"## Building a Vocabulary"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(['复', '旦', '大', '学']) # add new characters\n",
"vocab.add_word('上海') # `上海` is treated as a single token\n",
"vocab.to_index('复') # should return 3\n",
"vocab.to_index('我') # returns 1; by default the pad index is 0 and the unk index (for words not found) is 1\n",
"\n",
"# A target Vocabulary usually has no use for pad and unk; they can be disabled at initialization\n",
"vocab = Vocabulary(unknown=None, padding=None)"
]
},
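{
"cell_type": "markdown",
"metadata": {},
"source": [
"The special tokens are also exposed as index properties. A minimal sketch (not executed here; `demo_vocab` is a hypothetical fresh vocabulary, so the target `vocab` above is left untouched):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"demo_vocab = Vocabulary() # defaults: padding='<pad>', unknown='<unk>'\n",
"demo_vocab.add_word_lst(['复', '旦', '大', '学'])\n",
"print(demo_vocab.padding_idx) # 0\n",
"print(demo_vocab.unknown_idx) # 1\n",
"print(demo_vocab.to_word(demo_vocab.unknown_idx)) # '<unk>'"
]
},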
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.add_word_lst(['positive', 'negative'])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab.to_index('positive')"
]
},
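{
"cell_type": "markdown",
"metadata": {},
"source": [
"`to_word` is the inverse of `to_index`. A quick sketch (not executed here; uses the target `vocab` built above):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(len(vocab)) # 2: only 'positive' and 'negative'\n",
"print(vocab.to_word(0)) # 'positive'\n",
"print(vocab.to_word(vocab.to_index('negative'))) # 'negative'"
]
},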
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### When unk is not set"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [
{
"ename": "ValueError",
"evalue": "word `neutral` not in vocabulary",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-c6d424040b45>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mvocab\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'neutral'\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# raises an error because unk is not set\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36mto_index\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 414\u001b[0m \u001b[0;34m:\u001b[0m\u001b[0;32mreturn\u001b[0m \u001b[0mint\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mthe\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 415\u001b[0m \"\"\"\n\u001b[0;32m--> 416\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__getitem__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 417\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 418\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mproperty\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m_wrapper\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mrebuild\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 43\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mbuild_vocab\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 44\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mfunc\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 45\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_wrapper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/anaconda3/envs/now/lib/python3.8/site-packages/FastNLP-0.5.0-py3.8.egg/fastNLP/core/vocabulary.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, w)\u001b[0m\n\u001b[1;32m 272\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_word2idx\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0munknown\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 274\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"word `{}` not in vocabulary\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mw\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 275\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 276\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0m_check_build_vocab\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: word `neutral` not in vocabulary"
]
}
],
"source": [
"vocab.to_index('neutral') # raises an error because unk is not set"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### When unk is set"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(0, '<unk>')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary(unknown='<unk>', padding=None)\n",
"vocab.add_word_lst(['positive', 'negative'])\n",
"vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral'))"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['positive', 'negative']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"vocab"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+---------------------------------------------------+--------+\n",
"| chars | target |\n",
"+---------------------------------------------------+--------+\n",
"| [4, 2, 2, 5, 6, 7, 3] | 0 |\n",
"| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |\n",
"+---------------------------------------------------+--------+\n"
]
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"dataset = DataSet({'chars': [\n",
" ['今', '天', '天', '气', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['neutral', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.from_dataset(dataset, field_name='chars')\n",
"vocab.index_dataset(dataset, field_name='chars')\n",
"\n",
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"target_vocab.from_dataset(dataset, field_name='target')\n",
"target_vocab.index_dataset(dataset, field_name='target')\n",
"print(dataset)"
]
},
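{
"cell_type": "markdown",
"metadata": {},
"source": [
"After indexing, predicted label indices can be mapped back to label strings with `to_word`. A small sketch (not executed here; `pred_indices` is a hypothetical list of model predictions):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pred_indices = [0, 1, 1] # hypothetical model predictions\n",
"print([target_vocab.to_word(i) for i in pred_indices]) # per the table above: ['neutral', 'negative', 'negative']"
]
},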
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Vocabulary(['今', '天', '心', '情', '很']...)"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from fastNLP import Vocabulary\n",
"from fastNLP import DataSet\n",
"\n",
"tr_data = DataSet({'chars': [\n",
" ['今', '天', '心', '情', '很', '好', '。'],\n",
" ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"dev_data = DataSet({'chars': [\n",
" ['住', '宿', '条', '件', '还', '不', '错'],\n",
" ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。']\n",
" ],\n",
" 'target': ['positive', 'negative']\n",
"})\n",
"\n",
"vocab = Vocabulary()\n",
"# When building the vocabulary, pass the dev or test set via the no_create_entry_dataset parameter.\n",
"vocab.from_dataset(tr_data, field_name='chars', no_create_entry_dataset=[dev_data])\n"
]
},
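{
"cell_type": "markdown",
"metadata": {},
"source": [
"Words that appear only in `dev_data` are still added to the vocabulary; they are merely flagged as no_create_entry, which matters once embeddings are built. A quick check (a sketch, not executed here):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# '住' appears only in dev_data but should still get a real index, not unk\n",
"print(vocab.to_index('住') != vocab.unknown_idx) # expected True\n",
"print(vocab.to_index('今') != vocab.unknown_idx) # expected True: '今' comes from tr_data"
]
},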
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 4%|▎ | 2.31M/63.5M [00:00<00:02, 22.9MB/s]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"http://212.129.155.247/embedding/glove.6B.50d.zip not found in cache, downloading to /tmp/tmpvziobj_e\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 63.5M/63.5M [00:01<00:00, 41.3MB/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Finish download from http://212.129.155.247/embedding/glove.6B.50d.zip\n",
"Copy file to /remote-home/ynzheng/.fastNLP/embedding/glove.6B.50d\n",
"Found 2 out of 6 words in the pre-training embedding.\n",
"tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,\n",
" 0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,\n",
" -0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,\n",
" 0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,\n",
" -0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,\n",
" -0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,\n",
" 0.4053, -0.3682]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1320, -0.2392, 0.1732, -0.2390, -0.0463, 0.0494, 0.0488, -0.0886,\n",
" 0.0224, -0.1300, 0.0369, 0.1800, 0.0750, -0.0183, 0.2264, 0.1628,\n",
" 0.1261, -0.1259, 0.1663, -0.1230, -0.1904, -0.0532, 0.1397, -0.0259,\n",
" -0.1799, 0.0226, 0.1858, 0.1981, 0.1338, 0.2394, 0.0248, 0.0203,\n",
" -0.1722, -0.1683, -0.1892, 0.0874, 0.0562, -0.0394, 0.0306, -0.1761,\n",
" 0.1015, -0.0171, 0.1172, 0.1357, 0.1519, -0.0011, 0.1572, 0.1265,\n",
" -0.2391, -0.0258]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.1318, -0.2552, -0.0679, 0.2619, -0.2616, 0.2357, 0.1308, -0.0118,\n",
" 1.7659, 0.2078, 0.2620, -0.1643, -0.8464, 0.0201, 0.0702, 0.3978,\n",
" 0.1528, -0.2021, -1.6184, -0.5433, -0.1786, 0.5389, 0.4987, -0.1017,\n",
" 0.6626, -1.7051, 0.0572, -0.3241, -0.6683, 0.2665, 2.8420, 0.2684,\n",
" -0.5954, -0.5004, 1.5199, 0.0396, 1.6659, 0.9976, -0.5597, -0.7049,\n",
" -0.0309, -0.2830, -0.1356, 0.6429, 0.4149, 1.2362, 0.7659, 0.9780,\n",
" 0.5851, -0.3018]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n",
"tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,\n",
" 0., 0.]], grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word('train')\n",
"vocab.add_word('only_in_train') # appears only in train and is certainly absent from the pre-trained vocabulary\n",
"vocab.add_word('test', no_create_entry=True) # this word appears only in dev or test\n",
"vocab.add_word('only_in_test', no_create_entry=True) # this word cannot be found in the pre-trained vocabulary\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(embed(torch.LongTensor([vocab.to_index('train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_train')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('test')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('only_in_test')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
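{
"cell_type": "markdown",
"metadata": {},
"source": [
"Reading the output above: 'train' and 'test' are the 2 of 6 words found in the pre-trained GloVe table and receive its vectors; 'only_in_train' gets a freshly initialized vector of its own; 'only_in_test', flagged with `no_create_entry=True` and absent from the pre-trained table, appears to fall back to the unknown entry, which would explain why its embedding matches the all-zero `unknown_idx` vector printed last."
]
},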
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}