
tutorial_3_embedding.ipynb

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # convert the text to indices\n",
"print(embed(words).size()) # StaticEmbedding is used much like pytorch's nn.Embedding"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
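{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch of the regularization options, assuming the embedding constructors accept `dropout` (applied to the output vectors) and `word_dropout` (random replacement of input words with the unknown token), as in the fastNLP TokenEmbedding API:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# assumes StaticEmbedding accepts dropout/word_dropout; the output shape is unchanged\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30, dropout=0.1, word_dropout=0.05)\n",
"print(embed(words).size()) # expected torch.Size([1, 5, 30])"
]
},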
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # the three layer outputs are summed element-wise with learned weights"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# use the outputs of the last two layers\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # the layer outputs are concatenated along the last dimension"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # the sequence dimension grows by 2\n",
"# take the sentence's [CLS] representation\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
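{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `[CLS]` vector is typically fed into a small classification head. The cell below is a minimal sketch of that pattern in plain PyTorch; `num_classes` and the `nn.Linear` head are illustrative assumptions, not part of fastNLP."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.nn as nn\n",
"\n",
"num_classes = 2 # hypothetical number of labels\n",
"classifier = nn.Linear(embed.embedding_dim, num_classes)\n",
"logits = classifier(cls_reps) # shape: [batch_size, num_classes]\n",
"print(logits.size())"
]
},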
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# the character embeddings have dimension 50; the returned word embeddings have dimension 64.\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# the character embeddings have dimension 50; the returned word embeddings have dimension 64.\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
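{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick check, a `StackEmbedding` can be called like any other embedding. The sketch below assumes its forward pass concatenates the component outputs, so the last dimension should be 50 + 30 = 80:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(stack_embed(words).size()) # expected torch.Size([1, 5, 80])"
]
},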
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # mark the weights as trainable at construction time\n",
"embed.requires_grad = False # freeze the BertEmbedding weights afterwards"
]
},
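{
"cell_type": "markdown",
"metadata": {},
"source": [
"One way to verify the toggle is to count trainable parameters with plain PyTorch; this assumes the `requires_grad` setter propagates to all of the embedding's parameters:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def n_trainable(module):\n",
"    # count parameters that will receive gradients\n",
"    return sum(p.numel() for p in module.parameters() if p.requires_grad)\n",
"\n",
"print(n_trainable(embed)) # expected 0 after freezing\n",
"embed.requires_grad = True\n",
"print(n_trainable(embed)) # all BERT parameters again"
]
},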
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# a randomly initialized StaticEmbedding is used below; the behavior is the same with pre-trained word vectors\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# a randomly initialized StaticEmbedding is used below; the behavior is the same with pre-trained word vectors\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# a randomly initialized StaticEmbedding is used below; the behavior is the same with pre-trained word vectors\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# a randomly initialized StaticEmbedding is used below; the behavior is the same with pre-trained word vectors\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
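{
"cell_type": "markdown",
"metadata": {},
"source": [
"Finally, every embedding above is an `nn.Module`, so it can be dropped straight into a model. The cell below is a minimal illustrative sketch; the `TinyTagger` class and `num_classes` are hypothetical, not fastNLP APIs."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch.nn as nn\n",
"\n",
"class TinyTagger(nn.Module):\n",
"    def __init__(self, embed, num_classes=2): # num_classes is a hypothetical label count\n",
"        super().__init__()\n",
"        self.embed = embed\n",
"        self.fc = nn.Linear(embed.embedding_dim, num_classes)\n",
"    def forward(self, words):\n",
"        return self.fc(self.embed(words)) # shape: [batch_size, seq_len, num_classes]\n",
"\n",
"model = TinyTagger(embed)\n",
"words = torch.LongTensor([[vocab.to_index(w) for w in \"the a\".split()]])\n",
"print(model(words).size()) # expected torch.Size([1, 2, 2])"
]
}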
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}