
test_gpt2_embedding.py

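"""Unit tests for fastNLP's GPT2Embedding and GPT2WordPieceEncoder.

Most tests run against a small local GPT-2 checkpoint under
test/data_for_tests/embedding/small_gpt2 (2 layers, hidden size 16); the download
test is skipped on CI and some helpers are kept for local debugging only.
"""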
import unittest
import torch
import os

from fastNLP.modules.tokenizer.gpt2_tokenizer import GPT2Tokenizer
from fastNLP.embeddings import GPT2WordPieceEncoder, GPT2Embedding
from fastNLP import DataSet, Vocabulary


class TestGPT2Embedding(unittest.TestCase):
    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
    def test_download(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = GPT2Embedding(vocab, model_dir_or_name='en')
        words = torch.LongTensor([[2, 3, 4, 0]])
        print(embed(words).size())

        for pool_method in ['first', 'last', 'max', 'avg']:
            embed = GPT2Embedding(vocab, model_dir_or_name='en', pool_method=pool_method)
            print(embed(words).size())
    def test_gpt2_embedding(self):
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        vocab = Vocabulary().add_word_lst("this is a texta sentence".split())
        embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                              only_use_pretrain_bpe=False, language_model=True)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 0]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))
        embed.get_lm_loss()

        vocab.add_word("NotInGpt2")
        embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                              only_use_pretrain_bpe=False, auto_truncate=True, min_freq=1)
        words = torch.LongTensor([[2, 3, 4, 0] * 20])
        result = embed(words)
        self.assertEqual(result.size(), (1, 80, 16))
    def test_gpt2_ebembedding_2(self):
        # Check that only_use_pretrain_bpe and truncate_embed behave correctly.
        Embedding = GPT2Embedding
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        vocab = Vocabulary().add_word_lst("this is a texta and".split())
        embed1 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab)-1 + 2  # exclude NotInBERT, additionally add ##a, [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

        embed2 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word  # exclude NotInBERT
        # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

        embed3 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab)+2  # newly add ##a, [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

        embed4 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word+1  # newly add ##a
        # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

        # The following tensors should be identical in all of the configurations above.
        embed1.eval()
        embed2.eval()
        embed3.eval()
        embed4.eval()
        tensor = torch.LongTensor([[vocab.to_index(w) for w in 'this is a texta and'.split()]])
        t1 = embed1(tensor)
        t2 = embed2(tensor)
        t3 = embed3(tensor)
        t4 = embed4(tensor)

        self.assertEqual((t1 - t2).sum(), 0)
        self.assertEqual((t1 - t3).sum(), 0)
        self.assertEqual((t1 - t4).sum(), 0)
    def test_gpt2_tokenizer(self):
        from fastNLP.modules.tokenizer import GPT2Tokenizer

        tokenizer = GPT2Tokenizer.from_pretrained('test/data_for_tests/embedding/small_gpt2')

        print(tokenizer.encode("this is a texta a sentence"))
        print(tokenizer.encode('this is'))
    def test_gpt2_embed_eq_gpt2_piece_encoder(self):
        # Mainly check that GPT2Embedding gives the same results as GPT2WordPieceEncoder.
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        ds = DataSet({'words': ["this is a texta a sentence".split(), 'this is'.split()]})
        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = GPT2Embedding(vocab, model_dir_or_name=weight_path, pool_method='first')
        embed.eval()
        words_res = embed(words)

        # Check that the word-piece handling works as expected; the index offset in the
        # first sentence accounts for the word that is split into two word pieces.
        self.assertEqual((word_pieces_res[0, :4] - words_res[0, :4]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 5:] - words_res[0, 4:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :2] - words_res[1, :2]).sum(), 0)
class TestGPT2WordPieceEncoder(unittest.TestCase):
    @unittest.skipIf(True, "Only for local debugging")
    def test_eq_transformers(self):
        # Check that the output matches what the transformers library produces.
        weight_path = ''
        ds = DataSet({'words': ["this this this a is texta model vocab".split(), 'this is'.split()]})

        import transformers
        input1 = ' '.join(ds[0]['words'])
        input2 = ' '.join(ds[1]['words'])
        tokenizer = transformers.GPT2Tokenizer.from_pretrained(weight_path)
        idx_list1 = tokenizer.encode(input1)
        idx_list2 = tokenizer.encode(input2)

        pad_value = tokenizer.encode('<|endoftext|>')[0]
        tensor = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(idx_list1),
                                                  torch.LongTensor(idx_list2)],
                                                 batch_first=True,
                                                 padding_value=pad_value)
        gpt2 = transformers.GPT2Model.from_pretrained(weight_path, output_hidden_states=True)
        gpt2.eval()
        # Note: this tuple unpacking assumes an older transformers release whose GPT2Model
        # returns a plain tuple rather than a ModelOutput object.
        output, _, trans_hidden_states = gpt2(tensor, attention_mask=tensor.ne(pad_value))

        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path, layers=list(range(13)))
        encoder.eval()
        encoder.index_datasets(ds, field_name='words', add_endoftext=False)
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        self.assertEqual(idx_list1, ds[0]['word_pieces'])
        self.assertEqual(idx_list2, ds[1]['word_pieces'])

        word_pieces_res = encoder(word_pieces)

        self.assertEqual((torch.cat(trans_hidden_states, dim=-1) - word_pieces_res).sum(), 0)
    @unittest.skipIf(True, "Only for local usage")
    def test_generate_small_gpt2(self):
        # Because GPT-2 relies on its own BPE tokenizer, the small test weights cannot be
        # generated directly; the procedure below is needed instead.
        weight_path = ''
        tokenizer = GPT2Tokenizer.from_pretrained(weight_path)

        used_pairs = {}
        used_vocab = {}
        # Add more sentences here to cover more data.
        sent1 = "This is a demo sentence"
        sent2 = "another demo"
        sent3 = 'this is a texta model vocab'
        all_tokens = []
        for sent in [sent1, sent2, sent3]:
            tokens = []
            for word in sent.split():
                word = ' ' + word
                token = "".join(
                    tokenizer.byte_encoder[b] for b in word.encode("utf-8")
                )
                _token, _used_pairs = tokenizer.get_used_merge_pair_vocab(token)
                tokens.extend(_token.split())
                used_pairs.update(_used_pairs)
            all_tokens.extend(tokens)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            used_vocab.update({t: i for t, i in zip(tokens, token_ids)})

        print(used_pairs)
        import json
        with open('test/data_for_tests/embedding/small_gpt2/vocab.json', 'w') as f:
            new_used_vocab = {}
            for idx, key in enumerate(used_vocab.keys()):
                new_used_vocab[key] = len(new_used_vocab)
            new_used_vocab['<|endoftext|>'] = len(new_used_vocab)
            for i in range(65, 91):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for i in range(97, 123):
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)

            json.dump(new_used_vocab, f)

        with open('test/data_for_tests/embedding/small_gpt2/merges.txt', 'w') as f:
            f.write('#version: small\n')
            for k, v in sorted(sorted(used_pairs.items(), key=lambda kv: kv[1])):
                f.write('{} {}\n'.format(k[0], k[1]))

        new_tokenizer = GPT2Tokenizer.from_pretrained('test/data_for_tests/embedding/small_gpt2')
        new_all_tokens = []
        for sent in [sent1, sent2, sent3]:
            tokens = new_tokenizer.tokenize(sent, add_prefix_space=True)
            new_all_tokens.extend(tokens)

        print(all_tokens, new_all_tokens)

        self.assertSequenceEqual(all_tokens, new_all_tokens)

        config = {
            "architectures": [
                "GPT2LMHeadModel"
            ],
            "initializer_range": 0.02,
            "layer_norm_epsilon": 1e-05,
            "n_ctx": 20,
            "n_embd": 16,
            "n_head": 4,
            "n_layer": 2,
            "n_positions": 20,
            "vocab_size": len(new_used_vocab)
        }
        with open('test/data_for_tests/embedding/small_gpt2/config.json', 'w') as f:
            json.dump(config, f)

        # The smaller merges.txt and vocab.json above are produced by recording the values
        # actually used by the original tokenizer.
        from fastNLP.modules.encoder.gpt2 import GPT2LMHeadModel, GPT2Config

        config = GPT2Config.from_pretrained('test/data_for_tests/embedding/small_gpt2')

        model = GPT2LMHeadModel(config)
        torch.save(model.state_dict(), 'test/data_for_tests/embedding/small_gpt2/small_pytorch_model.bin')
        print(model(torch.LongTensor([[0, 1, 2, 3]])))
    def test_gpt2_word_piece_encoder(self):
        # Mainly check that this runs.
        weight_path = 'test/data_for_tests/embedding/small_gpt2'
        ds = DataSet({'words': ["this is a test sentence".split()]})
        embed = GPT2WordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1)
        embed.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        result = embed(torch.LongTensor([[1, 2, 3, 4]]))

        embed = GPT2WordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1,
                                     language_model=True)
        embed.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        result = embed(torch.LongTensor([[1, 2, 3, 4]]))
    def test_generate(self):
        weight_path = 'test/data_for_tests/embedding/small_gpt2'

        encoder = GPT2WordPieceEncoder(model_dir_or_name=weight_path, language_model=True)

        # Check that the various generation settings work.
        print(encoder.generate_from_str('this', max_len=20, do_sample=False, num_beams=1, temperature=1, top_k=50,
                                        top_p=1.0, repetition_penalty=1.0, length_penalty=1.0))

        print(encoder.generate_from_str('this', max_len=20, do_sample=True, num_beams=3, temperature=1, top_k=50,
                                        top_p=1.0, repetition_penalty=1.0, length_penalty=1.0))

        print(encoder.generate_from_str('this', max_len=20, do_sample=True, num_beams=3, temperature=2, top_k=20,
                                        top_p=2.0, repetition_penalty=2.0, length_penalty=1.5))
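
# Convenience entry point so the tests can also be run directly, e.g.
#   python test_gpt2_embedding.py
# (an assumed addition; the repository's test runner may be pytest instead).
if __name__ == '__main__':
    unittest.main()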