
test_roberta_embedding.py

import unittest
import torch
import os

from fastNLP import DataSet, Vocabulary
from fastNLP.embeddings.roberta_embedding import RobertaWordPieceEncoder, RobertaEmbedding
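
# The tests below cover RobertaWordPieceEncoder and RobertaEmbedding. Except for the
# download test, they run against the small fixture checkpoint under
# test/data_for_tests/embedding/small_roberta, which test_generate_small_roberta
# can regenerate locally.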


class TestRobertWordPieceEncoder(unittest.TestCase):
    @unittest.skipIf('TRAVIS' in os.environ, "Skip in travis")
    def test_download(self):
        vocab = Vocabulary().add_word_lst("This is a test .".split())
        embed = RobertaEmbedding(vocab, model_dir_or_name='en')
        words = torch.LongTensor([[2, 3, 4, 0]])
        print(embed(words).size())

        for pool_method in ['first', 'last', 'max', 'avg']:
            for include_cls_sep in [True, False]:
                embed = RobertaEmbedding(vocab, model_dir_or_name='en', pool_method=pool_method,
                                         include_cls_sep=include_cls_sep)
                print(embed(words).size())

    def test_robert_word_piece_encoder(self):
        # It is enough that this runs without raising an error
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path, word_dropout=0.1)
        ds = DataSet({'words': ["this is a test . [SEP]".split()]})
        encoder.index_datasets(ds, field_name='words')
        self.assertTrue(ds.has_field('word_pieces'))
        result = encoder(torch.LongTensor([[1, 2, 3, 4]]))

    def test_roberta_embed_eq_roberta_piece_encoder(self):
        # Mainly checks that RobertaEmbedding produces the same result as RobertaWordPieceEncoder
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        ds = DataSet({'words': ["this is a texta a sentence".split(), 'this is'.split()]})
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        vocab = Vocabulary()
        vocab.from_dataset(ds, field_name='words')
        vocab.index_dataset(ds, field_name='words', new_field_name='words')
        ds.set_input('words')
        words = torch.LongTensor(ds['words'].get([0, 1]))
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path,
                                 pool_method='first', include_cls_sep=True, pooled_cls=False)
        embed.eval()
        words_res = embed(words)

        # Check that word-piece handling works as expected. With pool_method='first' the
        # word-level vectors should match the corresponding word-piece vectors; the index
        # offset in the first sample skips the extra word piece produced when one word
        # (presumably 'texta') is split into two BPE pieces.
        self.assertEqual((word_pieces_res[0, :5] - words_res[0, :5]).sum(), 0)
        self.assertEqual((word_pieces_res[0, 6:] - words_res[0, 5:]).sum(), 0)
        self.assertEqual((word_pieces_res[1, :3] - words_res[1, :3]).sum(), 0)

    @unittest.skipIf(True, "Only for local debugging")
    def test_eq_transformers(self):
        weight_path = ''
        ds = DataSet({'words': ["this is a texta model vocab".split(), 'this is'.split()]})
        encoder = RobertaWordPieceEncoder(model_dir_or_name=weight_path)
        encoder.eval()
        encoder.index_datasets(ds, field_name='words')
        word_pieces = torch.LongTensor(ds['word_pieces'].get([0, 1]))
        word_pieces_res = encoder(word_pieces)

        import transformers
        input1 = ' '.join(ds[0]['words'])
        input2 = ' '.join(ds[1]['words'])
        tokenizer = transformers.RobertaTokenizer.from_pretrained(weight_path)
        idx_list1 = tokenizer.encode(input1)
        idx_list2 = tokenizer.encode(input2)
        self.assertEqual(idx_list1, ds[0]['word_pieces'])
        self.assertEqual(idx_list2, ds[1]['word_pieces'])

        pad_value = tokenizer.encode('<pad>')[0]
        tensor = torch.nn.utils.rnn.pad_sequence([torch.LongTensor(idx_list1),
                                                  torch.LongTensor(idx_list2)],
                                                 batch_first=True,
                                                 padding_value=pad_value)
        roberta = transformers.RobertaModel.from_pretrained(weight_path, output_hidden_states=True)
        roberta.eval()
        output, pooled_output, hidden_states = roberta(tensor, attention_mask=tensor.ne(pad_value))

        self.assertEqual((output - word_pieces_res).sum(), 0)

    @unittest.skipIf(True, "Only for local usage")
    def test_generate_small_roberta(self):
        """
        Because RoBERTa uses GPT-2's tokenizer, the small test weights cannot be generated
        directly; they have to be built with the procedure below.

        :return:
        """
        weight_path = ''
        from fastNLP.modules.tokenizer import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained(weight_path)

        used_pairs = {}
        used_vocab = {}
        # Edit these sentences to cover more data
        sent1 = "This is a demo sentence"
        sent2 = "another demo"
        sent3 = 'this is a texta model vocab'
        all_tokens = []

        for sent in [sent1, sent2, sent3]:
            tokens = []
            for word in sent.split():
                word = ' ' + word
                token = "".join(
                    tokenizer.byte_encoder[b] for b in word.encode("utf-8")
                )
                _token, _used_pairs = tokenizer.get_used_merge_pair_vocab(token)
                tokens.extend(_token.split())
                used_pairs.update(_used_pairs)
            all_tokens.extend(tokens)
            token_ids = tokenizer.convert_tokens_to_ids(tokens)
            used_vocab.update({t: i for t, i in zip(tokens, token_ids)})

        import json
        with open('test/data_for_tests/embedding/small_roberta/vocab.json', 'w') as f:
            new_used_vocab = {}
            for token in ['<s>', '<pad>', '</s>', '<unk>', '<mask>']:  # <pad> must get index 1
                new_used_vocab[token] = len(new_used_vocab)

            for i in range(65, 91):  # single uppercase letters A-Z
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for i in range(97, 123):  # single lowercase letters a-z
                if chr(i) not in new_used_vocab:
                    new_used_vocab[chr(i)] = len(new_used_vocab)
            for idx, key in enumerate(used_vocab.keys()):
                if key not in new_used_vocab:
                    new_used_vocab[key] = len(new_used_vocab)
            json.dump(new_used_vocab, f)

        with open('test/data_for_tests/embedding/small_roberta/merges.txt', 'w') as f:
            f.write('#version: tiny\n')
            for k, v in sorted(sorted(used_pairs.items(), key=lambda kv: kv[1])):
                f.write('{} {}\n'.format(k[0], k[1]))
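
        # Minimal config for the tiny fixture model: 2 hidden layers, 4 attention heads,
        # hidden size 16, and a vocabulary restricted to new_used_vocab.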
        config = {
            "architectures": [
                "RobertaForMaskedLM"
            ],
            "attention_probs_dropout_prob": 0.1,
            "finetuning_task": None,
            "hidden_act": "gelu",
            "hidden_dropout_prob": 0.1,
            "hidden_size": 16,
            "initializer_range": 0.02,
            "intermediate_size": 20,
            "layer_norm_eps": 1e-05,
            "max_position_embeddings": 20,
            "num_attention_heads": 4,
            "num_hidden_layers": 2,
            "num_labels": 2,
            "output_attentions": False,
            "output_hidden_states": False,
            "torchscript": False,
            "type_vocab_size": 1,
            "vocab_size": len(new_used_vocab)
        }
        with open('test/data_for_tests/embedding/small_roberta/config.json', 'w') as f:
            json.dump(config, f)

        new_tokenizer = RobertaTokenizer.from_pretrained('test/data_for_tests/embedding/small_roberta')
        new_all_tokens = []
        for sent in [sent1, sent2, sent3]:
            tokens = new_tokenizer.tokenize(sent, add_prefix_space=True)
            new_all_tokens.extend(tokens)
        print(all_tokens, new_all_tokens)

        self.assertSequenceEqual(all_tokens, new_all_tokens)
        # The smaller merges.txt and vocab.json are generated by recording the values
        # actually used by the tokenizer
        from fastNLP.modules.encoder.roberta import RobertaModel, BertConfig

        config = BertConfig.from_json_file('test/data_for_tests/embedding/small_roberta/config.json')
        model = RobertaModel(config)
        torch.save(model.state_dict(), 'test/data_for_tests/embedding/small_roberta/small_pytorch_model.bin')
        print(model(torch.LongTensor([[0, 1, 2, 3]])))


class TestRobertaEmbedding(unittest.TestCase):
    def test_roberta_embedding_1(self):
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a test . [SEP] NotInRoberta".split())
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1)
        requires_grad = embed.requires_grad
        embed.requires_grad = not requires_grad
        embed.train()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                                 only_use_pretrain_bpe=True)
        embed.eval()
        words = torch.LongTensor([[2, 3, 4, 1]])
        result = embed(words)
        self.assertEqual(result.size(), (1, 4, 16))

        # Sequences that are too long should be truncated automatically instead of raising an error
        embed = RobertaEmbedding(vocab, model_dir_or_name=weight_path, word_dropout=0.1,
                                 only_use_pretrain_bpe=True, auto_truncate=True)
        words = torch.LongTensor([[2, 3, 4, 1] * 10,
                                  [2, 3] + [0] * 38])
        result = embed(words)
        self.assertEqual(result.size(), (2, 40, 16))

    def test_roberta_ebembedding_2(self):
        # Check that only_use_pretrain_bpe and truncate_embed work as expected
        Embedding = RobertaEmbedding
        weight_path = 'test/data_for_tests/embedding/small_roberta'
        vocab = Vocabulary().add_word_lst("this is a texta and".split())
        embed1 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab)-1 + 2  # exclude NotInBERT; add ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed1.model.tokenzier.vocab))

        embed2 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=True, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word  # exclude NotInBERT
        # self.assertEqual(embed_bpe_vocab_size, len(embed2.model.tokenzier.vocab))

        embed3 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=True, min_freq=1)
        # embed_bpe_vocab_size = len(vocab)+2  # add ##a and [CLS]
        # self.assertEqual(embed_bpe_vocab_size, len(embed3.model.tokenzier.vocab))

        embed4 = Embedding(vocab, model_dir_or_name=weight_path, layers=list(range(3)),
                           only_use_pretrain_bpe=False, truncate_embed=False, min_freq=1)
        # embed_bpe_vocab_size = num_word+1  # add ##a
        # self.assertEqual(embed_bpe_vocab_size, len(embed4.model.tokenzier.vocab))

        # In all of these configurations the following tensors should be equal
        embed1.eval()
        embed2.eval()
        embed3.eval()
        embed4.eval()
        tensor = torch.LongTensor([[vocab.to_index(w) for w in 'this is a texta and'.split()]])
        t1 = embed1(tensor)
        t2 = embed2(tensor)
        t3 = embed3(tensor)
        t4 = embed4(tensor)
        self.assertEqual((t1 - t2).sum(), 0)
        self.assertEqual((t1 - t3).sum(), 0)
        self.assertEqual((t1 - t4).sum(), 0)
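

# Conventional unittest entry point so the file can also be run directly with
# `python test_roberta_embedding.py`; normally the tests are collected by the test runner.
if __name__ == '__main__':
    unittest.main()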