|
123456789101112131415161718192021222324 |
- import unittest
-
-
- from fastNLP.modules import BertTokenizer
-
-
class TestBertTokenizer(unittest.TestCase):
    """Checks for ``BertTokenizer.encode`` against the small test BERT fixture."""

    def test_run(self):
        """Encode a sentence in both accepted input forms, with and without special tokens.

        ``encode`` is called with a raw string and with a pre-split list of
        words.  For each form, the default call is compared against a call
        with ``add_special_tokens=False``: the default output is expected to
        be exactly two ids longer (the leading CLS id and the trailing SEP
        id, which are also checked directly on the string-input result).
        """
        # Exercise the two supported encode input forms (raw string / word list).
        tokenizer = BertTokenizer.from_pretrained('test/data_for_tests/embedding/small_bert')

        tokens1 = tokenizer.encode("This is a demo")
        # Without this flag the call is identical to the one above and the
        # "two ids shorter" assertion below can never hold.
        tokens2 = tokenizer.encode("This is a demo", add_special_tokens=False)
        tokens3 = tokenizer.encode("This is a demo".split())
        tokens4 = tokenizer.encode("This is a demo".split(), add_special_tokens=False)

        # Special tokens account for exactly two extra ids in the default output.
        self.assertEqual(len(tokens1) - 2, len(tokens2))
        self.assertEqual(len(tokens3) - 2, len(tokens4))

        # The default output is wrapped as CLS ... SEP.
        self.assertEqual(tokens1[0], tokenizer.cls_index)
        self.assertEqual(tokens1[-1], tokenizer.sep_index)
-
-
-
|