# model_runner.py
"""
Helper module for conveniently smoke-testing models.

If your model does text classification, sequence labelling, or natural
language inference (NLI), it can be tested with this module directly.
For any other kind of model, prepare your own fake data together with a
loss and metric and test it via :meth:`ModelRunner.run_model`.

The tests here only guarantee that a model can be trained and evaluated
with fastNLP; they make no claim about the model's actual performance.

Example::

    # import the ALL-CAPS module constants ...
    from model_runner import *

    # test a text-classification model
    init_emb = (VOCAB_SIZE, 50)
    model = SomeModel(init_emb, num_cls=NUM_CLS)
    RUNNER.run_model_with_task(TEXT_CLS, model)

    # sequence-labelling model
    RUNNER.run_model_with_task(POS_TAGGING, model)

    # NLI model
    RUNNER.run_model_with_task(NLI, model)

    # custom model
    RUNNER.run_model(model, data=get_mydata(),
                     loss=Myloss(), metrics=Mymetric())
"""
from fastNLP.envs.imports import _NEED_IMPORT_TORCH

if _NEED_IMPORT_TORCH:
    from torch import optim

from fastNLP import Trainer, Evaluator, DataSet, Callback
from fastNLP import Accuracy
from random import randrange
from fastNLP import TorchDataLoader

# Parameters of the generated fake data.
VOCAB_SIZE = 100  # fake vocabulary size (index 0 is reserved for padding)
NUM_CLS = 100     # number of target classes / tags
MAX_LEN = 10      # maximum fake sequence length
N_SAMPLES = 100   # number of samples per fake dataset
N_EPOCHS = 1      # one epoch only: we merely smoke-test trainability
BATCH_SIZE = 5

# Task identifiers accepted by ModelRunner.run_model_with_task().
TEXT_CLS = 'text_cls'
POS_TAGGING = 'pos_tagging'
NLI = 'nli'
  37. class ModelRunner():
  38. class Checker(Callback):
  39. def on_backward_begin(self, trainer, outputs):
  40. assert outputs['loss'].to('cpu').numpy().isfinate()
  41. def gen_seq(self, length, vocab_size):
  42. """generate fake sequence indexes with given length"""
  43. # reserve 0 for padding
  44. return [randrange(1, vocab_size) for _ in range(length)]
  45. def gen_var_seq(self, max_len, vocab_size):
  46. """generate fake sequence indexes in variant length"""
  47. length = randrange(3, max_len) # at least 3 words in a seq
  48. return self.gen_seq(length, vocab_size)
  49. def prepare_text_classification_data(self):
  50. index = 'index'
  51. ds = DataSet({index: list(range(N_SAMPLES))})
  52. ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
  53. field_name=index, new_field_name='words')
  54. ds.apply_field(lambda x: randrange(NUM_CLS),
  55. field_name=index, new_field_name='target')
  56. ds.apply_field(len, 'words', 'seq_len')
  57. dl = TorchDataLoader(ds, batch_size=BATCH_SIZE)
  58. return dl
  59. def prepare_pos_tagging_data(self):
  60. index = 'index'
  61. ds = DataSet({index: list(range(N_SAMPLES))})
  62. ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
  63. field_name=index, new_field_name='words')
  64. ds.apply_field(lambda x: self.gen_seq(len(x), NUM_CLS),
  65. field_name='words', new_field_name='target')
  66. ds.apply_field(len, 'words', 'seq_len')
  67. dl = TorchDataLoader(ds, batch_size=BATCH_SIZE)
  68. return dl
  69. def prepare_nli_data(self):
  70. index = 'index'
  71. ds = DataSet({index: list(range(N_SAMPLES))})
  72. ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
  73. field_name=index, new_field_name='words1')
  74. ds.apply_field(lambda x: self.gen_var_seq(MAX_LEN, VOCAB_SIZE),
  75. field_name=index, new_field_name='words2')
  76. ds.apply_field(lambda x: randrange(NUM_CLS),
  77. field_name=index, new_field_name='target')
  78. ds.apply_field(len, 'words1', 'seq_len1')
  79. ds.apply_field(len, 'words2', 'seq_len2')
  80. dl = TorchDataLoader(ds, batch_size=BATCH_SIZE)
  81. return dl
  82. def run_text_classification(self, model, data=None):
  83. if data is None:
  84. data = self.prepare_text_classification_data()
  85. metric = Accuracy()
  86. self.run_model(model, data, metric)
  87. def run_pos_tagging(self, model, data=None):
  88. if data is None:
  89. data = self.prepare_pos_tagging_data()
  90. metric = Accuracy()
  91. self.run_model(model, data, metric)
  92. def run_nli(self, model, data=None):
  93. if data is None:
  94. data = self.prepare_nli_data()
  95. metric = Accuracy()
  96. self.run_model(model, data, metric)
  97. def run_model(self, model, data, metrics):
  98. """run a model, test if it can run with fastNLP"""
  99. print('testing model:', model.__class__.__name__)
  100. tester = Evaluator(model, data, metrics={'metric': metrics}, driver='torch')
  101. before_train = tester.run()
  102. optimizer = optim.SGD(model.parameters(), lr=1e-3)
  103. trainer = Trainer(model, driver='torch', train_dataloader=data,
  104. n_epochs=N_EPOCHS, optimizers=optimizer)
  105. trainer.run()
  106. after_train = tester.run()
  107. for metric_name, v1 in before_train.items():
  108. assert metric_name in after_train
  109. # # at least we can sure model params changed, even if we don't know performance
  110. # v2 = after_train[metric_name]
  111. # assert v1 != v2
  112. def run_model_with_task(self, task, model):
  113. """run a model with certain task"""
  114. TASKS = {
  115. TEXT_CLS: self.run_text_classification,
  116. POS_TAGGING: self.run_pos_tagging,
  117. NLI: self.run_nli,
  118. }
  119. assert task in TASKS
  120. TASKS[task](model)
  121. RUNNER = ModelRunner()