
Merge pull request #132 from FengZiYjun/v0.3.1

fastNLP V0.3.1
tags/v0.4.10
Coet · GitHub · 6 years ago · parent commit 13faa2b410
70 changed files with 3957 additions and 4832 deletions
  1. +5 -0 codecov.yml
  2. +3 -2 docs/source/tutorials/fastnlp_10tmin_tutorial.rst
  3. +2 -0 docs/source/tutorials/fastnlp_1_minute_tutorial.rst
  4. +5 -0 docs/source/tutorials/fastnlp_advanced_tutorial.rst
  5. +5 -0 docs/source/tutorials/fastnlp_developer_guide.rst
  6. +1 -0 docs/source/user/installation.rst
  7. +2 -0 docs/source/user/quickstart.rst
  8. +11 -10 fastNLP/api/README.md
  9. +86 -43 fastNLP/api/api.py
  10. +27 -0 fastNLP/api/examples.py
  11. +108 -1 fastNLP/api/processor.py
  12. +76 -17 fastNLP/core/batch.py
  13. +272 -39 fastNLP/core/callback.py
  14. +50 -17 fastNLP/core/dataset.py
  15. +350 -86 fastNLP/core/fieldarray.py
  16. +5 -1 fastNLP/core/instance.py
  17. +45 -23 fastNLP/core/predictor.py
  18. +62 -65 fastNLP/core/trainer.py
  19. +3 -2 fastNLP/core/utils.py
  20. +6 -0 fastNLP/io/base_loader.py
  21. +1 -8 fastNLP/io/config_io.py
  22. +349 -49 fastNLP/io/dataset_loader.py
  23. +4 -1 fastNLP/io/embed_loader.py
  24. +362 -0 fastNLP/models/bert.py
  25. +73 -57 fastNLP/models/biaffine_parser.py
  26. +67 -23 fastNLP/modules/aggregator/attention.py
  27. +39 -12 fastNLP/modules/encoder/transformer.py
  28. +8 -9 reproduction/Biaffine_parser/cfg.cfg
  29. +1 -1 reproduction/Biaffine_parser/main.py
  30. +58 -71 reproduction/Biaffine_parser/run.py
  31. +0 -51 reproduction/Biaffine_parser/util.py
  32. +0 -0 reproduction/Chinese_word_segmentation/__init__.py
  33. +0 -0 reproduction/Chinese_word_segmentation/cws.cfg
  34. +0 -0 reproduction/Chinese_word_segmentation/cws_io/__init__.py
  35. +3 -0 reproduction/Chinese_word_segmentation/cws_io/cws_reader.py
  36. +0 -0 reproduction/Chinese_word_segmentation/models/__init__.py
  37. +4 -4 reproduction/Chinese_word_segmentation/models/cws_model.py
  38. +125 -0 reproduction/Chinese_word_segmentation/models/cws_transformer.py
  39. +0 -0 reproduction/Chinese_word_segmentation/process/__init__.py
  40. +1 -104 reproduction/Chinese_word_segmentation/process/cws_processor.py
  41. +0 -0 reproduction/Chinese_word_segmentation/process/span_converter.py
  42. +0 -0 reproduction/Chinese_word_segmentation/utils.py
  43. +1 -1 reproduction/LSTM+self_attention_sentiment_analysis/main.py
  44. +0 -0 reproduction/POS_tagging/pos_processor.py
  45. +29 -0 reproduction/POS_tagging/pos_reader.py
  46. +1 -1 reproduction/POS_tagging/pos_tag.cfg
  47. +163 -0 reproduction/POS_tagging/train_pos_tag.py
  48. +0 -0 reproduction/POS_tagging/utils.py
  49. +0 -0 reproduction/__init__.py
  50. +0 -197 reproduction/chinese_word_segment/cws_io/cws_reader.py
  51. +0 -151 reproduction/chinese_word_segment/run.py
  52. +0 -153 reproduction/pos_tag_model/pos_reader.py
  53. +0 -113 reproduction/pos_tag_model/train_pos_tag.py
  54. +48 -2 test/api/test_processor.py
  55. +144 -0 test/core/test_batch.py
  56. +112 -18 test/core/test_callbacks.py
  57. +18 -2 test/core/test_dataset.py
  58. +125 -6 test/core/test_fieldarray.py
  59. +21 -6 test/core/test_predictor.py
  60. +29 -3 test/core/test_trainer.py
  61. +0 -3370 test/data_for_tests/charlm.txt
  62. +0 -2 test/data_for_tests/people_infer.txt
  63. +100 -0 test/data_for_tests/zh_sample.conllx
  64. +16 -13 test/io/test_dataset_loader.py
  65. +21 -0 test/models/test_bert.py
  66. +11 -7 test/models/test_biaffine_parser.py
  67. +0 -91 test/test_tutorial.py
  68. +432 -0 test/test_tutorials.py
  69. +370 -0 tutorials/fastNLP_padding_tutorial.ipynb
  70. +97 -0 tutorials/fastnlp_test_tutorial.ipynb

+5 -0 codecov.yml

@@ -0,0 +1,5 @@
ignore:
- "reproduction" # ignore folders and all its contents
- "setup.py"
- "docs"
- "tutorials"

+3 -2 docs/source/tutorials/fastnlp_10tmin_tutorial.rst

@@ -1,7 +1,8 @@

fastNLP上手教程
fastNLP 10分钟上手教程
===============

教程原文见 https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_10min_tutorial.ipynb

fastNLP提供方便的数据预处理,训练和测试模型的功能

DataSet & Instance


+2 -0 docs/source/tutorials/fastnlp_1_minute_tutorial.rst

@@ -2,6 +2,8 @@
FastNLP 1分钟上手教程
=====================

教程原文见 https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_1min_tutorial.ipynb

step 1
------



+5 -0 docs/source/tutorials/fastnlp_advanced_tutorial.rst

@@ -0,0 +1,5 @@
fastNLP 进阶教程
===============

教程原文见 https://github.com/fastnlp/fastNLP/blob/master/tutorials/fastnlp_advanced_tutorial/advance_tutorial.ipynb


+5 -0 docs/source/tutorials/fastnlp_developer_guide.rst

@@ -0,0 +1,5 @@
fastNLP 开发者指南
===============

原文见 https://github.com/fastnlp/fastNLP/blob/master/tutorials/tutorial_for_developer.md


+1 -0 docs/source/user/installation.rst

@@ -5,6 +5,7 @@ Installation
.. contents::
:local:

Make sure your environment satisfies https://github.com/fastnlp/fastNLP/blob/master/requirements.txt .

Run the following commands to install fastNLP package:



+2 -0 docs/source/user/quickstart.rst

@@ -6,4 +6,6 @@ Quickstart

../tutorials/fastnlp_1_minute_tutorial
../tutorials/fastnlp_10tmin_tutorial
../tutorials/fastnlp_advanced_tutorial
../tutorials/fastnlp_developer_guide


+11 -10 fastNLP/api/README.md

@@ -18,26 +18,27 @@ print(cws.predict(text))
# ['编者 按 : 7月 12日 , 英国 航空 航天 系统 公司 公布 了 该 公司 研制 的 第一 款 高 科技 隐形 无人 机雷电 之 神 。', '这 款 飞行 从 外型 上 来 看 酷似 电影 中 的 太空 飞行器 , 据 英国 方面 介绍 , 可以 实现 洲际 远程 打击 。', '那么 这 款 无人 机 到底 有 多 厉害 ?']
```

### 中文分词+词性标注
### 词性标注
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
# 输入已分词序列
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import POS
pos = POS(device='cpu')
print(pos.predict(text))
# [['编者/NN', '按/P', ':/PU', '7月/NT', '12日/NR', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一/OD', '款高/NN', '科技/NN', '隐形/NN', '无/VE', '人机/NN', '雷电/NN', '之/DEG', '神/NN', '。/PU'], ['这/DT', '款/NN', '飞行/VV', '从/P', '外型/NN', '上/LC', '来/MSP', '看/VV', '酷似/VV', '电影/NN', '中/LC', '的/DEG', '太空/NN', '飞行器/NN', ',/PU', '据/P', '英国/NR', '方面/NN', '介绍/VV', ',/PU', '可以/VV', '实现/VV', '洲际/NN', '远程/NN', '打击/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无/VE', '人机/NN', '到底/AD', '有/VE', '多/CD', '厉害/NN', '?/PU']]
# [['编者/NN', '按:/NN', '7月/NT', '12日/NT', ',/PU', '英国/NR', '航空/NN', '航天/NN', '系统/NN', '公司/NN', '公布/VV', '了/AS', '该/DT', '公司/NN', '研制/VV', '的/DEC', '第一款/NN', '高科技/NN', '隐形/AD', '无人机/VV', '雷电之神/NN', '。/PU'], ['那么/AD', '这/DT', '款/NN', '无人机/VV', '到底/AD', '有/VE', '多/AD', '厉害/VA', '?/PU']]
```

### 中文分词+词性标注+句法分析
### 句法分析
```python
text = ['编者按:7月12日,英国航空航天系统公司公布了该公司研制的第一款高科技隐形无人机雷电之神。',
'这款飞行从外型上来看酷似电影中的太空飞行器,据英国方面介绍,可以实现洲际远程打击。',
'那么这款无人机到底有多厉害?']
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
from fastNLP.api import Parser
parser = Parser(device='cpu')
print(parser.predict(text))
# [['12/nsubj', '12/prep', '2/punct', '5/nn', '2/pobj', '12/punct', '11/nn', '11/nn', '11/nn', '11/nn', '2/pobj', '0/root', '12/asp', '15/det', '16/nsubj', '21/rcmod', '16/cpm', '21/nummod', '21/nn', '21/nn', '22/top', '12/ccomp', '24/nn', '26/assmod', '24/assm', '22/dobj', '12/punct'], ['2/det', '8/xsubj', '8/mmod', '8/prep', '6/lobj', '4/plmod', '8/prtmod', '0/root', '8/ccomp', '11/lobj', '14/assmod', '11/assm', '14/nn', '9/dobj', '8/punct', '22/prep', '18/nn', '19/nsubj', '16/pccomp', '22/punct', '22/mmod', '8/dep', '25/nn', '25/nn', '22/dobj', '8/punct'], ['4/advmod', '3/det', '4/nsubj', '0/root', '4/dobj', '7/advmod', '4/conj', '9/nummod', '7/dobj', '4/punct']]
# [['2/nn', '4/nn', '4/nn', '20/tmod', '11/punct', '10/nn', '10/nn', '10/nn', '10/nn', '11/nsubj', '20/dep', '11/asp', '14/det', '15/nsubj', '18/rcmod', '15/cpm', '18/nn', '11/dobj', '20/advmod', '0/root', '20/dobj', '20/punct'], ['4/advmod', '3/det', '8/xsubj', '8/dep', '8/advmod', '8/dep', '8/advmod', '0/root', '8/punct']]
```

完整样例见`examples.py`

+86 -43 fastNLP/api/api.py

@@ -9,9 +9,7 @@ from fastNLP.core.dataset import DataSet

from fastNLP.api.utils import load_url
from fastNLP.api.processor import ModelProcessor
from reproduction.chinese_word_segment.cws_io.cws_reader import ConllCWSReader
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllCWSReader, ConllxDataLoader
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.core.metrics import SpanFPreRecMetric
@@ -19,9 +17,9 @@ from fastNLP.api.processor import IndexerProcessor

# TODO add pretrain urls
model_urls = {
"cws": "http://123.206.98.91:8888/download/cws_crf_1_11-457fc899.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190108-f3c60ee5.pkl",
"parser": "http://123.206.98.91:8888/download/biaffine_parser-3a2f052c.pkl"
"cws": "http://123.206.98.91:8888/download/cws_lstm_ctb9_1_20-09908656.pkl",
"pos": "http://123.206.98.91:8888/download/pos_tag_model_20190119-43f8b435.pkl",
"parser": "http://123.206.98.91:8888/download/parser_20190204-c72ca5c0.pkl"
}


@@ -31,6 +29,16 @@ class API:
self._dict = None

def predict(self, *args, **kwargs):
"""Do prediction for the given input.
"""
raise NotImplementedError

def test(self, file_path):
"""Test performance over the given data set.

:param str file_path:
:return: a dictionary of metric values
"""
raise NotImplementedError

def load(self, path, device):
@@ -69,12 +77,11 @@ class POS(API):
if not hasattr(self, "pipeline"):
raise ValueError("You have to load model first.")

sentence_list = []
sentence_list = content
# 1. 检查sentence的类型
if isinstance(content, str):
sentence_list.append(content)
elif isinstance(content, list):
sentence_list = content
for sentence in sentence_list:
if not all((type(obj) == str for obj in sentence)):
raise ValueError("Input must be list of list of string.")

# 2. 组建dataset
dataset = DataSet()
@@ -83,36 +90,28 @@ class POS(API):
# 3. 使用pipeline
self.pipeline(dataset)

def decode_tags(ins):
pred_tags = ins["tag"]
chars = ins["words"]
words = []
start_idx = 0
for idx, tag in enumerate(pred_tags):
if tag[0] == "S":
words.append(chars[start_idx:idx + 1] + "/" + tag[2:])
start_idx = idx + 1
elif tag[0] == "E":
words.append("".join(chars[start_idx:idx + 1]) + "/" + tag[2:])
start_idx = idx + 1
return words

dataset.apply(decode_tags, new_field_name="tag_output")

output = dataset.field_arrays["tag_output"].content
def merge_tag(words_list, tags_list):
rtn = []
for words, tags in zip(words_list, tags_list):
rtn.append([w + "/" + t for w, t in zip(words, tags)])
return rtn

output = dataset.field_arrays["tag"].content
if isinstance(content, str):
return output[0]
elif isinstance(content, list):
return output
return merge_tag(content, output)

def test(self, file_path):
test_data = ZhConllPOSReader().load(file_path)
test_data = ConllxDataLoader().load(file_path)

tag_vocab = self._dict["tag_vocab"]
pipeline = self._dict["pipeline"]
save_dict = self._dict
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline

test_data.rename_field("pos_tags", "tag")
pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
@@ -226,7 +225,7 @@ class CWS(API):
rec = eval_res['BMESF1PreRecMetric']['rec']
# print("f1:{:.2f}, pre:{:.2f}, rec:{:.2f}".format(f1, pre, rec))

return f1, pre, rec
return {"F1": f1, "precision": pre, "recall": rec}


class Parser(API):
@@ -251,6 +250,7 @@ class Parser(API):
dataset.add_field('wp', pos_out)
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[0] for w in x['wp']], new_field_name='words')
dataset.apply(lambda x: ['<BOS>'] + [w.split('/')[1] for w in x['wp']], new_field_name='pos')
dataset.rename_field("words", "raw_words")

# 3. 使用pipeline
self.pipeline(dataset)
@@ -260,31 +260,74 @@ class Parser(API):
# output like: [['2/top', '0/root', '4/nn', '2/dep']]
return dataset.field_arrays['output'].content

def test(self, filepath):
data = ConllxDataLoader().load(filepath)
ds = DataSet()
for ins1, ins2 in zip(add_seg_tag(data), data):
ds.append(Instance(words=ins1[0], tag=ins1[1],
gold_words=ins2[0], gold_pos=ins2[1],
gold_heads=ins2[2], gold_head_tags=ins2[3]))
def load_test_file(self, path):
def get_one(sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]

datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

data = [get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))
return data_list

def test(self, filepath):
data = self.load_test_file(filepath)

def convert(data):
BOS = '<BOS>'
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + sample[2]
head_tags = [BOS] + sample[3]
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
tags=head_tags))
return dataset

ds = convert(data)
pp = self.pipeline
for p in pp:
if p.field_name == 'word_list':
p.field_name = 'gold_words'
elif p.field_name == 'pos_list':
p.field_name = 'gold_pos'
# ds.rename_field("words", "raw_words")
# ds.rename_field("tag", "pos")
pp(ds)
head_cor, label_cor, total = 0, 0, 0
for ins in ds:
head_gold = ins['gold_heads']
head_pred = ins['heads']
head_pred = ins['arc_pred']
length = len(head_gold)
total += length
for i in range(length):
head_cor += 1 if head_pred[i] == head_gold[i] else 0
uas = head_cor / total
print('uas:{:.2f}'.format(uas))
# print('uas:{:.2f}'.format(uas))

for p in pp:
if p.field_name == 'gold_words':
@@ -292,7 +335,7 @@ class Parser(API):
elif p.field_name == 'gold_pos':
p.field_name = 'pos_list'

return uas
return {"USA": round(uas, 5)}


class Analyzer:
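
The `test()` methods above now return dictionaries of metric values instead of bare tuples. A short sketch of how the updated evaluation entry points might be called, assuming `CWS` and `Parser` are importable from `fastNLP.api` as in the README, network access is available for the pretrained-model download, and the sample `.conllx` file added in this PR is used:

```python
# Sketch only: CWS.test / Parser.test now return dicts of metric values,
# as shown in the diff above.
from fastNLP.api import CWS, Parser

cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))
# -> {"F1": ..., "precision": ..., "recall": ...}

parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))
# -> {"USA": ...}   (key spelling taken verbatim from the diff)
```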


+27 -0 fastNLP/api/examples.py

@@ -15,15 +15,42 @@ def chinese_word_segmentation():
print(cws.predict(text))


def chinese_word_segmentation_test():
cws = CWS(device='cpu')
print(cws.test("../../test/data_for_tests/zh_sample.conllx"))


def pos_tagging():
# 输入已分词序列
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
pos = POS(device='cpu')
print(pos.predict(text))


def pos_tagging_test():
pos = POS(device='cpu')
print(pos.test("../../test/data_for_tests/zh_sample.conllx"))


def syntactic_parsing():
text = [['编者', '按:', '7月', '12日', ',', '英国', '航空', '航天', '系统', '公司', '公布', '了', '该', '公司',
'研制', '的', '第一款', '高科技', '隐形', '无人机', '雷电之神', '。'],
['那么', '这', '款', '无人机', '到底', '有', '多', '厉害', '?']]
parser = Parser(device='cpu')
print(parser.predict(text))


def syntactic_parsing_test():
parser = Parser(device='cpu')
print(parser.test("../../test/data_for_tests/zh_sample.conllx"))


if __name__ == "__main__":
# chinese_word_segmentation()
# chinese_word_segmentation_test()
# pos_tagging()
# pos_tagging_test()
syntactic_parsing()
# syntactic_parsing_test()

+108 -1 fastNLP/api/processor.py

@@ -102,6 +102,7 @@ class PreAppendProcessor(Processor):
[data] + instance[field_name]

"""

def __init__(self, data, field_name, new_added_field_name=None):
super(PreAppendProcessor, self).__init__(field_name, new_added_field_name)
self.data = data
@@ -116,6 +117,7 @@ class SliceProcessor(Processor):
从某个field中只取部分内容。等价于instance[field_name][start:end:step]

"""

def __init__(self, start, end, step, field_name, new_added_field_name=None):
super(SliceProcessor, self).__init__(field_name, new_added_field_name)
for o in (start, end, step):
@@ -132,6 +134,7 @@ class Num2TagProcessor(Processor):
将一句话中的数字转换为某个tag。

"""

def __init__(self, tag, field_name, new_added_field_name=None):
"""

@@ -163,6 +166,7 @@ class IndexerProcessor(Processor):
给定一个vocabulary , 将指定field转换为index形式。指定field应该是一维的list,比如
['我', '是', xxx]
"""

def __init__(self, vocab, field_name, new_added_field_name, delete_old_field=False, is_input=True):

assert isinstance(vocab, Vocabulary), "Only Vocabulary class is allowed, not {}.".format(type(vocab))
@@ -215,6 +219,7 @@ class SeqLenProcessor(Processor):
根据某个field新增一个sequence length的field。取该field的第一维

"""

def __init__(self, field_name, new_added_field_name='seq_lens', is_input=True):
super(SeqLenProcessor, self).__init__(field_name, new_added_field_name)
self.is_input = is_input
@@ -229,6 +234,7 @@ class SeqLenProcessor(Processor):

from fastNLP.core.utils import _build_args


class ModelProcessor(Processor):
def __init__(self, model, seq_len_field_name='seq_lens', batch_size=32):
"""
@@ -292,6 +298,7 @@ class Index2WordProcessor(Processor):
将DataSet中某个为index的field根据vocab转换为str

"""

def __init__(self, vocab, field_name, new_added_field_name):
super(Index2WordProcessor, self).__init__(field_name, new_added_field_name)
self.vocab = vocab
@@ -303,7 +310,6 @@ class Index2WordProcessor(Processor):


class SetTargetProcessor(Processor):
# TODO; remove it.
def __init__(self, *fields, flag=True):
super(SetTargetProcessor, self).__init__(None, None)
self.fields = fields
@@ -313,6 +319,7 @@ class SetTargetProcessor(Processor):
dataset.set_target(*self.fields, flag=self.flag)
return dataset


class SetInputProcessor(Processor):
def __init__(self, *fields, flag=True):
super(SetInputProcessor, self).__init__(None, None)
@@ -322,3 +329,103 @@ class SetInputProcessor(Processor):
def process(self, dataset):
dataset.set_input(*self.fields, flag=self.flag)
return dataset


class VocabIndexerProcessor(Processor):
"""
根据DataSet创建Vocabulary,并将其用数字index。新生成的index的field会被放在new_added_filed_name, 如果没有提供
new_added_field_name, 则覆盖原有的field_name.

"""

def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
verbose=0, is_input=True):
"""

:param field_name: 从哪个field_name创建词表,以及对哪个field_name进行index操作
:param new_added_filed_name: index时,生成的index field的名称,如果不传入,则覆盖field_name.
:param min_freq: 创建的Vocabulary允许的单词最少出现次数.
:param max_size: 创建的Vocabulary允许的最大的单词数量
:param verbose: 0, 不输出任何信息;1,输出信息
:param bool is_input:
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
self.min_freq = min_freq
self.max_size = max_size

self.verbose = verbose
self.is_input = is_input

def construct_vocab(self, *datasets):
"""
使用传入的DataSet创建vocabulary

:param datasets: DataSet类型的数据,用于构建vocabulary
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

def process(self, *datasets, only_index_dataset=None):
"""
若还未建立Vocabulary,则使用dataset中的DataSet建立vocabulary;若已经有了vocabulary则使用已有的vocabulary。得到vocabulary
后,则会index datasets与only_index_dataset。

:param datasets: DataSet类型的数据
:param only_index_dataset: DataSet, or list of DataSet. 该参数中的内容只会被用于index,不会被用于生成vocabulary。
:return:
"""
if len(datasets) == 0 and not hasattr(self, 'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets) != 0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)

if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name, is_input=self.is_input)
# 只返回一个,infer时为了跟其他processor保持一致
if len(to_index_datasets) == 1:
return to_index_datasets[0]

def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab

def delete_vocab(self):
del self.vocab

def get_vocab_size(self):
return len(self.vocab)

def set_verbose(self, verbose):
"""
设置processor verbose状态。

:param verbose: int, 0,不输出任何信息;1,输出vocab 信息。
:return:
"""
self.verbose = verbose
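
Based on the signature of the new `VocabIndexerProcessor` above, a minimal usage sketch: build the vocabulary from a training set, then index both a training and a dev set. The field names and contents are illustrative only; dict-style `DataSet` construction follows the cases documented in the fieldarray.py diff below.

```python
# Minimal sketch of VocabIndexerProcessor usage (illustrative data).
from fastNLP.core.dataset import DataSet
from fastNLP.api.processor import VocabIndexerProcessor

train_ds = DataSet({"words": [["this", "is", "a", "demo"], ["another", "one"]]})
dev_ds = DataSet({"words": [["one", "more", "demo"]]})

proc = VocabIndexerProcessor(field_name="words", new_added_filed_name="word_index",
                             min_freq=1, verbose=1)
# Positional datasets build the vocabulary and are indexed;
# only_index_dataset is indexed but not used for vocabulary construction.
proc.process(train_ds, only_index_dataset=dev_ds)

print(proc.get_vocab_size())
print(train_ds.field_arrays["word_index"].content)
```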

+76 -17 fastNLP/core/batch.py

@@ -2,7 +2,7 @@ import numpy as np
import torch

from fastNLP.core.sampler import RandomSampler
import torch.multiprocessing as mp

class Batch(object):
"""Batch is an iterable object which iterates over mini-batches.
@@ -16,10 +16,11 @@ class Batch(object):
:param int batch_size: the size of the batch
:param Sampler sampler: a Sampler object
:param bool as_numpy: If True, return Numpy array. Otherwise, return torch tensors.

:param bool prefetch: If True, use multiprocessing to fetch next batch when training.
:param str or torch.device device: the batch's device, if as_numpy is True, device is ignored.
"""

def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False):
def __init__(self, dataset, batch_size, sampler=RandomSampler(), as_numpy=False, prefetch=False):
self.dataset = dataset
self.batch_size = batch_size
self.sampler = sampler
@@ -28,16 +29,12 @@ class Batch(object):
self.curidx = 0
self.num_batches = len(dataset) // batch_size + int(len(dataset) % batch_size != 0)
self.cur_batch_indices = None
self.prefetch = prefetch
self.lengths = 0

def __iter__(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()
return self

def __next__(self):
def fetch_one(self):
if self.curidx >= len(self.idx_list):
raise StopIteration
return None
else:
endidx = min(self.curidx + self.batch_size, len(self.idx_list))
batch_x, batch_y = {}, {}
@@ -48,7 +45,7 @@ class Batch(object):
for field_name, field in self.dataset.get_all_fields().items():
if field.is_target or field.is_input:
batch = field.get(indices)
if not self.as_numpy:
if not self.as_numpy and field.padder is not None:
batch = to_tensor(batch, field.dtype)
if field.is_target:
batch_y[field_name] = batch
@@ -56,9 +53,29 @@ class Batch(object):
batch_x[field_name] = batch

self.curidx = endidx

return batch_x, batch_y

def __iter__(self):
"""
Iterate on dataset, fetch batch data. Fetch process don't block the iterate process
:return:
"""
if self.prefetch:
return run_batch_iter(self)
def batch_iter():
self.init_iter()
while 1:
res = self.fetch_one()
if res is None:
break
yield res
return batch_iter()

def init_iter(self):
self.idx_list = self.sampler(self.dataset)
self.curidx = 0
self.lengths = self.dataset.get_length()

def __len__(self):
return self.num_batches

@@ -67,8 +84,50 @@ class Batch(object):


def to_tensor(batch, dtype):
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
try:
if dtype in (int, np.int8, np.int16, np.int32, np.int64):
batch = torch.LongTensor(batch)
if dtype in (float, np.float32, np.float64):
batch = torch.FloatTensor(batch)
except:
pass
return batch


def run_fetch(batch, q):
batch.init_iter()
# print('start fetch')
while 1:
res = batch.fetch_one()
# print('fetch one')
q.put(res)
if res is None:
# print('fetch done, waiting processing')
q.join()
break
# print('fetch exit')


def run_batch_iter(batch):
q = mp.JoinableQueue(maxsize=10)
fetch_p = mp.Process(target=run_fetch, args=(batch, q))
fetch_p.daemon = True
fetch_p.start()
# print('fork fetch process')
while 1:
try:
res = q.get(timeout=1)
q.task_done()
# print('get fetched')
if res is None:
break
yield res
except Exception as e:
if fetch_p.is_alive():
continue
else:
break
fetch_p.terminate()
fetch_p.join()
# print('iter done')
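
A short sketch of the extended `Batch` interface above (dataset contents are made up): `prefetch=True` switches `__iter__` to the background-process path implemented by `run_batch_iter`.

```python
# Sketch only: iterate a DataSet in mini-batches with the new prefetch flag.
from fastNLP.core.dataset import DataSet
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

ds = DataSet({"x": [[1, 2], [3, 4, 5], [6]], "y": [0, 1, 0]})
ds.set_input("x")
ds.set_target("y")

data_iter = Batch(ds, batch_size=2, sampler=RandomSampler(), as_numpy=False,
                  prefetch=False)  # flip to True for background prefetching
for batch_x, batch_y in data_iter:
    # "x" is padded by the field's AutoPadder and converted to a LongTensor
    print(batch_x["x"].shape, batch_y["y"])
```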


+272 -39 fastNLP/core/callback.py

@@ -1,3 +1,11 @@
import os

import torch
from tensorboardX import SummaryWriter

from fastNLP.io.model_io import ModelSaver, ModelLoader


class Callback(object):
"""An Interface for all callbacks.

@@ -7,38 +15,42 @@ class Callback(object):

def __init__(self):
super(Callback, self).__init__()
self.trainer = None # 在Trainer内部被重新赋值

def before_train(self):
def on_train_begin(self):
# before the main training loop
pass

def before_epoch(self, cur_epoch, total_epoch):
def on_epoch_begin(self, cur_epoch, total_epoch):
# at the beginning of each epoch
pass

def before_batch(self, batch_x, batch_y, indices):
def on_batch_begin(self, batch_x, batch_y, indices):
# at the beginning of each step/mini-batch
pass

def before_loss(self, batch_y, predict_y):
def on_loss_begin(self, batch_y, predict_y):
# after data_forward, and before loss computation
pass

def before_backward(self, loss, model):
def on_backward_begin(self, loss, model):
# after loss computation, and before gradient backward
pass

def after_backward(self, model):
def on_backward_end(self, model):
pass

def after_step(self, optimizer):
def on_step_end(self, optimizer):
pass

def after_batch(self, *args):
def on_batch_end(self, *args):
# at the end of each step/mini-batch
pass

def after_valid(self, eval_result, metric_key, optimizer):
def on_valid_begin(self):
pass

def on_valid_end(self, eval_result, metric_key, optimizer):
"""
每次执行验证机的evaluation后会调用。传入eval_result

@@ -49,7 +61,7 @@ class Callback(object):
"""
pass

def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
"""
每个epoch结束将会调用该方法

@@ -60,7 +72,7 @@ class Callback(object):
"""
pass

def after_train(self, model):
def on_train_end(self, model):
"""
训练结束,调用该方法

@@ -69,16 +81,16 @@ class Callback(object):
"""
pass

def on_exception(self, exception, model, indices):
def on_exception(self, exception, model):
"""
当训练过程出现异常,会触发该方法
:param exception: 某种类型的Exception,比如KeyboardInterrupt等
:param model: 传入Trainer的模型
:param indices: 当前batch的index
:return:
"""
pass


def transfer(func):
"""装饰器,将对CallbackManager的调用转发到各个Callback子类.

@@ -125,91 +137,95 @@ class CallbackManager(Callback):
raise TypeError(f"Expect callbacks in CallbackManager(callbacks) to be list. Got {type(callbacks)}.")

@transfer
def before_train(self):
def on_train_begin(self):
pass

@transfer
def on_epoch_begin(self, cur_epoch, total_epoch):
pass

@transfer
def before_epoch(self, cur_epoch, total_epoch):
def on_batch_begin(self, batch_x, batch_y, indices):
pass

@transfer
def before_batch(self, batch_x, batch_y, indices):
def on_loss_begin(self, batch_y, predict_y):
pass

@transfer
def before_loss(self, batch_y, predict_y):
def on_backward_begin(self, loss, model):
pass

@transfer
def before_backward(self, loss, model):
def on_backward_end(self, model):
pass

@transfer
def after_backward(self, model):
def on_step_end(self, optimizer):
pass

@transfer
def after_step(self, optimizer):
def on_batch_end(self):
pass

@transfer
def after_batch(self):
def on_valid_begin(self):
pass

@transfer
def after_valid(self, eval_result, metric_key, optimizer):
def on_valid_end(self, eval_result, metric_key, optimizer):
pass

@transfer
def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
pass

@transfer
def after_train(self, model):
def on_train_end(self, model):
pass

@transfer
def on_exception(self, exception, model, indices):
def on_exception(self, exception, model):
pass


class DummyCallback(Callback):
def before_train(self, *arg):
def on_train_begin(self, *arg):
print(arg)

def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
print(cur_epoch, n_epoch, optimizer)


class EchoCallback(Callback):
def before_train(self):
def on_train_begin(self):
print("before_train")

def before_epoch(self, cur_epoch, total_epoch):
def on_epoch_begin(self, cur_epoch, total_epoch):
print("before_epoch")

def before_batch(self, batch_x, batch_y, indices):
def on_batch_begin(self, batch_x, batch_y, indices):
print("before_batch")

def before_loss(self, batch_y, predict_y):
def on_loss_begin(self, batch_y, predict_y):
print("before_loss")

def before_backward(self, loss, model):
def on_backward_begin(self, loss, model):
print("before_backward")

def after_batch(self):
def on_batch_end(self):
print("after_batch")

def after_epoch(self, cur_epoch, n_epoch, optimizer):
def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
print("after_epoch")

def after_train(self, model):
def on_train_end(self, model):
print("after_train")


class GradientClipCallback(Callback):
def __init__(self, parameters=None, clip_value=1, clip_type='norm'):
"""
每次backward前,将parameter的gradient clip到某个范围。
"""每次backward前,将parameter的gradient clip到某个范围。

:param parameters: None, torch.Tensor或List[torch.Tensor], 一般通过model.parameters()获得。如果为None则默认对Trainer
的model中所有参数进行clip
@@ -231,12 +247,229 @@ class GradientClipCallback(Callback):
self.parameters = parameters
self.clip_value = clip_value

def after_backward(self, model):
def on_backward_end(self, model):
self.clip_fun(model.parameters(), self.clip_value)


class CallbackException(BaseException):
def __init__(self, msg):
super(CallbackException, self).__init__(msg)


class EarlyStopError(CallbackException):
def __init__(self, msg):
super(EarlyStopError, self).__init__(msg)


class EarlyStopCallback(Callback):
def __init__(self, patience):
"""

:param int patience: 停止之前等待的epoch数
"""
super(EarlyStopCallback, self).__init__()
self.trainer = None # override by CallbackManager
self.patience = patience
self.wait = 0
self.epoch = 0

def on_valid_end(self, eval_result, metric_key, optimizer):
self.epoch += 1
if not self.trainer._better_eval_result(eval_result):
# current result is getting worse
if self.wait == self.patience:
raise EarlyStopError("Early stopping raised.")
else:
self.wait += 1
else:
self.wait = 0

def on_exception(self, exception, model):
if isinstance(exception, EarlyStopError):
print("Early Stopping triggered in epoch {}!".format(self.epoch))
else:
raise exception # 抛出陌生Error


class LRScheduler(Callback):
def __init__(self, lr_scheduler):
"""对PyTorch LR Scheduler的包装

:param lr_scheduler: PyTorch的lr_scheduler
"""
super(LRScheduler, self).__init__()
import torch.optim
if isinstance(lr_scheduler, torch.optim.lr_scheduler._LRScheduler):
self.scheduler = lr_scheduler
else:
raise ValueError(f"Expect torch.optim.lr_scheduler for LRScheduler. Got {type(lr_scheduler)}.")

def on_epoch_begin(self, cur_epoch, total_epoch):
self.scheduler.step()
print("scheduler step ", "lr=", self.trainer.optimizer.param_groups[0]["lr"])


class ControlC(Callback):
def __init__(self, quit_all):
"""

:param quit_all: 若为True,则检测到control+C 直接退出程序;否则只退出Trainer
"""
super(ControlC, self).__init__()
if type(quit_all) != bool:
raise ValueError("In KeyBoardInterrupt, quit_all arguemnt must be a bool.")
self.quit_all = quit_all

def on_exception(self, exception, model):
if isinstance(exception, KeyboardInterrupt):
if self.quit_all is True:
import sys
sys.exit(0) # 直接退出程序
else:
pass
else:
raise exception # 抛出陌生Error


class SmoothValue(object):
def __init__(self, beta: float):
self.beta, self.n, self.mov_avg = beta, 0, 0
self.smooth = None

def add_value(self, val: float) -> None:
"Add `val` to calculate updated smoothed value."
self.n += 1
self.mov_avg = self.beta * self.mov_avg + (1 - self.beta) * val
self.smooth = self.mov_avg / (1 - self.beta ** self.n)


class LRFinder(Callback):
def __init__(self, n_batch, start_lr=1e-6, end_lr=10):
"""用第一个 epoch 找最佳的学习率,从第二个epoch开始应用它

:param n_batch: 一个epoch内的iteration数
:param start_lr: 学习率下界
:param end_lr: 学习率上界
"""
super(LRFinder, self).__init__()
self.start_lr, self.end_lr = start_lr, end_lr
self.num_it = n_batch
self.stop = False
self.best_loss = 0.
self.best_lr = None
self.loss_history = []
self.smooth_value = SmoothValue(0.8)
self.opt = None
scale = (self.end_lr - self.start_lr) / self.num_it

self.lr_gen = (self.start_lr + scale * (step + 1) for step in range(self.num_it))
self.find = None
self.loader = ModelLoader()

def on_epoch_begin(self, cur_epoch, total_epoch):
if cur_epoch == 1:
self.opt = self.trainer.optimizer # pytorch optimizer
self.opt.param_groups[0]["lr"] = self.start_lr
# save model
ModelSaver("tmp").save_pytorch(self.trainer.model, param_only=True)
self.find = True

def on_backward_begin(self, loss, model):
if self.find:
if torch.isnan(loss) or self.stop is True:
self.stop = True
return
loss_val = loss.detach().cpu().data
self.loss_history.append(loss_val)
self.smooth_value.add_value(loss_val)
if self.best_loss == 0. or self.smooth_value.smooth < self.best_loss:
self.best_loss = self.smooth_value.smooth
self.best_lr = self.opt.param_groups[0]["lr"]

def on_batch_end(self, *args):
if self.find:
lr = next(self.lr_gen, None)
if lr is None or self.stop is True or self.loss_history[-1] > 4 * self.best_loss:
self.stop = True
return
self.opt.param_groups[0]["lr"] = lr
# self.loader.load_pytorch(self.trainer.model, "tmp")

def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
if cur_epoch == 1:
self.opt.param_groups[0]["lr"] = self.best_lr
self.find = False
# reset model
ModelLoader().load_pytorch(self.trainer.model, "tmp")
print("Model reset. \nFind best lr={}".format(self.best_lr))


class TensorboardCallback(Callback):
"""
接受以下一个或多个字符串作为参数:
- "model"
- "loss"
- "metric"
"""

def __init__(self, *options):
super(TensorboardCallback, self).__init__()
args = {"model", "loss", "metric"}
for opt in options:
if opt not in args:
raise ValueError("Unrecognized argument {}. Expect one of {}".format(opt, args))
self.options = options
self._summary_writer = None
self.graph_added = False

def on_train_begin(self):
save_dir = self.trainer.save_path
if save_dir is None:
path = os.path.join("./", 'tensorboard_logs_{}'.format(self.trainer.start_time))
else:
path = os.path.join(save_dir, 'tensorboard_logs_{}'.format(self.trainer.start_time))
self._summary_writer = SummaryWriter(path)

def on_batch_begin(self, batch_x, batch_y, indices):
if "model" in self.options and self.graph_added is False:
# tesorboardX 这里有大bug,暂时没法画模型图
# from fastNLP.core.utils import _build_args
# inputs = _build_args(self.trainer.model, **batch_x)
# args = tuple([value for value in inputs.values()])
# args = args[0] if len(args) == 1 else args
# self._summary_writer.add_graph(self.trainer.model, torch.zeros(32, 2))
self.graph_added = True

def on_backward_begin(self, loss, model):
if "loss" in self.options:
self._summary_writer.add_scalar("loss", loss.item(), global_step=self.trainer.step)

if "model" in self.options:
for name, param in self.trainer.model.named_parameters():
if param.requires_grad:
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.trainer.step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.trainer.step)
self._summary_writer.add_scalar(name + "_grad_mean", param.grad.mean(),
global_step=self.trainer.step)

def on_valid_end(self, eval_result, metric_key, optimizer):
if "metric" in self.options:
for name, metric in eval_result.items():
for metric_key, metric_val in metric.items():
self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val,
global_step=self.trainer.step)

def on_train_end(self, model):
self._summary_writer.close()
del self._summary_writer

def on_exception(self, exception, model):
if hasattr(self, "_summary_writer"):
self._summary_writer.close()
del self._summary_writer


if __name__ == "__main__":
manager = CallbackManager(env={"n_epoch": 3}, callbacks=[DummyCallback(), DummyCallback()])
manager.before_train(10, 11, 12)
manager.on_train_begin(10, 11, 12)
# print(manager.after_epoch())
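
A minimal sketch of a user-defined callback written against the renamed `on_*` hooks above; the Trainer wiring is indicated only as a hypothetical comment.

```python
# Sketch only: a custom callback that records the loss at every backward
# pass and prints the epoch mean, using the new hook names.
from fastNLP.core.callback import Callback

class LossLogger(Callback):
    def __init__(self):
        super(LossLogger, self).__init__()
        self.losses = []

    def on_backward_begin(self, loss, model):
        # called after the forward pass, before loss.backward()
        self.losses.append(loss.item())

    def on_epoch_end(self, cur_epoch, n_epoch, optimizer):
        mean_loss = sum(self.losses) / max(len(self.losses), 1)
        print("epoch {}/{}: mean loss {:.4f}".format(cur_epoch, n_epoch, mean_loss))
        self.losses = []

# trainer = Trainer(..., callbacks=[LossLogger()])  # hypothetical wiring
```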

+50 -17 fastNLP/core/dataset.py

@@ -2,6 +2,7 @@ import _pickle as pickle

import numpy as np

from fastNLP.core.fieldarray import AutoPadder
from fastNLP.core.fieldarray import FieldArray
from fastNLP.core.instance import Instance
from fastNLP.core.utils import get_func_signature
@@ -88,12 +89,13 @@ class DataSet(object):
raise RuntimeError(f"Start index {idx.start} out of range 0-{len(self)-1}")
data_set = DataSet()
for field in self.field_arrays.values():
data_set.add_field(name=field.name,
fields=field.content[idx],
padding_val=field.padding_val,
is_input=field.is_input,
is_target=field.is_target)
data_set.add_field(name=field.name, fields=field.content[idx], padder=field.padder,
is_input=field.is_input, is_target=field.is_target)
return data_set
elif isinstance(idx, str):
if idx not in self:
raise KeyError("No such field called {} in DataSet.".format(idx))
return self.field_arrays[idx]
else:
raise KeyError("Unrecognized type {} for idx in __getitem__ method".format(type(idx)))

@@ -144,19 +146,23 @@ class DataSet(object):
if len(self.field_arrays) == 0:
# DataSet has no field yet
for name, field in ins.fields.items():
self.field_arrays[name] = FieldArray(name, [field])
field = field.tolist() if isinstance(field, np.ndarray) else field
self.field_arrays[name] = FieldArray(name, [field]) # 第一个样本,必须用list包装起来
else:
assert len(self.field_arrays) == len(ins.fields)
if len(self.field_arrays) != len(ins.fields):
raise ValueError(
"DataSet object has {} fields, but attempt to append an Instance object with {} fields."
.format(len(self.field_arrays), len(ins.fields)))
for name, field in ins.fields.items():
assert name in self.field_arrays
self.field_arrays[name].append(field)

def add_field(self, name, fields, padding_val=0, is_input=False, is_target=False):
def add_field(self, name, fields, padder=AutoPadder(pad_val=0), is_input=False, is_target=False):
"""Add a new field to the DataSet.
:param str name: the name of the field.
:param fields: a list of int, float, or other objects.
:param int padding_val: integer for padding.
:param int padder: PadBase对象,如何对该Field进行padding。大部分情况使用默认值即可
:param bool is_input: whether this field is model input.
:param bool is_target: whether this field is label or target.
"""
@@ -164,8 +170,8 @@ class DataSet(object):
if len(self) != len(fields):
raise RuntimeError(f"The field to append must have the same size as dataset. "
f"Dataset size {len(self)} != field size {len(fields)}")
self.field_arrays[name] = FieldArray(name, fields, padding_val=padding_val, is_target=is_target,
is_input=is_input)
self.field_arrays[name] = FieldArray(name, fields, is_target=is_target, is_input=is_input,
padder=padder)

def delete_field(self, name):
"""Delete a field based on the field name.
@@ -229,6 +235,25 @@ class DataSet(object):
else:
raise KeyError("{} is not a valid field name.".format(name))

def set_padder(self, field_name, padder):
"""
为field_name设置padder
:param field_name: str, 设置field的padding方式为padder
:param padder: PadderBase类型或None. 设置为None即删除padder。即对该field不进行padding操作.
:return:
"""
self.field_arrays[field_name].set_padder(padder)

def set_pad_val(self, field_name, pad_val):
"""
为某个

:param field_name: str,修改该field的pad_val
:param pad_val: int,该field的padder会以pad_val作为padding index
:return:
"""
self.field_arrays[field_name].set_pad_val(pad_val)

def get_input_name(self):
"""Get all field names with `is_input` as True.

@@ -254,7 +279,7 @@ class DataSet(object):
:return results: if new_field_name is not passed, returned values of the function over all instances.
"""
results = [func(ins) for ins in self._inner_iter()]
if len(list(filter(lambda x: x is not None, results))) == 0 and not (new_field_name is None): # all None
if not (new_field_name is None) and len(list(filter(lambda x: x is not None, results))) == 0: # all None
raise ValueError("{} always return None.".format(get_func_signature(func=func)))

extra_param = {}
@@ -270,12 +295,11 @@ class DataSet(object):
extra_param['is_input'] = old_field.is_input
if 'is_target' not in extra_param:
extra_param['is_target'] = old_field.is_target
self.add_field(name=new_field_name,
fields=results,
padding_val=old_field.padding_val,
**extra_param)
self.add_field(name=new_field_name, fields=results, is_input=extra_param["is_input"],
is_target=extra_param["is_target"])
else:
self.add_field(name=new_field_name, fields=results, **extra_param)
self.add_field(name=new_field_name, fields=results, is_input=extra_param.get("is_input", None),
is_target=extra_param.get("is_target", None))
else:
return results

@@ -314,8 +338,17 @@ class DataSet(object):
for field_name in self.field_arrays:
train_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
train_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
train_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
train_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
train_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
train_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim

dev_set.field_arrays[field_name].is_input = self.field_arrays[field_name].is_input
dev_set.field_arrays[field_name].is_target = self.field_arrays[field_name].is_target
dev_set.field_arrays[field_name].padder = self.field_arrays[field_name].padder
dev_set.field_arrays[field_name].dtype = self.field_arrays[field_name].dtype
dev_set.field_arrays[field_name].pytype = self.field_arrays[field_name].pytype
dev_set.field_arrays[field_name].content_dim = self.field_arrays[field_name].content_dim

return train_set, dev_set
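
A short sketch of the new per-field padding controls (`set_padder` / `set_pad_val`) added above; field names and values are illustrative.

```python
# Sketch only: control how individual fields are padded when batched.
from fastNLP.core.dataset import DataSet
from fastNLP.core.fieldarray import AutoPadder

ds = DataSet({"words": [[1, 2, 3], [4, 5]], "label": [0, 1]})
ds.set_input("words")
ds.set_target("label")

ds.set_pad_val("words", pad_val=1)              # keep AutoPadder, change its pad index
ds.set_padder("words", AutoPadder(pad_val=1))   # or install a padder explicitly
ds.set_padder("label", None)                    # None disables padding for a field
```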



+350 -86 fastNLP/core/fieldarray.py

@@ -1,51 +1,168 @@
import numpy as np


class PadderBase:
"""
所有padder都需要继承这个类,并覆盖__call__()方法。
用于对batch进行padding操作。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前deepcopy一份。
"""
def __init__(self, pad_val=0, **kwargs):
self.pad_val = pad_val

def set_pad_val(self, pad_val):
self.pad_val = pad_val

def __call__(self, contents, field_name, field_ele_dtype):
"""
传入的是List内容。假设有以下的DataSet。
from fastNLP import DataSet
from fastNLP import Instance
dataset = DataSet()
dataset.append(Instance(word='this is a demo', length=4,
chars=[['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['d', 'e', 'm', 'o']]))
dataset.append(Instance(word='another one', length=2,
chars=[['a', 'n', 'o', 't', 'h', 'e', 'r'], ['o', 'n', 'e']]))
# 如果batch_size=2, 下面只是用str的方式看起来更直观一点,但实际上可能word和chars在pad时都已经为index了。
word这个field的pad_func会接收到的内容会是
[
'this is a demo',
'another one'
]
length这个field的pad_func会接收到的内容会是
[4, 2]
chars这个field的pad_func会接收到的内容会是
[
[['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['d', 'e', 'm', 'o']],
[['a', 'n', 'o', 't', 'h', 'e', 'r'], ['o', 'n', 'e']]
]
即把每个instance中某个field的内容合成一个List传入
:param contents: List[element]。传入的element是inplace的,即直接修改element可能导致数据变化,建议inplace修改之前
deepcopy一份。
:param field_name: str, field的名称,帮助定位错误
:param field_ele_dtype: np.int64, np.float64, np.str. 该field的内层list元素的类型。辅助判断是否pad,大多数情况用不上
:return: List[padded_element]或np.array([padded_element])
"""
raise NotImplementedError


class AutoPadder(PadderBase):
"""
根据contents的数据自动判定是否需要做padding。
(1) 如果元素类型(元素类型是指field中最里层List的元素的数据类型, 可以通过FieldArray.dtype查看,比如['This', 'is', ...]的元素类
型为np.str, [[1,2], ...]的元素类型为np.int64)的数据不为(np.int64, np.float64)则不会进行padding
(2) 如果元素类型为(np.int64, np.float64),
(2.1) 如果该field的内容只有一个,比如为sequence_length, 则不进行padding
(2.2) 如果该field的内容为List, 那么会将Batch中的List pad为一样长。若该List下还有里层的List需要padding,请使用其它padder。
如果某个instance中field为[1, 2, 3],则可以pad; 若为[[1,2], [3,4, ...]]则不能进行pad
"""
def __init__(self, pad_val=0):
"""
:param pad_val: int, padding的位置使用该index
"""
super().__init__(pad_val=pad_val)

def _is_two_dimension(self, contents):
"""
判断contents是不是只有两个维度。[[1,2], [3]]是两个维度. [[[1,2], [3, 4, 5]], [[4,5]]]有三个维度
:param contents:
:return:
"""
value = contents[0]
if isinstance(value , (np.ndarray, list)):
value = value[0]
if isinstance(value, (np.ndarray, list)):
return False
return True
return False

def __call__(self, contents, field_name, field_ele_dtype):
if not is_iterable(contents[0]):
array = np.array([content for content in contents], dtype=field_ele_dtype)
elif field_ele_dtype in (np.int64, np.float64) and self._is_two_dimension(contents):
max_len = max([len(content) for content in contents])
array = np.full((len(contents), max_len), self.pad_val, dtype=field_ele_dtype)
for i, content in enumerate(contents):
array[i][:len(content)] = content
else: # should only be str
array = np.array([content for content in contents])
return array


class FieldArray(object):
"""``FieldArray`` is the collection of ``Instance``s of the same field.
It is the basic element of ``DataSet`` class.

:param str name: the name of the FieldArray
:param list content: a list of int, float, str or np.ndarray, or a list of list of one, or a np.ndarray.
:param int padding_val: the integer for padding. Default: 0.
:param bool is_target: If True, this FieldArray is used to compute loss.
:param bool is_input: If True, this FieldArray is used to the model input.

:param padder: PadderBase类型。大多数情况下都不需要设置该值,除非需要在多个维度上进行padding(比如英文中对character进行padding)
"""

def __init__(self, name, content, padding_val=0, is_target=None, is_input=None):
def __init__(self, name, content, is_target=None, is_input=None, padder=AutoPadder(pad_val=0)):
"""DataSet在初始化时会有两类方法对FieldArray操作:
1) 如果DataSet使用dict初始化,那么在add_field中会构造FieldArray:
1.1) 二维list DataSet({"x": [[1, 2], [3, 4]]})
1.2) 二维array DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 三维list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
1.4) list of array: DataSet({"x": [np.array([1,2,3]), np.array([1,2,3])]})
2) 如果DataSet使用list of Instance 初始化,那么在append中会先对第一个样本初始化FieldArray;
然后后面的样本使用FieldArray.append进行添加。
2.1) 一维list DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 一维array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 二维list DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 二维array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])

类型检查(dtype check)发生在当该field被设置为is_input或者is_target时。

"""
self.name = name
if isinstance(content, list):
content = content
# 如果DataSet使用dict初始化, content 可能是二维list/二维array/三维list
# 如果DataSet使用list of Instance 初始化, content可能是 [list]/[array]/[2D list]
for idx, item in enumerate(content):
# 这是使用list of Instance 初始化时第一个样本:FieldArray(name, [field])
# 将[np.array] 转化为 list of list
# 也可以支持[array, array, array]的情况
if isinstance(item, np.ndarray):
content[idx] = content[idx].tolist()
elif isinstance(content, np.ndarray):
content = content.tolist() # convert np.ndarray into 2-D list
else:
raise TypeError("content in FieldArray can only be list or numpy.ndarray, got {}.".format(type(content)))
self.content = content
self.padding_val = padding_val
if len(content) == 0:
raise RuntimeError("Cannot initialize FieldArray with empty list.")

self._is_target = None
self._is_input = None
self.content = content # 1维 或 2维 或 3维 list, 形状可能不对齐
self.content_dim = None # 表示content是多少维的list
self.set_padder(padder)

self.BASIC_TYPES = (int, float, str) # content中可接受的Python基本类型,这里没有np.array

self.BASIC_TYPES = (int, float, str, np.ndarray)
self.is_2d_list = False
self.pytype = None # int, float, str, or np.ndarray
self.dtype = None # np.int64, np.float64, np.str
self.pytype = None
self.dtype = None
self._is_input = None
self._is_target = None

if is_input is not None:
if is_input is not None or is_target is not None:
self.is_input = is_input
if is_target is not None:
self.is_target = is_target

def _set_dtype(self):
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)

@property
def is_input(self):
return self._is_input

@is_input.setter
def is_input(self, value):
"""
当 field_array.is_input = True / False 时被调用
"""
if value is True:
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)
self._set_dtype()
self._is_input = value

@property
@@ -54,46 +171,99 @@ class FieldArray(object):

@is_target.setter
def is_target(self, value):
"""
当 field_array.is_target = True / False 时被调用
"""
if value is True:
self.pytype = self._type_detection(self.content)
self.dtype = self._map_to_np_type(self.pytype)
self._set_dtype()
self._is_target = value

def _type_detection(self, content):
"""

:param content: a list of int, float, str or np.ndarray, or a list of list of one.
:return type: one of int, float, str, np.ndarray
"""当该field被设置为is_input或者is_target时被调用

"""
if isinstance(content, list) and len(content) > 0 and isinstance(content[0], list):
# content is a 2-D list
if not all(isinstance(_, list) for _ in content): # strict check 2-D list
raise TypeError("Please provide 2-D list.")
type_set = set([self._type_detection(x) for x in content])
if len(type_set) == 2 and int in type_set and float in type_set:
type_set = {float}
elif len(type_set) > 1:
raise TypeError("Cannot create FieldArray with more than one type. Provided {}".format(type_set))
self.is_2d_list = True
if len(content) == 0:
raise RuntimeError("Empty list in Field {}.".format(self.name))

type_set = set([type(item) for item in content])

if list in type_set:
if len(type_set) > 1:
# list 跟 非list 混在一起
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
# >1维list
inner_type_set = set()
for l in content:
[inner_type_set.add(type(obj)) for obj in l]
if list not in inner_type_set:
# 二维list
self.content_dim = 2
return self._basic_type_detection(inner_type_set)
else:
if len(inner_type_set) == 1:
# >2维list
inner_inner_type_set = set()
for _2d_list in content:
for _1d_list in _2d_list:
[inner_inner_type_set.add(type(obj)) for obj in _1d_list]
if list in inner_inner_type_set:
raise RuntimeError("FieldArray cannot handle 4-D or more-D list.")
# 3维list
self.content_dim = 3
return self._basic_type_detection(inner_inner_type_set)
else:
# list 跟 非list 混在一起
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, inner_type_set))
else:
# 一维list
for content_type in type_set:
if content_type not in self.BASIC_TYPES:
raise RuntimeError("Unexpected data type in Field '{}'. Expect one of {}. Got {}.".format(
self.name, self.BASIC_TYPES, content_type))
self.content_dim = 1
return self._basic_type_detection(type_set)

def _basic_type_detection(self, type_set):
"""
:param type_set: a set of Python types
:return: one of self.BASIC_TYPES
"""
if len(type_set) == 1:
return type_set.pop()

elif isinstance(content, list):
# content is a 1-D list
if len(content) == 0:
# the old error is not informative enough.
raise RuntimeError("Cannot create FieldArray with an empty list. Or one element in the list is empty.")
type_set = set([type(item) for item in content])

if len(type_set) == 1 and tuple(type_set)[0] in self.BASIC_TYPES:
return type_set.pop()
elif len(type_set) == 2 and float in type_set and int in type_set:
elif len(type_set) == 2:
# 有多个basic type; 可能需要up-cast
if float in type_set and int in type_set:
# up-cast int to float
return float
else:
raise TypeError("Cannot create FieldArray with type {}".format(*type_set))
# str 跟 int 或者 float 混在一起
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))
else:
raise TypeError("Cannot create FieldArray with type {}".format(type(content)))
# str, int, float混在一起
raise RuntimeError("Mixed data types in Field {}: {}".format(self.name, type_set))

def _1d_list_check(self, val):
"""如果不是1D list就报错
"""
type_set = set((type(obj) for obj in val))
if any(obj not in self.BASIC_TYPES for obj in type_set):
raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
self._basic_type_detection(type_set)
# otherwise: _basic_type_detection will raise error
return True

def _2d_list_check(self, val):
"""如果不是2D list 就报错
"""
type_set = set(type(obj) for obj in val)
if list(type_set) != [list]:
raise ValueError("Mixed data types in Field {}: {}".format(self.name, type_set))
inner_type_set = set()
for l in val:
for obj in l:
inner_type_set.add(type(obj))
self._basic_type_detection(inner_type_set)
return True

@staticmethod
def _map_to_np_type(basic_type):
@@ -108,38 +278,39 @@ class FieldArray(object):

:param val: int, float, str, or a list of one.
"""
if self.is_target is True or self.is_input is True:
# only check type when used as target or input
if isinstance(val, list):
pass
elif isinstance(val, tuple): # 确保最外层是list
val = list(val)
elif isinstance(val, np.ndarray):
val = val.tolist()
elif any((isinstance(val, t) for t in self.BASIC_TYPES)):
pass
else:
raise RuntimeError(
"Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))

val_type = type(val)
if val_type == list: # shape check
if self.is_2d_list is False:
raise RuntimeError("Cannot append a list into a 1-D FieldArray. Please provide an element.")
if self.is_input is True or self.is_target is True:
if type(val) == list:
if len(val) == 0:
raise RuntimeError("Cannot append an empty list.")
val_list_type = set([type(_) for _ in val]) # type check
if len(val_list_type) == 2 and int in val_list_type and float in val_list_type:
# up-cast int to float
val_type = float
elif len(val_list_type) == 1:
val_type = val_list_type.pop()
raise ValueError("Cannot append an empty list.")
if self.content_dim == 2 and self._1d_list_check(val):
# 1维list检查
pass
elif self.content_dim == 3 and self._2d_list_check(val):
# 2维list检查
pass
else:
raise TypeError("Cannot append a list of {}".format(val_list_type))
raise RuntimeError(
"Dimension not matched: expect dim={}, got {}.".format(self.content_dim - 1, val))
elif type(val) in self.BASIC_TYPES and self.content_dim == 1:
# scalar检查
if type(val) == float and self.pytype == int:
self.pytype = float
self.dtype = self._map_to_np_type(self.pytype)
else:
if self.is_2d_list is True:
raise RuntimeError("Cannot append a non-list into a 2-D list. Please provide a list.")

if val_type == float and self.pytype == int:
# up-cast
self.pytype = float
self.dtype = self._map_to_np_type(self.pytype)
elif val_type == int and self.pytype == float:
pass
elif val_type == self.pytype:
pass
else:
raise TypeError("Cannot append type {} into type {}".format(val_type, self.pytype))

raise RuntimeError(
"Unexpected data type {}. Should be list, np.array, or {}".format(type(val), self.BASIC_TYPES))
self.content.append(val)

def __getitem__(self, indices):
@@ -149,28 +320,44 @@ class FieldArray(object):
assert isinstance(idx, int)
self.content[idx] = val

def get(self, indices):
def get(self, indices, pad=True):
"""Fetch instances based on indices.

:param indices: an int, or a list of int.
:param pad: bool, 是否对返回的结果进行padding。
:return:
"""
if isinstance(indices, int):
return self.content[indices]
if self.is_input is False and self.is_target is False:
raise RuntimeError("Please specify either is_input or is_target is True for {}".format(self.name))
batch_size = len(indices)

if not is_iterable(self.content[0]):
array = np.array([self.content[i] for i in indices], dtype=self.dtype)
elif self.dtype in (np.int64, np.float64):
max_len = max([len(self.content[i]) for i in indices])
array = np.full((batch_size, max_len), self.padding_val, dtype=self.dtype)
for i, idx in enumerate(indices):
array[i][:len(self.content[idx])] = self.content[idx]
else: # should only be str
array = np.array([self.content[i] for i in indices])
return array

contents = [self.content[i] for i in indices]
if self.padder is None or pad is False:
return np.array(contents)
else:
return self.padder(contents, field_name=self.name, field_ele_dtype=self.dtype)

def set_padder(self, padder):
"""
Set the padding method.

:param padder: an instance of PadderBase, or None. Passing None removes the padder.
:return:
"""
if padder is not None:
assert isinstance(padder, PadderBase), "padder must be of type PadderBase."
self.padder = padder

def set_pad_val(self, pad_val):
"""
Change the pad_val of the padder.
:param pad_val: int.
:return:
"""
if self.padder is not None:
self.padder.set_pad_val(pad_val)


def __len__(self):
"""Returns the size of FieldArray.
@@ -186,3 +373,80 @@ def is_iterable(content):
except TypeError:
return False
return True


class EngChar2DPadder(PadderBase):
"""
Performs character-level 2D padding for English. The corresponding field content should look like
[['T', 'h', 'i', 's'], ['a'], ['d', 'e', 'm', 'o']] (written as str here for readability; in practice they should be character indices).
The padded batch has shape (batch_size, max_sentence_length, max_word_length), where max_sentence_length is the length
of the longest sentence and max_word_length is the length of the longest word.

"""
def __init__(self, pad_val=0, pad_length=0):
"""
:param pad_val: int, the index used at padded positions
:param pad_length: int, if 0, the maximum word length within a batch is used as the padding length; if greater than 0,
every word is padded or truncated to that length.
"""
super().__init__(pad_val=pad_val)

self.pad_length = pad_length

def _exactly_three_dims(self, contents, field_name):
"""
Check that the given contents has exactly 3 dimensions and raise an error otherwise. Conceptually, the first dimension is the batch, the second is words, and the third is characters.
:param contents:
:param field_name: str
:return:
"""
if not isinstance(contents, list):
raise TypeError("contents should be a list, not {}.".format(type(contents)))
value = contents[0]
try:
value = value[0]
except:
raise ValueError("Field:{} only has one dimension.".format(field_name))
try:
value = value[0]
except:
raise ValueError("Field:{} only has two dimensions.".format(field_name))

if is_iterable(value):
raise ValueError("Field:{} has more than 3 dimension.".format(field_name))

def __call__(self, contents, field_name, field_ele_dtype):
"""
Expects input of the form
[
[[0, 2], [2, 3, 4], ..],
[[9, 8, 2, 4], [1, 2,], ...],
....
]

:param contents:
:param field_name:
:param field_ele_dtype
:return:
"""
if field_ele_dtype not in (np.int64, np.float64):
raise TypeError('dtype of Field:{} should be np.int64 or np.float64 to do 2D padding, get {}.'.format(
field_name, field_ele_dtype
))
self._exactly_three_dims(contents, field_name)
if self.pad_length < 1:
max_char_length = max(max(len(char_lst) for char_lst in word_lst) for word_lst in contents)
else:
max_char_length = self.pad_length
max_sent_length = max(len(word_lst) for word_lst in contents)
batch_size = len(contents)
dtype = type(contents[0][0][0])

padded_array = np.full((batch_size, max_sent_length, max_char_length), fill_value=self.pad_val,
dtype=dtype)
for b_idx, word_lst in enumerate(contents):
for c_idx, char_lst in enumerate(word_lst):
chars = char_lst[:max_char_length]
padded_array[b_idx, c_idx, :len(chars)] = chars

return padded_array
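
For orientation, a minimal usage sketch of the new padder hooks (not part of this diff; the field name "chars" and the index values are made up, and the DataSet/Instance API is assumed as in fastNLP.core):

from fastNLP.core.dataset import DataSet
from fastNLP.core.fieldarray import EngChar2DPadder
from fastNLP.core.instance import Instance

# two sentences, each word stored as a list of (hypothetical) character indices
ds = DataSet()
ds.append(Instance(chars=[[1, 2, 3, 4], [5], [6, 7, 8]]))
ds.append(Instance(chars=[[9, 10], [11, 12, 13]]))
ds.set_input("chars")

# attach the character-level 2-D padder and adjust its pad value
ds.field_arrays["chars"].set_padder(EngChar2DPadder(pad_val=0))
ds.field_arrays["chars"].set_pad_val(0)

# get() now delegates padding to the padder:
# the result has shape (batch_size, max_sentence_length, max_word_length)
batch = ds.field_arrays["chars"].get([0, 1])
print(batch.shape)  # (2, 3, 4)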

+ 5
- 1
fastNLP/core/instance.py View File

@@ -11,6 +11,10 @@ class Instance(object):
"""

def __init__(self, **fields):
"""

:param fields: each value may be a 1-D or 2-D list or np.array
"""
self.fields = fields

def add_field(self, field_name, field):
@@ -32,5 +36,5 @@ class Instance(object):
def __repr__(self):
s = '\''
return "{" + ",\n".join(
"\'" + field_name + "\': " + str(self.fields[field_name]) +\
"\'" + field_name + "\': " + str(self.fields[field_name]) + \
f" type={(str(type(self.fields[field_name]))).split(s)[1]}" for field_name in self.fields) + "}"

+ 45
- 23
fastNLP/core/predictor.py View File

@@ -1,7 +1,11 @@
from collections import defaultdict

import torch

from fastNLP.core.batch import Batch
from fastNLP.core.sampler import SequentialSampler
from fastNLP.core import Batch
from fastNLP.core import DataSet
from fastNLP.core import SequentialSampler
from fastNLP.core.utils import _build_args


class Predictor(object):
@@ -13,37 +17,55 @@ class Predictor(object):
Currently, Predictor does not support GPU.
"""

def __init__(self):
def __init__(self, network):
if not isinstance(network, torch.nn.Module):
raise ValueError(
"Only fastNLP.models.BaseModel or torch.nn,Module is allowed, not {}".format(type(network)))
self.network = network
self.batch_size = 1
self.batch_output = []

def predict(self, network, data):
def predict(self, data, seq_len_field_name=None):
"""Perform inference using the trained model.

:param network: a PyTorch model (cpu)
:param data: a DataSet object.
:param str seq_len_field_name: field name indicating sequence lengths
:return: list of batch outputs
"""
# turn on the testing mode; clean up the history
self.mode(network, test=True)
batch_output = []
if not isinstance(data, DataSet):
raise ValueError("Only Dataset class is allowed, not {}.".format(type(data)))
if seq_len_field_name is not None and seq_len_field_name not in data.field_arrays:
raise ValueError("Field name {} not found in DataSet {}.".format(seq_len_field_name, data))

data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False)
self.network.eval()
batch_output = defaultdict(list)
data_iterator = Batch(data, batch_size=self.batch_size, sampler=SequentialSampler(), as_numpy=False,
prefetch=False)

for batch_x, _ in data_iterator:
with torch.no_grad():
prediction = self.data_forward(network, batch_x)
batch_output.append(prediction)
if hasattr(self.network, "predict"):
predict_func = self.network.predict
else:
predict_func = self.network.forward

return batch_output
with torch.no_grad():
for batch_x, _ in data_iterator:
refined_batch_x = _build_args(predict_func, **batch_x)
prediction = predict_func(**refined_batch_x)

def mode(self, network, test=True):
if test:
network.eval()
else:
network.train()
if seq_len_field_name is not None:
seq_lens = batch_x[seq_len_field_name].tolist()

for key, value in prediction.items():
value = value.cpu().numpy()
if len(value.shape) == 1 or (len(value.shape) == 2 and value.shape[1] == 1):
batch_output[key].extend(value.tolist())
else:
if seq_len_field_name is not None:
tmp_batch = []
for idx, seq_len in enumerate(seq_lens):
tmp_batch.append(value[idx, :seq_len])
batch_output[key].extend(tmp_batch)
else:
batch_output[key].append(value)

def data_forward(self, network, x):
"""Forward through network."""
y = network(**x)
return y
return batch_output
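
A hedged usage sketch for the reworked Predictor (not part of this diff): `model` stands for any trained torch.nn.Module whose forward/predict accepts the DataSet's input fields, `test_data` for a prepared DataSet, and "seq_len" for whatever field actually holds sequence lengths.

from fastNLP.core.predictor import Predictor

predictor = Predictor(model)                 # the network is now passed to the constructor
outputs = predictor.predict(test_data, seq_len_field_name="seq_len")
# outputs is a defaultdict(list): one key per entry of the model's output dict
for key, values in outputs.items():
    print(key, len(values))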

+ 62
- 65
fastNLP/core/trainer.py View File

@@ -5,7 +5,6 @@ from datetime import timedelta

import numpy as np
import torch
from tensorboardX import SummaryWriter
from torch import nn

try:
@@ -14,7 +13,7 @@ except:
from fastNLP.core.utils import pseudo_tqdm as tqdm

from fastNLP.core.batch import Batch
from fastNLP.core.callback import CallbackManager
from fastNLP.core.callback import CallbackManager, CallbackException
from fastNLP.core.dataset import DataSet
from fastNLP.core.losses import _prepare_losser
from fastNLP.core.metrics import _prepare_metrics
@@ -34,8 +33,8 @@ from fastNLP.core.utils import get_func_signature
class Trainer(object):
def __init__(self, train_data, model, loss=None, metrics=None, n_epochs=3, batch_size=32, print_every=50,
validate_every=-1, dev_data=None, save_path=None, optimizer=Adam(lr=0.01, weight_decay=0),
check_code_level=0, metric_key=None, sampler=RandomSampler(), use_tqdm=True, use_cuda=False,
callbacks=None):
check_code_level=0, metric_key=None, sampler=RandomSampler(), prefetch=False, use_tqdm=True,
use_cuda=False, callbacks=None):
"""
:param DataSet train_data: the training data
:param torch.nn.modules.module model: a PyTorch model
@@ -46,20 +45,23 @@ class Trainer(object):
:param int print_every: step interval to print next training information. Default: -1(no print).
:param int validate_every: step interval to do next validation. Default: -1(validate every epoch).
:param DataSet dev_data: the validation data
:param bool use_cuda: whether to use CUDA in training.
:param str save_path: file path to save models
:param Optimizer optimizer: an optimizer object
:param int check_code_level: level of FastNLP code checker. -1: don't check, 0: ignore. 1: warning. 2: strict.\\
`ignore` will not check unused fields; `warning` will warn if some fields are not used; `strict` means
it will raise an error if some fields are not used.
it will raise an error if some fields are not used. The check works by running the code on a very small batch
(two samples by default); in principle this does not modify any parameters, it only verifies that the code can run.
However, if (1) the model hard-codes batch_size to a fixed value, or (2) the model accumulates a count of forward
passes, a few extra forward passes may occur. In those cases it is recommended to set check_code_level to -1.
:param str metric_key: a single indicator used to decide the best model based on metric results. It must be one
of the keys returned by the FIRST metric in `metrics`. If the overall result gets better if the indicator gets
smaller, add "-" in front of the string. For example::

metric_key="-PPL" # language model gets better as perplexity gets smaller
:param BaseSampler sampler: method used to generate batch data.
:param prefetch: bool, whether to use an extra process to produce batch data.
:param bool use_tqdm: whether to use tqdm to show train progress.

:param callbacks: List[Callback]. Callbacks used to adjust the training process; e.g. early stopping and
negative sampling can be implemented through the callback mechanism.
"""
super(Trainer, self).__init__()

@@ -114,7 +116,11 @@ class Trainer(object):
self.print_every = int(print_every)
self.validate_every = int(validate_every) if validate_every!=0 else -1
self.best_metric_indicator = None
self.best_dev_epoch = None
self.best_dev_step = None
self.best_dev_perf = None
self.sampler = sampler
self.prefetch = prefetch
self.callback_manager = CallbackManager(env={"trainer": self}, callbacks=callbacks)

if isinstance(optimizer, torch.optim.Optimizer):
@@ -175,32 +181,26 @@ class Trainer(object):

"""
results = {}
if self.n_epochs <= 0:
print(f"training epoch is {self.n_epochs}, nothing was done.")
results['seconds'] = 0.
return results
try:
if torch.cuda.is_available() and self.use_cuda:
self.model = self.model.cuda()
self._model_device = self.model.parameters().__next__().device

self._mode(self.model, is_test=False)

self.start_time = str(datetime.now().strftime('%Y-%m-%d %H-%M-%S'))
self.start_time = str(datetime.now().strftime('%Y-%m-%d-%H-%M-%S'))
start_time = time.time()
print("training epochs started " + self.start_time, flush=True)
if self.save_path is None:
class psudoSW:
def __getattr__(self, item):
def pass_func(*args, **kwargs):
pass

return pass_func

self._summary_writer = psudoSW()
else:
path = os.path.join(self.save_path, 'tensorboard_logs_{}'.format(self.start_time))
self._summary_writer = SummaryWriter(path)

self.callback_manager.before_train()
self._train()
self.callback_manager.after_train(self.model)
try:
self.callback_manager.on_train_begin()
self._train()
self.callback_manager.on_train_end(self.model)
except (CallbackException, KeyboardInterrupt) as e:
self.callback_manager.on_exception(e, self.model)

if self.dev_data is not None:
print("\nIn Epoch:{}/Step:{}, got best dev performance:".format(self.best_dev_epoch, self.best_dev_step) +
@@ -216,8 +216,7 @@ class Trainer(object):
else:
print("Fail to reload best model.")
finally:
self._summary_writer.close()
del self._summary_writer
pass
results['seconds'] = round(time.time() - start_time, 2)

return results
@@ -229,42 +228,36 @@ class Trainer(object):
inner_tqdm = tqdm
self.step = 0
start = time.time()
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False)
total_steps = data_iterator.num_batches * self.n_epochs
total_steps = (len(self.train_data) // self.batch_size + int(
len(self.train_data) % self.batch_size != 0)) * self.n_epochs
with inner_tqdm(total=total_steps, postfix='loss:{0:<6.5f}', leave=False, dynamic_ncols=True) as pbar:
avg_loss = 0
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler, as_numpy=False,
prefetch=self.prefetch)
for epoch in range(1, self.n_epochs+1):
pbar.set_description_str(desc="Epoch {}/{}".format(epoch, self.n_epochs))
# early stopping
self.callback_manager.before_epoch(epoch, self.n_epochs)
self.callback_manager.on_epoch_begin(epoch, self.n_epochs)
for batch_x, batch_y in data_iterator:
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
indices = data_iterator.get_batch_indices()
# negative sampling; replace unknown; re-weight batch_y
self.callback_manager.before_batch(batch_x, batch_y, indices)
_move_dict_value_to_device(batch_x, batch_y, device=self._model_device)
self.callback_manager.on_batch_begin(batch_x, batch_y, indices)
prediction = self._data_forward(self.model, batch_x)

# edit prediction
self.callback_manager.before_loss(batch_y, prediction)
self.callback_manager.on_loss_begin(batch_y, prediction)
loss = self._compute_loss(prediction, batch_y)
avg_loss += loss.item()

# Is loss NaN or inf? requires_grad = False
self.callback_manager.before_backward(loss, self.model)
self.callback_manager.on_backward_begin(loss, self.model)
self._grad_backward(loss)
# gradient clipping
self.callback_manager.after_backward(self.model)
self.callback_manager.on_backward_end(self.model)

self._update()
# lr scheduler; lr_finder; one_cycle
self.callback_manager.after_step(self.optimizer)

self._summary_writer.add_scalar("loss", loss.item(), global_step=self.step)
for name, param in self.model.named_parameters():
if param.requires_grad:
self._summary_writer.add_scalar(name + "_mean", param.mean(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_std", param.std(), global_step=self.step)
# self._summary_writer.add_scalar(name + "_grad_sum", param.sum(), global_step=self.step)
self.callback_manager.on_step_end(self.optimizer)

if (self.step+1) % self.print_every == 0:
if self.use_tqdm:
print_output = "loss:{0:<6.5f}".format(avg_loss / self.print_every)
@@ -277,11 +270,10 @@ class Trainer(object):
pbar.set_postfix_str(print_output)
avg_loss = 0
self.step += 1
# do nothing
self.callback_manager.after_batch()
self.callback_manager.on_batch_end()

if ((self.validate_every > 0 and self.step % self.validate_every == 0) or
(self.validate_every < 0 and self.step % len(data_iterator)) == 0) \
(self.validate_every < 0 and self.step % len(data_iterator) == 0)) \
and self.dev_data is not None:
eval_res = self._do_validation(epoch=epoch, step=self.step)
eval_str = "Evaluation at Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step,
@@ -289,35 +281,29 @@ class Trainer(object):
self.tester._format_eval_results(eval_res)
pbar.write(eval_str)

# if self.validate_every < 0 and self.dev_data:
# eval_res = self._do_validation(epoch=epoch, step=self.step)
# eval_str = "Epoch {}/{}. Step:{}/{}. ".format(epoch, self.n_epochs, self.step, total_steps) + \
# self.tester._format_eval_results(eval_res)
# pbar.write(eval_str)
if epoch != self.n_epochs:
data_iterator = Batch(self.train_data, batch_size=self.batch_size, sampler=self.sampler,
as_numpy=False)
# ================= mini-batch end ==================== #

# lr decay; early stopping
self.callback_manager.after_epoch(epoch, self.n_epochs, self.optimizer)
self.callback_manager.on_epoch_end(epoch, self.n_epochs, self.optimizer)
# =============== epochs end =================== #
pbar.close()
# ============ tqdm end ============== #

def _do_validation(self, epoch, step):
self.callback_manager.on_valid_begin()
res = self.tester.test()
for name, metric in res.items():
for metric_key, metric_val in metric.items():
self._summary_writer.add_scalar("valid_{}_{}".format(name, metric_key), metric_val,
global_step=self.step)

if self._better_eval_result(res):
if self.save_path is not None:
self._save_model(self.model,
"best_" + "_".join([self.model.__class__.__name__, self.metric_key, self.start_time]))
else:
self._best_model_states = {name:param.cpu().clone() for name, param in self.model.named_parameters()}
self._best_model_states = {name: param.cpu().clone() for name, param in self.model.named_parameters()}
self.best_dev_perf = res
self.best_dev_epoch = epoch
self.best_dev_step = step
# get validation results; adjust optimizer
self.callback_manager.after_valid(res, self.metric_key, self.optimizer)
self.callback_manager.on_valid_end(res, self.metric_key, self.optimizer)
return res

def _mode(self, model, is_test=False):
@@ -365,12 +351,23 @@ class Trainer(object):
return self.losser(predict, truth)

def _save_model(self, model, model_name, only_param=False):
""" 存储不含有显卡信息的state_dict或model
:param model:
:param model_name:
:param only_param:
:return:
"""
if self.save_path is not None:
model_name = os.path.join(self.save_path, model_name)
model_path = os.path.join(self.save_path, model_name)
if only_param:
torch.save(model.state_dict(), model_name)
state_dict = model.state_dict()
for key in state_dict:
state_dict[key] = state_dict[key].cpu()
torch.save(state_dict, model_path)
else:
torch.save(model, model_name)
model.cpu()
torch.save(model, model_path)
model.cuda()

def _load_model(self, model, model_name, only_param=False):
# returns a bool indicating whether the model was successfully reloaded
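
For reference, a sketch of wiring up the new Trainer arguments (not part of this diff; `model`, `train_data`, `dev_data` and the pred/target field names are placeholders):

from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import Adam
from fastNLP.core.trainer import Trainer

trainer = Trainer(train_data=train_data, model=model,
                  loss=CrossEntropyLoss(pred="output", target="label"),
                  metrics=AccuracyMetric(pred="predict", target="label"),
                  dev_data=dev_data,
                  optimizer=Adam(lr=0.01, weight_decay=0),
                  n_epochs=10, batch_size=32,
                  prefetch=True,         # new: produce batches in an extra process
                  callbacks=None,        # new: list of Callback objects (early stopping, LR schedules, ...)
                  check_code_level=-1)   # recommended when the model hard-codes batch_size or counts forward passes
trainer.train()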


+ 3
- 2
fastNLP/core/utils.py View File

@@ -186,11 +186,12 @@ def _check_function_or_method(func):
raise TypeError(f"{type(func)} is not a method or function.")


def _move_dict_value_to_device(*args, device: torch.device):
def _move_dict_value_to_device(*args, device: torch.device, non_blocking=False):
"""

Move data to the model's device; each element in *args should be a dict. This is an in-place change.
:param device: torch.device
:param non_blocking: bool, whether to move the data asynchronously; requires the tensors to be allocated with pin_memory()
:param args:
:return:
"""
@@ -201,7 +202,7 @@ def _move_dict_value_to_device(*args, device: torch.device):
if isinstance(arg, dict):
for key, value in arg.items():
if isinstance(value, torch.Tensor):
arg[key] = value.to(device)
arg[key] = value.to(device, non_blocking=non_blocking)
else:
raise TypeError("Only support `dict` type right now.")



+ 6
- 0
fastNLP/io/base_loader.py View File

@@ -11,18 +11,24 @@ class BaseLoader(object):

@staticmethod
def load_lines(data_path):
"""按行读取,舍弃每行两侧空白字符,返回list of str
"""
with open(data_path, "r", encoding="utf=8") as f:
text = f.readlines()
return [line.strip() for line in text]

@classmethod
def load(cls, data_path):
"""先按行读取,去除一行两侧空白,再提取每行的字符。返回list of list of str
"""
with open(data_path, "r", encoding="utf-8") as f:
text = f.readlines()
return [[word for word in sent.strip()] for sent in text]

@classmethod
def load_with_cache(cls, data_path, cache_path):
"""缓存版的load
"""
if os.path.isfile(cache_path) and os.path.getmtime(data_path) < os.path.getmtime(cache_path):
with open(cache_path, 'rb') as f:
return pickle.load(f)


+ 1
- 8
fastNLP/io/config_io.py View File

@@ -11,7 +11,6 @@ class ConfigLoader(BaseLoader):
:param str data_path: path to the config

"""

def __init__(self, data_path=None):
super(ConfigLoader, self).__init__()
if data_path is not None:
@@ -30,7 +29,7 @@ class ConfigLoader(BaseLoader):
Example::

test_args = ConfigSection()
ConfigLoader("config.cfg", "").load_config("./data_for_tests/config", {"POS_test": test_args})
ConfigLoader("config.cfg").load_config("./data_for_tests/config", {"POS_test": test_args})

"""
assert isinstance(sections, dict)
@@ -202,8 +201,6 @@ class ConfigSaver(object):
continue

if '=' not in line:
# log = create_logger(__name__, './config_saver.log')
# log.error("can NOT load config file [%s]" % self.file_path)
raise RuntimeError("can NOT load config file {}".__format__(self.file_path))

key = line.split('=', maxsplit=1)[0].strip()
@@ -263,10 +260,6 @@ class ConfigSaver(object):
change_file = True
break
if section_file[k] != section[k]:
# logger = create_logger(__name__, "./config_loader.log")
# logger.warning("section [%s] in config file [%s] has been changed" % (
# section_name, self.file_path
# ))
change_file = True
break
if not change_file:


+ 349
- 49
fastNLP/io/dataset_loader.py View File

@@ -90,6 +90,7 @@ class NativeDataSetLoader(DataSetLoader):
"""A simple example of DataSetLoader

"""

def __init__(self):
super(NativeDataSetLoader, self).__init__()

@@ -107,6 +108,7 @@ class RawDataSetLoader(DataSetLoader):
"""A simple example of raw data reader

"""

def __init__(self):
super(RawDataSetLoader, self).__init__()

@@ -124,8 +126,8 @@ class RawDataSetLoader(DataSetLoader):
DataLoaderRegister.set_reader(RawDataSetLoader, 'read_rawdata')


class POSDataSetLoader(DataSetLoader):
"""Dataset Loader for a POS Tag dataset.
class DummyPOSReader(DataSetLoader):
"""A simple reader for a dummy POS tagging dataset.

In these datasets, each line is divided by "\t". The first column is the word and the second
column is the label. Sentences are separated by an empty line.
@@ -142,8 +144,9 @@ class POSDataSetLoader(DataSetLoader):

In this example, there are two sentences "Tom and Jerry ." and "Hello world !". Each word has its own label.
"""

def __init__(self):
super(POSDataSetLoader, self).__init__()
super(DummyPOSReader, self).__init__()

def load(self, data_path):
"""
@@ -191,16 +194,14 @@ class POSDataSetLoader(DataSetLoader):
return convert_seq2seq_dataset(data)


DataLoaderRegister.set_reader(POSDataSetLoader, 'read_pos')
DataLoaderRegister.set_reader(DummyPOSReader, 'read_pos')


class TokenizeDataSetLoader(DataSetLoader):
class DummyCWSReader(DataSetLoader):
"""Load pku dataset for Chinese word segmentation.
"""
Data set loader for tokenization data sets
"""

def __init__(self):
super(TokenizeDataSetLoader, self).__init__()
super(DummyCWSReader, self).__init__()

def load(self, data_path, max_seq_len=32):
"""Load pku dataset for Chinese word segmentation.
@@ -253,11 +254,11 @@ class TokenizeDataSetLoader(DataSetLoader):
return convert_seq2seq_dataset(data)


class ClassDataSetLoader(DataSetLoader):
class DummyClassificationReader(DataSetLoader):
"""Loader for a dummy classification data set"""

def __init__(self):
super(ClassDataSetLoader, self).__init__()
super(DummyClassificationReader, self).__init__()

def load(self, data_path):
assert os.path.exists(data_path)
@@ -268,7 +269,7 @@ class ClassDataSetLoader(DataSetLoader):

@staticmethod
def parse(lines):
"""
"""每行第一个token是标签,其余是字/词;由空格分隔。

:param lines: lines from dataset
:return: list(list(list())): the three levels of lists are words, sentences, and the dataset
@@ -324,16 +325,11 @@ class ConllLoader(DataSetLoader):
pass


class LMDataSetLoader(DataSetLoader):
"""Language Model Dataset Loader

This loader produces data for language model training in a supervised way.
That means it has X and Y.

class DummyLMReader(DataSetLoader):
"""A Dummy Language Model Dataset Reader
"""

def __init__(self):
super(LMDataSetLoader, self).__init__()
super(DummyLMReader, self).__init__()

def load(self, data_path):
if not os.path.exists(data_path):
@@ -361,19 +357,25 @@ class LMDataSetLoader(DataSetLoader):


class PeopleDailyCorpusLoader(DataSetLoader):
"""人民日报数据集
"""
People Daily Corpus: Chinese word segmentation, POS tag, NER
"""

def __init__(self):
super(PeopleDailyCorpusLoader, self).__init__()
self.pos = True
self.ner = True

def load(self, data_path):
def load(self, data_path, pos=True, ner=True):
"""

:param str data_path: path to the data file
:param bool pos: whether to include POS tags
:param bool ner: whether to include named-entity tags
:return: a DataSet object
"""
self.pos, self.ner = pos, ner
with open(data_path, "r", encoding="utf-8") as f:
sents = f.readlines()

pos_tag_examples = []
ner_examples = []
examples = []
for sent in sents:
if len(sent) <= 2:
continue
@@ -407,40 +409,44 @@ class PeopleDailyCorpusLoader(DataSetLoader):
sent_ner.append(ner_tag)
sent_pos_tag.append(pos)
sent_words.append(token)
pos_tag_examples.append([sent_words, sent_pos_tag])
ner_examples.append([sent_words, sent_ner])
# List[List[List[str], List[str]]]
# ner_examples not used
return self.convert(pos_tag_examples)
example = [sent_words]
if self.pos is True:
example.append(sent_pos_tag)
if self.ner is True:
example.append(sent_ner)
examples.append(example)
return self.convert(examples)

def convert(self, data):
data_set = DataSet()
for item in data:
sent_words, sent_pos_tag = item[0], item[1]
data_set.append(Instance(words=sent_words, tags=sent_pos_tag))
data_set.apply(lambda ins: len(ins), new_field_name="seq_len")
data_set.set_target("tags")
data_set.set_input("sent_words")
data_set.set_input("seq_len")
sent_words = item[0]
if self.pos is True and self.ner is True:
instance = Instance(words=sent_words, pos_tags=item[1], ner=item[2])
elif self.pos is True:
instance = Instance(words=sent_words, pos_tags=item[1])
elif self.ner is True:
instance = Instance(words=sent_words, ner=item[1])
else:
instance = Instance(words=sent_words)
data_set.append(instance)
data_set.apply(lambda ins: len(ins["words"]), new_field_name="seq_len")
return data_set
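
A short usage sketch for the new pos/ner switches (not part of this diff; "people_daily.txt" is a placeholder path):

from fastNLP.io.dataset_loader import PeopleDailyCorpusLoader

loader = PeopleDailyCorpusLoader()
ds = loader.load("people_daily.txt", pos=True, ner=False)
# each Instance now carries: words, pos_tags (ner omitted here), plus the derived seq_len field
print(ds[0])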


class Conll2003Loader(DataSetLoader):
"""Self-defined loader of conll2003 dataset
"""Loader for conll2003 dataset
More information about the dataset could be found at
https://sites.google.com/site/ermasoftware/getting-started/ne-tagging-conll2003-data
"""

def __init__(self):
super(Conll2003Loader, self).__init__()

def load(self, dataset_path):
with open(dataset_path, "r", encoding="utf-8") as f:
lines = f.readlines()

# Parse the dataset line by line
parsed_data = []
sentence = []
tokens = []
@@ -467,21 +473,20 @@ class Conll2003Loader(DataSetLoader):
lambda labels: labels[1], sample[1]))
label2_list = list(map(
lambda labels: labels[2], sample[1]))
dataset.append(Instance(token_list=sample[0],
label0_list=label0_list,
label1_list=label1_list,
label2_list=label2_list))
dataset.append(Instance(tokens=sample[0],
pos=label0_list,
chucks=label1_list,
ner=label2_list))

return dataset


class SNLIDataSetLoader(DataSetLoader):
class SNLIDataSetReader(DataSetLoader):
"""A data set loader for SNLI data set.

"""

def __init__(self):
super(SNLIDataSetLoader, self).__init__()
super(SNLIDataSetReader, self).__init__()

def load(self, path_list):
"""
@@ -540,3 +545,298 @@ class SNLIDataSetLoader(DataSetLoader):
data_set.set_input("premise", "hypothesis", "premise_len", "hypothesis_len")
data_set.set_target("truth")
return data_set


class ConllCWSReader(object):
def __init__(self):
pass

def load(self, path, cut_long_sent=False):
"""
The returned DataSet contains only the field raw_sentence, whose content is a str.
The input is assumed to be in CoNLL format: sentences are separated by blank lines and each line has 7 columns, e.g.
::

1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep

"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.strip().split())
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:
ds.append(Instance(raw_sentence=raw_sentence))
return ds

def get_char_lst(self, sample):
if len(sample) == 0:
return None
text = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
return text


class NaiveCWSReader(DataSetLoader):
"""
This reader assumes the word segmentation dataset has the following form, i.e. the content is already separated by spaces,
for example::

这是 fastNLP , 一个 非常 good 的 包 .

Or, each part may additionally carry a POS tag,
for example::

也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY

"""

def __init__(self, in_word_splitter=None):
super(NaiveCWSReader, self).__init__()
self.in_word_splitter = in_word_splitter

def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
"""
Accepted input formats (by default \t or space is used as the separator):
这是 fastNLP , 一个 非常 good 的 包 .
也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
If in_word_splitter is not None, the second format is assumed; each part such as "也/D" is split by the splitter and the first piece is kept, e.g. "也/D".split('/')[0]

:param filepath:
:param in_word_splitter:
:param cut_long_sent:
:return:
"""
if in_word_splitter == None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if len(line.replace(' ', '')) == 0: # empty lines are not accepted
continue

if not in_word_splitter is None:
words = []
for part in line.split():
word = part.split(in_word_splitter)[0]
words.append(word)
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)

return dataset


def cut_long_sentence(sent, max_sample_length=200):
"""
Split a sentence longer than max_sample_length into several pieces. Cuts only happen at spaces, so the resulting pieces may be longer or shorter than max_sample_length.

:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence


class ZhConllPOSReader(object):
"""读取中文Conll格式。返回“字级别”的标签,使用BMES记号扩展原来的词级别标签。

"""
def __init__(self):
pass

def load(self, path):
"""
The returned DataSet contains the following fields:
words: list of str,
tag: list of str, with BMES tags added; e.g. the original sequence ['VP', 'NN', 'NN', ...] becomes ["S-VP", "B-NN", "M-NN", ...]
The input is assumed to be in CoNLL format: sentences are separated by blank lines and each line has 7 columns, e.g.
::

1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep

"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
char_seq.extend(list(word))
if len(word) == 1:
pos_seq.append('S-{}'.format(tag))
elif len(word) > 1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word) - 2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
else:
raise ValueError("Zero length of word detected.")

ds.append(Instance(words=char_seq,
tag=pos_seq))

return ds

def get_one(self, sample):
if len(sample) == 0:
return None
text = []
pos_tags = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
pos_tags.append(t2)
return text, pos_tags


class ConllxDataLoader(object):
"""返回“词级别”的标签信息,包括词、词性、(句法)头依赖、(句法)边标签。跟``ZhConllPOSReader``完全不同。

"""
def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

data = [self.get_one(sample) for sample in datalist]
data_list = list(filter(lambda x: x is not None, data))

ds = DataSet()
for example in data_list:
ds.append(Instance(words=example[0],
pos_tags=example[1],
heads=example[2],
labels=example[3]))
return ds

def get_one(self, sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]


def add_seg_tag(data):
"""

:param data: list of ([word], [pos], [heads], [head_tags])
:return: list of ([word], [pos])
"""

_processed = []
for word_list, pos_list, _, _ in data:
new_sample = []
for word, pos in zip(word_list, pos_list):
if len(word) == 1:
new_sample.append((word, 'S-' + pos))
else:
new_sample.append((word[0], 'B-' + pos))
for c in word[1:-1]:
new_sample.append((c, 'M-' + pos))
new_sample.append((word[-1], 'E-' + pos))
_processed.append(list(map(list, zip(*new_sample))))
return _processed
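
To make the relocation of these readers from reproduction/ into fastNLP.io concrete, a minimal sketch (not part of this diff; "train.conllx" is a placeholder path to a CoNLL-X file):

from fastNLP.io.dataset_loader import ConllxDataLoader, ZhConllPOSReader

parse_ds = ConllxDataLoader().load("train.conllx")   # word-level: words, pos_tags, heads, labels
pos_ds = ZhConllPOSReader().load("train.conllx")     # char-level: words, tag (BMES-extended POS)
print(parse_ds[0])
print(pos_ds[0])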

+ 4
- 1
fastNLP/io/embed_loader.py View File

@@ -101,9 +101,12 @@ class EmbedLoader(BaseLoader):
"""
if vocab is None:
raise RuntimeError("You must provide a vocabulary.")
embedding_matrix = np.zeros(shape=(len(vocab), emb_dim))
embedding_matrix = np.zeros(shape=(len(vocab), emb_dim), dtype=np.float32)
hit_flags = np.zeros(shape=(len(vocab),), dtype=int)
with open(emb_file, "r", encoding="utf-8") as f:
startline = f.readline()
if len(startline.split()) > 2:
f.seek(0)
for line in f:
word, vector = EmbedLoader.parse_glove_line(line)
if word in vocab:


+ 362
- 0
fastNLP/models/bert.py View File

@@ -0,0 +1,362 @@
"""
bert.py is modified from huggingface/pytorch-pretrained-BERT, which is licensed under the Apache License 2.0.

"""
import copy
import json
import math
import os

import torch
from torch import nn

CONFIG_FILE = 'bert_config.json'
MODEL_WEIGHTS = 'pytorch_model.bin'


def gelu(x):
return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def swish(x):
return x * torch.sigmoid(x)


ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}


class BertLayerNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-12):
super(BertLayerNorm, self).__init__()
self.weight = nn.Parameter(torch.ones(hidden_size))
self.bias = nn.Parameter(torch.zeros(hidden_size))
self.variance_epsilon = eps

def forward(self, x):
u = x.mean(-1, keepdim=True)
s = (x - u).pow(2).mean(-1, keepdim=True)
x = (x - u) / torch.sqrt(s + self.variance_epsilon)
return self.weight * x + self.bias


class BertEmbeddings(nn.Module):
def __init__(self, vocab_size, hidden_size, max_position_embeddings, type_vocab_size, hidden_dropout_prob):
super(BertEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, hidden_size)
self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
self.token_type_embeddings = nn.Embedding(type_vocab_size, hidden_size)

# self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
# any TensorFlow checkpoint file
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None):
seq_length = input_ids.size(1)
position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

words_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)

embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings


class BertSelfAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob):
super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0:
raise ValueError(
"The hidden size (%d) is not a multiple of the number of attention "
"heads (%d)" % (hidden_size, num_attention_heads))
self.num_attention_heads = num_attention_heads
self.attention_head_size = int(hidden_size / num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size

self.query = nn.Linear(hidden_size, self.all_head_size)
self.key = nn.Linear(hidden_size, self.all_head_size)
self.value = nn.Linear(hidden_size, self.all_head_size)

self.dropout = nn.Dropout(attention_probs_dropout_prob)

def transpose_for_scores(self, x):
new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
x = x.view(*new_x_shape)
return x.permute(0, 2, 1, 3)

def forward(self, hidden_states, attention_mask):
mixed_query_layer = self.query(hidden_states)
mixed_key_layer = self.key(hidden_states)
mixed_value_layer = self.value(hidden_states)

query_layer = self.transpose_for_scores(mixed_query_layer)
key_layer = self.transpose_for_scores(mixed_key_layer)
value_layer = self.transpose_for_scores(mixed_value_layer)

# Take the dot product between "query" and "key" to get the raw attention scores.
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
# Apply the attention mask (precomputed for all layers in BertModel's forward() function)
attention_scores = attention_scores + attention_mask

# Normalize the attention scores to probabilities.
attention_probs = nn.Softmax(dim=-1)(attention_scores)

# This is actually dropping out entire tokens to attend to, which might
# seem a bit unusual, but is taken from the original Transformer paper.
attention_probs = self.dropout(attention_probs)

context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
return context_layer


class BertSelfOutput(nn.Module):
def __init__(self, hidden_size, hidden_dropout_prob):
super(BertSelfOutput, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertAttention(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob):
super(BertAttention, self).__init__()
self.self = BertSelfAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob)
self.output = BertSelfOutput(hidden_size, hidden_dropout_prob)

def forward(self, input_tensor, attention_mask):
self_output = self.self(input_tensor, attention_mask)
attention_output = self.output(self_output, input_tensor)
return attention_output


class BertIntermediate(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_act):
super(BertIntermediate, self).__init__()
self.dense = nn.Linear(hidden_size, intermediate_size)
self.intermediate_act_fn = ACT2FN[hidden_act] \
if isinstance(hidden_act, str) else hidden_act

def forward(self, hidden_states):
hidden_states = self.dense(hidden_states)
hidden_states = self.intermediate_act_fn(hidden_states)
return hidden_states


class BertOutput(nn.Module):
def __init__(self, hidden_size, intermediate_size, hidden_dropout_prob):
super(BertOutput, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
hidden_states = self.LayerNorm(hidden_states + input_tensor)
return hidden_states


class BertLayer(nn.Module):
def __init__(self, hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertLayer, self).__init__()
self.attention = BertAttention(hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob)
self.intermediate = BertIntermediate(hidden_size, intermediate_size, hidden_act)
self.output = BertOutput(hidden_size, intermediate_size, hidden_dropout_prob)

def forward(self, hidden_states, attention_mask):
attention_output = self.attention(hidden_states, attention_mask)
intermediate_output = self.intermediate(attention_output)
layer_output = self.output(intermediate_output, attention_output)
return layer_output


class BertEncoder(nn.Module):
def __init__(self, num_hidden_layers, hidden_size, num_attention_heads, attention_probs_dropout_prob,
hidden_dropout_prob,
intermediate_size, hidden_act):
super(BertEncoder, self).__init__()
layer = BertLayer(hidden_size, num_attention_heads, attention_probs_dropout_prob, hidden_dropout_prob,
intermediate_size, hidden_act)
self.layer = nn.ModuleList([copy.deepcopy(layer) for _ in range(num_hidden_layers)])

def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True):
all_encoder_layers = []
for layer_module in self.layer:
hidden_states = layer_module(hidden_states, attention_mask)
if output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
if not output_all_encoded_layers:
all_encoder_layers.append(hidden_states)
return all_encoder_layers


class BertPooler(nn.Module):
def __init__(self, hidden_size):
super(BertPooler, self).__init__()
self.dense = nn.Linear(hidden_size, hidden_size)
self.activation = nn.Tanh()

def forward(self, hidden_states):
# We "pool" the model by simply taking the hidden state corresponding
# to the first token.
first_token_tensor = hidden_states[:, 0]
pooled_output = self.dense(first_token_tensor)
pooled_output = self.activation(pooled_output)
return pooled_output


class BertModel(nn.Module):
"""Bidirectional Embedding Representations from Transformers.

If you want to use pre-trained weights, please download them from one of the following sources provided by
pytorch-pretrained-BERT::

'bert-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz",
'bert-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased.tar.gz",
'bert-base-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased.tar.gz",
'bert-large-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased.tar.gz",
'bert-base-multilingual-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased.tar.gz",
'bert-base-multilingual-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased.tar.gz",
'bert-base-chinese': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese.tar.gz",


Construct a BERT model with pre-trained weights::

model = BertModel.from_pretrained("path/to/weights/directory")

"""

def __init__(self, vocab_size,
hidden_size=768,
num_hidden_layers=12,
num_attention_heads=12,
intermediate_size=3072,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02, **kwargs):
super(BertModel, self).__init__()
self.embeddings = BertEmbeddings(vocab_size, hidden_size, max_position_embeddings,
type_vocab_size, hidden_dropout_prob)
self.encoder = BertEncoder(num_hidden_layers, hidden_size, num_attention_heads,
attention_probs_dropout_prob, hidden_dropout_prob, intermediate_size,
hidden_act)
self.pooler = BertPooler(hidden_size)
self.initializer_range = initializer_range

self.apply(self.init_bert_weights)

def init_bert_weights(self, module):
if isinstance(module, (nn.Linear, nn.Embedding)):
# Slightly different from the TF version which uses truncated_normal for initialization
# cf https://github.com/pytorch/pytorch/pull/5617
module.weight.data.normal_(mean=0.0, std=self.initializer_range)
elif isinstance(module, BertLayerNorm):
module.bias.data.zero_()
module.weight.data.fill_(1.0)
if isinstance(module, nn.Linear) and module.bias is not None:
module.bias.data.zero_()

def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True):
if attention_mask is None:
attention_mask = torch.ones_like(input_ids)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)

# We create a 3D attention mask from a 2D tensor mask.
# Sizes are [batch_size, 1, 1, to_seq_length]
# So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
# this attention mask is more simple than the triangular masking of causal attention
# used in OpenAI GPT, we just need to prepare the broadcast dimension here.
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)

# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

embedding_output = self.embeddings(input_ids, token_type_ids)
encoded_layers = self.encoder(embedding_output,
extended_attention_mask,
output_all_encoded_layers=output_all_encoded_layers)
sequence_output = encoded_layers[-1]
pooled_output = self.pooler(sequence_output)
if not output_all_encoded_layers:
encoded_layers = encoded_layers[-1]
return encoded_layers, pooled_output

@classmethod
def from_pretrained(cls, pretrained_model_dir, state_dict=None, *inputs, **kwargs):
# Load config
config_file = os.path.join(pretrained_model_dir, CONFIG_FILE)
config = json.load(open(config_file, "r"))
# config = BertConfig.from_json_file(config_file)
# logger.info("Model config {}".format(config))
# Instantiate model.
model = cls(*inputs, **config, **kwargs)
if state_dict is None:
weights_path = os.path.join(pretrained_model_dir, MODEL_WEIGHTS)
state_dict = torch.load(weights_path)

old_keys = []
new_keys = []
for key in state_dict.keys():
new_key = None
if 'gamma' in key:
new_key = key.replace('gamma', 'weight')
if 'beta' in key:
new_key = key.replace('beta', 'bias')
if new_key:
old_keys.append(key)
new_keys.append(new_key)
for old_key, new_key in zip(old_keys, new_keys):
state_dict[new_key] = state_dict.pop(old_key)

missing_keys = []
unexpected_keys = []
error_msgs = []
# copy state_dict so _load_from_state_dict can modify it
metadata = getattr(state_dict, '_metadata', None)
state_dict = state_dict.copy()
if metadata is not None:
state_dict._metadata = metadata

def load(module, prefix=''):
local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
module._load_from_state_dict(
state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
for name, child in module._modules.items():
if child is not None:
load(child, prefix + name + '.')

load(model, prefix='' if hasattr(model, 'bert') else 'bert.')
if len(missing_keys) > 0:
print("Weights of {} not initialized from pretrained model: {}".format(
model.__class__.__name__, missing_keys))
if len(unexpected_keys) > 0:
print("Weights from pretrained model not used in {}: {}".format(
model.__class__.__name__, unexpected_keys))
return model
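
A quick sanity-check sketch for the new BertModel (not part of this diff; the sizes below are deliberately tiny and the pretrained path is a placeholder):

import torch
from fastNLP.models.bert import BertModel

model = BertModel(vocab_size=30522, hidden_size=256, num_hidden_layers=2,
                  num_attention_heads=4, intermediate_size=512)
input_ids = torch.randint(0, 30522, (2, 16))         # (batch, seq_len)
attention_mask = torch.ones_like(input_ids)
all_layers, pooled = model(input_ids, attention_mask=attention_mask,
                           output_all_encoded_layers=True)
print(len(all_layers), all_layers[-1].shape, pooled.shape)  # 2, (2, 16, 256), (2, 256)

# with pre-trained weights downloaded from one of the URLs above:
# model = BertModel.from_pretrained("path/to/bert-base-uncased")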

+ 73
- 57
fastNLP/models/biaffine_parser.py View File

@@ -1,17 +1,20 @@
import copy
from collections import defaultdict

import numpy as np
import torch
from collections import defaultdict
from torch import nn
from torch.nn import functional as F
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.dropout import TimestepDropout
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.utils import seq_mask

from fastNLP.core.losses import LossFunc
from fastNLP.core.metrics import MetricBase
from fastNLP.core.utils import seq_lens_to_masks
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.dropout import TimestepDropout
from fastNLP.modules.encoder.transformer import TransformerEncoder
from fastNLP.modules.encoder.variational_rnn import VarLSTM
from fastNLP.modules.utils import initial_parameter
from fastNLP.modules.utils import seq_mask


def mst(scores):
"""
@@ -197,53 +200,64 @@ class BiaffineParser(GraphParser):
pos_vocab_size,
pos_emb_dim,
num_label,
word_hid_dim=100,
pos_hid_dim=100,
rnn_layers=1,
rnn_hidden_size=200,
arc_mlp_size=100,
label_mlp_size=100,
dropout=0.3,
use_var_lstm=False,
encoder='lstm',
use_greedy_infer=False):

super(BiaffineParser, self).__init__()
rnn_out_size = 2 * rnn_hidden_size
word_hid_dim = pos_hid_dim = rnn_hidden_size
self.word_embedding = nn.Embedding(num_embeddings=word_vocab_size, embedding_dim=word_emb_dim)
self.pos_embedding = nn.Embedding(num_embeddings=pos_vocab_size, embedding_dim=pos_emb_dim)
self.word_fc = nn.Linear(word_emb_dim, word_hid_dim)
self.pos_fc = nn.Linear(pos_emb_dim, pos_hid_dim)
self.word_norm = nn.LayerNorm(word_hid_dim)
self.pos_norm = nn.LayerNorm(pos_hid_dim)
self.use_var_lstm = use_var_lstm
if use_var_lstm:
self.lstm = VarLSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
self.encoder_name = encoder
self.max_len = 512
if encoder == 'var-lstm':
self.encoder = VarLSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
input_dropout=dropout,
hidden_dropout=dropout,
bidirectional=True)
elif encoder == 'lstm':
self.encoder = nn.LSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)
elif encoder == 'transformer':
n_head = 16
d_k = d_v = int(rnn_out_size / n_head)
if (d_k * n_head) != rnn_out_size:
raise ValueError('unsupported rnn_out_size: {} for transformer'.format(rnn_out_size))
self.position_emb = nn.Embedding(num_embeddings=self.max_len,
embedding_dim=rnn_out_size,)
self.encoder = TransformerEncoder(num_layers=rnn_layers,
model_size=rnn_out_size,
inner_size=1024,
key_size=d_k,
value_size=d_v,
num_head=n_head,
dropout=dropout,)
else:
self.lstm = nn.LSTM(input_size=word_hid_dim + pos_hid_dim,
hidden_size=rnn_hidden_size,
num_layers=rnn_layers,
bias=True,
batch_first=True,
dropout=dropout,
bidirectional=True)

self.arc_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size),
nn.LayerNorm(arc_mlp_size),
raise ValueError('unsupported encoder type: {}'.format(encoder))

self.mlp = nn.Sequential(nn.Linear(rnn_out_size, arc_mlp_size * 2 + label_mlp_size * 2),
nn.ELU(),
TimestepDropout(p=dropout),)
self.arc_dep_mlp = copy.deepcopy(self.arc_head_mlp)
self.label_head_mlp = nn.Sequential(nn.Linear(rnn_out_size, label_mlp_size),
nn.LayerNorm(label_mlp_size),
nn.ELU(),
TimestepDropout(p=dropout),)
self.label_dep_mlp = copy.deepcopy(self.label_head_mlp)
self.arc_mlp_size = arc_mlp_size
self.label_mlp_size = label_mlp_size
self.arc_predictor = ArcBiaffine(arc_mlp_size, bias=True)
self.label_predictor = LabelBilinear(label_mlp_size, label_mlp_size, num_label, bias=True)
self.use_greedy_infer = use_greedy_infer
@@ -286,24 +300,27 @@ class BiaffineParser(GraphParser):
word, pos = self.word_fc(word), self.pos_fc(pos)
word, pos = self.word_norm(word), self.pos_norm(pos)
x = torch.cat([word, pos], dim=2) # -> [N,L,C]
del word, pos

# lstm, extract features
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.lstm(x) # -> [N,L,C]
feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
feat = feat[unsort_idx]
# encoder, extract features
if self.encoder_name.endswith('lstm'):
sort_lens, sort_idx = torch.sort(seq_lens, dim=0, descending=True)
x = x[sort_idx]
x = nn.utils.rnn.pack_padded_sequence(x, sort_lens, batch_first=True)
feat, _ = self.encoder(x) # -> [N,L,C]
feat, _ = nn.utils.rnn.pad_packed_sequence(feat, batch_first=True)
_, unsort_idx = torch.sort(sort_idx, dim=0, descending=False)
feat = feat[unsort_idx]
else:
seq_range = torch.arange(seq_len, dtype=torch.long, device=x.device)[None,:]
x = x + self.position_emb(seq_range)
feat = self.encoder(x, mask.float())

# for arc biaffine
# mlp, reduce dim
arc_dep = self.arc_dep_mlp(feat)
arc_head = self.arc_head_mlp(feat)
label_dep = self.label_dep_mlp(feat)
label_head = self.label_head_mlp(feat)
del feat
feat = self.mlp(feat)
arc_sz, label_sz = self.arc_mlp_size, self.label_mlp_size
arc_dep, arc_head = feat[:,:,:arc_sz], feat[:,:,arc_sz:2*arc_sz]
label_dep, label_head = feat[:,:,2*arc_sz:2*arc_sz+label_sz], feat[:,:,2*arc_sz+label_sz:]

# biaffine arc classifier
arc_pred = self.arc_predictor(arc_head, arc_dep) # [N, L, L]
@@ -349,7 +366,7 @@ class BiaffineParser(GraphParser):
batch_size, seq_len, _ = arc_pred.shape
flip_mask = (mask == 0)
_arc_pred = arc_pred.clone()
_arc_pred.masked_fill_(flip_mask.unsqueeze(1), -np.inf)
_arc_pred.masked_fill_(flip_mask.unsqueeze(1), -float('inf'))
arc_logits = F.log_softmax(_arc_pred, dim=2)
label_logits = F.log_softmax(label_pred, dim=2)
batch_index = torch.arange(batch_size, device=arc_logits.device, dtype=torch.long).unsqueeze(1)
@@ -357,12 +374,11 @@ class BiaffineParser(GraphParser):
arc_loss = arc_logits[batch_index, child_index, arc_true]
label_loss = label_logits[batch_index, child_index, label_true]

arc_loss = arc_loss[:, 1:]
label_loss = label_loss[:, 1:]

float_mask = mask[:, 1:].float()
arc_nll = -(arc_loss*float_mask).mean()
label_nll = -(label_loss*float_mask).mean()
byte_mask = flip_mask.byte()
arc_loss.masked_fill_(byte_mask, 0)
label_loss.masked_fill_(byte_mask, 0)
arc_nll = -arc_loss.mean()
label_nll = -label_loss.mean()
return arc_nll + label_nll

def predict(self, word_seq, pos_seq, seq_lens):


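For context, a hedged construction sketch showing the new encoder switch that replaces use_var_lstm (not part of this diff; the vocabulary sizes and label count are placeholders, and the remaining keyword defaults are assumed unchanged):

from fastNLP.models.biaffine_parser import BiaffineParser

parser = BiaffineParser(word_vocab_size=10000, word_emb_dim=100,
                        pos_vocab_size=50, pos_emb_dim=100,
                        num_label=40,
                        rnn_layers=3, rnn_hidden_size=256,
                        arc_mlp_size=500, label_mlp_size=100,
                        dropout=0.3,
                        encoder='var-lstm',   # 'lstm', 'var-lstm' or 'transformer'
                        use_greedy_infer=False)
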
+ 67
- 23
fastNLP/modules/aggregator/attention.py View File

@@ -4,6 +4,7 @@ import torch
import torch.nn.functional as F
from torch import nn

from fastNLP.modules.dropout import TimestepDropout
from fastNLP.modules.utils import mask_softmax


@@ -23,46 +24,89 @@ class Attention(torch.nn.Module):


class DotAtte(nn.Module):
def __init__(self, key_size, value_size):
def __init__(self, key_size, value_size, dropout=0.1):
super(DotAtte, self).__init__()
self.key_size = key_size
self.value_size = value_size
self.scale = math.sqrt(key_size)
self.drop = nn.Dropout(dropout)
self.softmax = nn.Softmax(dim=2)

def forward(self, Q, K, V, seq_mask=None):
def forward(self, Q, K, V, mask_out=None):
"""

:param Q: [batch, seq_len, key_size]
:param K: [batch, seq_len, key_size]
:param V: [batch, seq_len, value_size]
:param seq_mask: [batch, seq_len]
:param mask_out: [batch, seq_len]
"""
output = torch.matmul(Q, K.transpose(1, 2)) / self.scale
if seq_mask is not None:
output.masked_fill_(seq_mask.lt(1), -float('inf'))
output = nn.functional.softmax(output, dim=2)
if mask_out is not None:
output.masked_fill_(mask_out, -float('inf'))
output = self.softmax(output)
output = self.drop(output)
return torch.matmul(output, V)


class MultiHeadAtte(nn.Module):
def __init__(self, input_size, output_size, key_size, value_size, num_atte):
def __init__(self, input_size, key_size, value_size, num_head, dropout=0.1):
"""

:param input_size: int, the input dimension; also the output dimension.
:param key_size: int, the dimension of each head.
:param value_size: int, the dimension of the value in each head.
:param num_head: int, the number of heads.
:param dropout: float.
"""
super(MultiHeadAtte, self).__init__()
self.in_linear = nn.ModuleList()
for i in range(num_atte * 3):
out_feat = key_size if (i % 3) != 2 else value_size
self.in_linear.append(nn.Linear(input_size, out_feat))
self.attes = nn.ModuleList([DotAtte(key_size, value_size) for _ in range(num_atte)])
self.out_linear = nn.Linear(value_size * num_atte, output_size)

def forward(self, Q, K, V, seq_mask=None):
heads = []
for i in range(len(self.attes)):
j = i * 3
qi, ki, vi = self.in_linear[j](Q), self.in_linear[j+1](K), self.in_linear[j+2](V)
headi = self.attes[i](qi, ki, vi, seq_mask)
heads.append(headi)
output = torch.cat(heads, dim=2)
return self.out_linear(output)
self.input_size = input_size
self.key_size = key_size
self.value_size = value_size
self.num_head = num_head

in_size = key_size * num_head
self.q_in = nn.Linear(input_size, in_size)
self.k_in = nn.Linear(input_size, in_size)
self.v_in = nn.Linear(input_size, in_size)
self.attention = DotAtte(key_size=key_size, value_size=value_size)
self.out = nn.Linear(value_size * num_head, input_size)
self.drop = TimestepDropout(dropout)
self.reset_parameters()

def reset_parameters(self):
sqrt = math.sqrt
nn.init.normal_(self.q_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
nn.init.normal_(self.k_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.key_size)))
nn.init.normal_(self.v_in.weight, mean=0, std=sqrt(2.0 / (self.input_size + self.value_size)))
nn.init.xavier_normal_(self.out.weight)

def forward(self, Q, K, V, atte_mask_out=None):
"""

:param Q: [batch, seq_len, model_size]
:param K: [batch, seq_len, model_size]
:param V: [batch, seq_len, model_size]
:param atte_mask_out: [batch, 1, seq_len] or None
"""
batch, seq_len, _ = Q.size()
d_k, d_v, n_head = self.key_size, self.value_size, self.num_head
# input linear
q = self.q_in(Q).view(batch, seq_len, n_head, d_k)
k = self.k_in(K).view(batch, seq_len, n_head, d_k)
v = self.v_in(V).view(batch, seq_len, n_head, d_k)

# transpose q, k and v to do batch attention
q = q.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_k)
k = k.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_k)
v = v.permute(2, 0, 1, 3).contiguous().view(-1, seq_len, d_v)
if atte_mask_out is not None:
atte_mask_out = atte_mask_out.repeat(n_head, 1, 1)
atte = self.attention(q, k, v, atte_mask_out).view(n_head, batch, seq_len, d_v)

# concat all heads, do output linear
atte = atte.permute(1, 2, 0, 3).contiguous().view(batch, seq_len, -1)
output = self.drop(self.out(atte))
return output
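For the multi-head wrapper, the mask passed to forward is the boolean padding mask already expanded to [batch, 1, seq_len], which is exactly how TransformerEncoder below prepares it. A hedged sketch (key_size and value_size are kept equal, as the callers in this PR do):

import torch
from fastNLP.modules.aggregator.attention import MultiHeadAtte

atte = MultiHeadAtte(input_size=128, key_size=16, value_size=16, num_head=8, dropout=0.1)
x = torch.rand(2, 7, 128)                    # [batch, seq_len, model_size]
seq_mask = torch.ones(2, 7)                  # 1 for real tokens, 0 for padding
seq_mask[1, 5:] = 0                          # second sample only has 5 real tokens
atte_mask_out = (seq_mask < 1)[:, None, :]   # [batch, 1, seq_len], True on padding
out = atte(x, x, x, atte_mask_out=atte_mask_out)
print(out.size())                            # torch.Size([2, 7, 128])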


class Bi_Attention(nn.Module):


+ 39
- 12
fastNLP/modules/encoder/transformer.py View File

@@ -1,29 +1,56 @@
from torch import nn

from ..aggregator.attention import MultiHeadAtte
from ..other_modules import LayerNormalization
from ..dropout import TimestepDropout


class TransformerEncoder(nn.Module):
class SubLayer(nn.Module):
def __init__(self, input_size, output_size, key_size, value_size, num_atte):
def __init__(self, model_size, inner_size, key_size, value_size, num_head, dropout=0.1):
"""

:param model_size: int, dimension of the input; also the dimension of the output.
:param inner_size: int, hidden size of the FFN layer.
:param key_size: int, dimension of each head.
:param value_size: int, dimension of the value in each head.
:param num_head: int, number of heads.
:param dropout: float.
"""
super(TransformerEncoder.SubLayer, self).__init__()
self.atte = MultiHeadAtte(input_size, output_size, key_size, value_size, num_atte)
self.norm1 = LayerNormalization(output_size)
self.ffn = nn.Sequential(nn.Linear(output_size, output_size),
self.atte = MultiHeadAtte(model_size, key_size, value_size, num_head, dropout)
self.norm1 = nn.LayerNorm(model_size)
self.ffn = nn.Sequential(nn.Linear(model_size, inner_size),
nn.ReLU(),
nn.Linear(output_size, output_size))
self.norm2 = LayerNormalization(output_size)
nn.Linear(inner_size, model_size),
TimestepDropout(dropout),)
self.norm2 = nn.LayerNorm(model_size)

def forward(self, input, seq_mask=None, atte_mask_out=None):
"""

def forward(self, input, seq_mask):
attention = self.atte(input)
:param input: [batch, seq_len, model_size]
:param seq_mask: [batch, seq_len]
:return: [batch, seq_len, model_size]
"""
attention = self.atte(input, input, input, atte_mask_out)
norm_atte = self.norm1(attention + input)
attention *= seq_mask
output = self.ffn(norm_atte)
return self.norm2(output + norm_atte)
output = self.norm2(output + norm_atte)
output *= seq_mask
return output

def __init__(self, num_layers, **kargs):
super(TransformerEncoder, self).__init__()
self.layers = nn.Sequential(*[self.SubLayer(**kargs) for _ in range(num_layers)])
self.layers = nn.ModuleList([self.SubLayer(**kargs) for _ in range(num_layers)])

def forward(self, x, seq_mask=None):
return self.layers(x, seq_mask)
output = x
if seq_mask is None:
atte_mask_out = None
else:
atte_mask_out = (seq_mask < 1)[:,None,:]
seq_mask = seq_mask[:,:,None]
for layer in self.layers:
output = layer(output, seq_mask, atte_mask_out)
return output
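Putting the pieces together, a hedged end-to-end sketch of the rewritten encoder; the float-valued seq_mask mirrors how TransformerCWS later in this PR calls it:

import torch
from fastNLP.modules.encoder.transformer import TransformerEncoder

encoder = TransformerEncoder(num_layers=2, model_size=128, inner_size=256,
                             key_size=16, value_size=16, num_head=8, dropout=0.1)
x = torch.rand(2, 7, 128)            # [batch, seq_len, model_size]
seq_mask = torch.ones(2, 7)          # 1 for real tokens, 0 for padding
seq_mask[1, 5:] = 0
out = encoder(x, seq_mask=seq_mask)  # padded positions are zeroed in the output
print(out.size())                    # torch.Size([2, 7, 128])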

+ 8
- 9
reproduction/Biaffine_parser/cfg.cfg View File

@@ -1,8 +1,9 @@
[train]
n_epochs = 40
n_epochs = 20
batch_size = 32
use_cuda = true
validate_every = 500
use_tqdm=true
validate_every = 1000
use_golden_train=true

[test]
@@ -16,20 +17,18 @@ use_cuda = true

[model]
word_vocab_size = -1
word_emb_dim = 100
word_emb_dim = 300
pos_vocab_size = -1
pos_emb_dim = 100
word_hid_dim = 100
pos_hid_dim = 100
rnn_layers = 3
rnn_hidden_size = 400
rnn_hidden_size = 256
arc_mlp_size = 500
label_mlp_size = 100
num_label = -1
dropout = 0.33
use_var_lstm=true
dropout = 0.3
encoder="var-lstm"
use_greedy_infer=false

[optim]
lr = 3e-4
lr = 2e-3
;weight_decay = 3e-5
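For reference, these sections are consumed through ConfigSection/ConfigLoader in run.py, roughly as sketched below (based on the loader calls that appear elsewhere in this PR; not part of the patch):

from fastNLP.io.config_io import ConfigLoader, ConfigSection

train_args = ConfigSection()
model_args = ConfigSection()
optim_args = ConfigSection()
ConfigLoader().load_config('./cfg.cfg', {'train': train_args,
                                         'model': model_args,
                                         'optim': optim_args})
print(optim_args['lr'])  # 2e-3 with the values above
# The parsed sections are then unpacked into the trainer, e.g.
# Trainer(..., **train_args.data, optimizer=fastNLP.Adam(**optim_args.data))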

+ 1
- 1
reproduction/Biaffine_parser/main.py View File

@@ -5,7 +5,7 @@ sys.path.extend(['/home/yfshao/workdir/dev_fastnlp'])
import torch
import argparse

from reproduction.Biaffine_parser.util import ConllxDataLoader, add_seg_tag
from fastNLP.io.dataset_loader import ConllxDataLoader, add_seg_tag
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance



+ 58
- 71
reproduction/Biaffine_parser/run.py View File

@@ -4,25 +4,23 @@ import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

import fastNLP
import torch

from fastNLP.core.trainer import Trainer
from fastNLP.core.instance import Instance
from fastNLP.api.pipeline import Pipeline
from fastNLP.models.biaffine_parser import BiaffineParser, ParserMetric, ParserLoss
from fastNLP.core.vocabulary import Vocabulary
from fastNLP.core.dataset import DataSet
from fastNLP.core.tester import Tester
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.io.model_io import ModelLoader
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.io.model_io import ModelSaver
from reproduction.Biaffine_parser.util import ConllxDataLoader, MyDataloader
from fastNLP.io.dataset_loader import ConllxDataLoader
from fastNLP.api.processor import *
from fastNLP.io.embed_loader import EmbedLoader
from fastNLP.core.callback import Callback

BOS = '<BOS>'
EOS = '<EOS>'
UNK = '<UNK>'
PAD = '<PAD>'
NUM = '<NUM>'
ENG = '<ENG>'

@@ -33,11 +31,11 @@ if len(os.path.dirname(__file__)) != 0:
def convert(data):
dataset = DataSet()
for sample in data:
word_seq = [BOS] + sample[0]
pos_seq = [BOS] + sample[1]
heads = [0] + list(map(int, sample[2]))
head_tags = [BOS] + sample[3]
dataset.append(Instance(words=word_seq,
word_seq = [BOS] + sample['words']
pos_seq = [BOS] + sample['pos_tags']
heads = [0] + sample['heads']
head_tags = [BOS] + sample['labels']
dataset.append(Instance(raw_words=word_seq,
pos=pos_seq,
gold_heads=heads,
arc_true=heads,
@@ -50,24 +48,11 @@ def load(path):
return convert(data)


# datadir = "/mnt/c/Me/Dev/release-2.2-st-train-dev-data/ud-treebanks-v2.2/UD_English-EWT"
# datadir = "/home/yfshao/UD_English-EWT"
# train_data_name = "en_ewt-ud-train.conllu"
# dev_data_name = "en_ewt-ud-dev.conllu"
# emb_file_name = '/home/yfshao/glove.6B.100d.txt'
# loader = ConlluDataLoader()

# datadir = '/home/yfshao/workdir/parser-data/'
# train_data_name = "train_ctb5.txt"
# dev_data_name = "dev_ctb5.txt"
# test_data_name = "test_ctb5.txt"

datadir = "/home/yfshao/workdir/ctb7.0/"
datadir = "/remote-home/yfshao/workdir/ctb9.0/"
train_data_name = "train.conllx"
dev_data_name = "dev.conllx"
test_data_name = "test.conllx"
# emb_file_name = "/home/yfshao/workdir/parser-data/word_OOVthr_30_100v.txt"
emb_file_name = "/home/yfshao/workdir/word_vector/cc.zh.300.vec"
emb_file_name = "/remote-home/yfshao/workdir/word_vector/cc.zh.300.vec"

cfgfile = './cfg.cfg'
processed_datadir = './save'
@@ -113,27 +98,23 @@ def update_v(vocab, data, field):
data.apply(lambda x: vocab.add_word_lst(x[field]), new_field_name=None)


print('load raw data and preprocess')
# use pretrain embedding
word_v = Vocabulary()
word_v.unknown_label = UNK
pos_v = Vocabulary()
word_v = Vocabulary(unknown=UNK, padding=PAD)
pos_v = Vocabulary(unknown=None, padding=PAD)
tag_v = Vocabulary(unknown=None, padding=None)
train_data = load(os.path.join(datadir, train_data_name))
dev_data = load(os.path.join(datadir, dev_data_name))
test_data = load(os.path.join(datadir, test_data_name))
print(train_data[0])
num_p = Num2TagProcessor('words', 'words')
print('load raw data and preprocess')

num_p = Num2TagProcessor(tag=NUM, field_name='raw_words', new_added_field_name='words')
for ds in (train_data, dev_data, test_data):
num_p(ds)

update_v(word_v, train_data, 'words')
update_v(pos_v, train_data, 'pos')
update_v(tag_v, train_data, 'tags')

print('vocab build success {}, {}, {}'.format(len(word_v), len(pos_v), len(tag_v)))
# embed, _ = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
# print(embed.size())

# Model
model_args['word_vocab_size'] = len(word_v)
@@ -141,7 +122,7 @@ model_args['pos_vocab_size'] = len(pos_v)
model_args['num_label'] = len(tag_v)

model = BiaffineParser(**model_args.data)
model.reset_parameters()
print(model)

word_idxp = IndexerProcessor(word_v, 'words', 'word_seq')
pos_idxp = IndexerProcessor(pos_v, 'pos', 'pos_seq')
@@ -164,7 +145,6 @@ for ds in (train_data, dev_data, test_data):
if train_args['use_golden_train']:
train_data.set_input('gold_heads', flag=True)
train_args.data.pop('use_golden_train')
ignore_label = pos_v['punct']

print(test_data[0])
print('train len {}'.format(len(train_data)))
@@ -172,44 +152,62 @@ print('dev len {}'.format(len(dev_data)))
print('test len {}'.format(len(test_data)))



def train(path):
# test saving pipeline
save_pipe(path)
embed = EmbedLoader.fast_load_embedding(model_args['word_emb_dim'], emb_file_name, word_v)
embed = torch.tensor(embed, dtype=torch.float32)

# Trainer
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
**train_args.data,
optimizer=fastNLP.Adam(**optim_args.data),
save_path=path)

# model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=False)
# embed = EmbedLoader.fast_load_embedding(emb_dim=model_args['word_emb_dim'], emb_file=emb_file_name, vocab=word_v)
# embed = torch.tensor(embed, dtype=torch.float32)
# model.word_embedding = torch.nn.Embedding.from_pretrained(embed, freeze=True)
model.word_embedding.padding_idx = word_v.padding_idx
model.word_embedding.weight.data[word_v.padding_idx].fill_(0)
model.pos_embedding.padding_idx = pos_v.padding_idx
model.pos_embedding.weight.data[pos_v.padding_idx].fill_(0)

# try:
# ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
# print('model parameter loaded!')
# except Exception as _:
# print("No saved model. Continue.")
# pass
class MyCallback(Callback):
def on_step_end(self, optimizer):
step = self.trainer.step
# learning rate decay
if step > 0 and step % 1000 == 0:
for pg in optimizer.param_groups:
pg['lr'] *= 0.93
print('decay lr to {}'.format([pg['lr'] for pg in optimizer.param_groups]))

if step == 3000:
# start training embedding
print('start training embedding at {}'.format(step))
model = self.trainer.model
for m in model.modules():
if isinstance(m, torch.nn.Embedding):
m.weight.requires_grad = True

# Start training
trainer.train()
print("Training finished!")
# Trainer
trainer = Trainer(model=model, train_data=train_data, dev_data=dev_data,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
**train_args.data,
optimizer=fastNLP.Adam(**optim_args.data),
save_path=path,
callbacks=[MyCallback()])

# save pipeline
save_pipe(path)
print('pipe saved')
# Start training
try:
trainer.train()
print("Training finished!")
finally:
# save pipeline
save_pipe(path)
print('pipe saved')

def save_pipe(path):
pipe = Pipeline(processors=[num_p, word_idxp, pos_idxp, seq_p, set_input_p])
pipe.add_processor(ModelProcessor(model=model, batch_size=32))
pipe.add_processor(label_toword_p)
torch.save(pipe, os.path.join(path, 'pipe.pkl'))
os.makedirs(path, exist_ok=True)
torch.save({'pipeline': pipe,
'names':['num word_idx pos_idx seq set_input model tag_to_word'.split()],
}, os.path.join(path, 'pipe.pkl'))


def test(path):
@@ -234,16 +232,11 @@ def test(path):
print("Testing Test data")
tester.test(model, test_data)

def build_pipe(parser_pipe_path):
parser_pipe = torch.load(parser_pipe_path)




if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer', 'save'])
parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
parser.add_argument('--path', type=str, default='')
# parser.add_argument('--dst', type=str, default='')
args = parser.parse_args()
@@ -253,12 +246,6 @@ if __name__ == "__main__":
test(args.path)
elif args.mode == 'infer':
pass
# elif args.mode == 'save':
# print(f'save model from {args.path} to {args.dst}')
# save_model(args.path, args.dst)
# load_path = os.path.dirname(args.dst)
# print(f'save pipeline in {load_path}')
# build(load_path)
else:
print('no mode specified for model!')
parser.print_help()
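A hedged sketch of how the pipeline saved by save_pipe() could be reloaded for inference (field names follow convert() above; the path and sentence are placeholders, and the call pattern mirrors run_test() in reproduction/POS_tagging/train_pos_tag.py, which runs its pipeline in place):

import torch
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

d = torch.load('save/pipe.pkl')
pipe = d['pipeline']
ds = DataSet()
ds.append(Instance(raw_words=['<BOS>', '中国', '银行'], pos=['<BOS>', 'NR', 'NN']))
pipe(ds)   # processors add words, word_seq, pos_seq and the parser outputs in place
print(ds)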

+ 0
- 51
reproduction/Biaffine_parser/util.py View File

@@ -1,34 +1,3 @@
class ConllxDataLoader(object):
def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

data = [self.get_one(sample) for sample in datalist]
return list(filter(lambda x: x is not None, data))

def get_one(self, sample):
sample = list(map(list, zip(*sample)))
if len(sample) == 0:
return None
for w in sample[7]:
if w == '_':
print('Error Sample {}'.format(sample))
return None
# return word_seq, pos_seq, head_seq, head_tag_seq
return sample[1], sample[3], list(map(int, sample[6])), sample[7]


class MyDataloader:
def load(self, data_path):
with open(data_path, "r", encoding="utf-8") as f:
@@ -56,23 +25,3 @@ class MyDataloader:
return data


def add_seg_tag(data):
"""

:param data: list of ([word], [pos], [heads], [head_tags])
:return: list of ([word], [pos])
"""

_processed = []
for word_list, pos_list, _, _ in data:
new_sample = []
for word, pos in zip(word_list, pos_list):
if len(word) == 1:
new_sample.append((word, 'S-' + pos))
else:
new_sample.append((word[0], 'B-' + pos))
for c in word[1:-1]:
new_sample.append((c, 'M-' + pos))
new_sample.append((word[-1], 'E-' + pos))
_processed.append(list(map(list, zip(*new_sample))))
return _processed

reproduction/chinese_word_segment/cws_io/__init__.py → reproduction/Chinese_word_segmentation/__init__.py View File


reproduction/chinese_word_segment/cws.cfg → reproduction/Chinese_word_segmentation/cws.cfg View File


reproduction/chinese_word_segment/models/__init__.py → reproduction/Chinese_word_segmentation/cws_io/__init__.py View File


+ 3
- 0
reproduction/Chinese_word_segmentation/cws_io/cws_reader.py View File

@@ -0,0 +1,3 @@




reproduction/chinese_word_segment/process/__init__.py → reproduction/Chinese_word_segmentation/models/__init__.py View File


reproduction/chinese_word_segment/models/cws_model.py → reproduction/Chinese_word_segmentation/models/cws_model.py View File

@@ -1,11 +1,11 @@

from torch import nn
import torch
import torch.nn.functional as F
from torch import nn

from fastNLP.modules.decoder.MLP import MLP
from fastNLP.models.base_model import BaseModel
from reproduction.chinese_word_segment.utils import seq_lens_to_mask
from fastNLP.modules.decoder.MLP import MLP
from reproduction.Chinese_word_segmentation.utils import seq_lens_to_mask


class CWSBiLSTMEncoder(BaseModel):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,

+ 125
- 0
reproduction/Chinese_word_segmentation/models/cws_transformer.py View File

@@ -0,0 +1,125 @@



"""
Use a Transformer as the encoder for Chinese word segmentation.

"""

from torch import nn
import torch
from fastNLP.modules.encoder.transformer import TransformerEncoder
from fastNLP.modules.decoder.CRF import ConditionalRandomField,seq_len_to_byte_mask
from fastNLP.modules.decoder.CRF import allowed_transitions

class TransformerCWS(nn.Module):
def __init__(self, vocab_num, embed_dim=100, bigram_vocab_num=None, bigram_embed_dim=100, num_bigram_per_char=None,
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4):
super().__init__()

self.embedding = nn.Embedding(vocab_num, embed_dim)
input_size = embed_dim
if bigram_vocab_num:
self.bigram_embedding = nn.Embedding(bigram_vocab_num, bigram_embed_dim)
input_size += num_bigram_per_char*bigram_embed_dim

self.drop = nn.Dropout(embed_drop_p, inplace=True)

self.fc1 = nn.Linear(input_size, hidden_size)

value_size = hidden_size//num_heads
self.transformer = TransformerEncoder(num_layers, model_size=hidden_size, inner_size=hidden_size,
key_size=value_size,
value_size=value_size, num_head=num_heads)

self.fc2 = nn.Linear(hidden_size, tag_size)

allowed_trans = allowed_transitions({0:'b', 1:'m', 2:'e', 3:'s'}, encoding_type='bmes')
self.crf = ConditionalRandomField(num_tags=tag_size, include_start_end_trans=False,
allowed_transitions=allowed_trans)

def forward(self, chars, target, seq_lens, bigrams=None):
masks = seq_len_to_byte_mask(seq_lens).float()
x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
self.drop(x)
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)
losses = self.crf(feats, target, masks.float())

pred_dict = {}
pred_dict['seq_lens'] = seq_lens
pred_dict['loss'] = torch.mean(losses)

return pred_dict

def predict(self, chars, seq_lens, bigrams=None):
masks = seq_len_to_byte_mask(seq_lens).float()

x = self.embedding(chars)
batch_size = x.size(0)
length = x.size(1)
if hasattr(self, 'bigram_embedding'):
bigrams = self.bigram_embedding(bigrams) # batch_size x seq_lens x per_char x embed_size
x = torch.cat([x, bigrams.view(batch_size, length, -1)], dim=-1)
self.drop(x)
x = self.fc1(x)
feats = self.transformer(x, masks)
feats = self.fc2(feats)

probs = self.crf.viterbi_decode(feats, masks, get_score=False)

return {'pred': probs, 'seq_lens':seq_lens}


class NoamOpt(torch.optim.Optimizer):
"Optim wrapper that implements rate."

def __init__(self, model_size, factor, warmup, optimizer):
super().__init__([torch.nn.Parameter(torch.ones(1))], {})

self.optimizer = optimizer
self._step = 0
self.warmup = warmup
self.factor = factor
self.model_size = model_size
self._rate = 0

def step(self, **kwargs):
"Update parameters and rate"
self._step += 1
rate = self.rate()
for p in self.optimizer.param_groups:
p['lr'] = rate
self._rate = rate
self.optimizer.step()

def rate(self, step=None):
"Implement `lrate` above"
if step is None:
step = self._step
return self.factor * \
(self.model_size ** (-0.5) *
min(step ** (-0.5), step * self.warmup ** (-1.5)))


if __name__ == '__main__':


transformer = TransformerCWS(10, embed_dim=100, bigram_vocab_num=10, bigram_embed_dim=100, num_bigram_per_char=8,
hidden_size=200, embed_drop_p=0.3, num_layers=1, num_heads=8, tag_size=4)
chars = torch.randint(10, size=(4, 7)).long()
bigrams = torch.randint(10, size=(4, 56)).long()
seq_lens = torch.ones(4).long()*7
target = torch.randint(4, size=(4, 7))

print(transformer(chars, target, seq_lens, bigrams))

optimizer = torch.optim.Adam(transformer.parameters())

opt = NoamOpt(10 ,1, 400, optimizer)
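The schedule implemented by NoamOpt.rate() above warms up linearly for the first `warmup` steps and then decays proportionally to step ** -0.5, peaking at step == warmup. A quick standalone check with the values used in this example (model_size=10, factor=1, warmup=400):

for step in (1, 100, 400, 1000, 4000):
    rate = 1 * (10 ** (-0.5)) * min(step ** (-0.5), step * 400 ** (-1.5))
    print(step, rate)  # the largest rate is printed at step == 400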

+ 0
- 0
reproduction/Chinese_word_segmentation/process/__init__.py View File


reproduction/chinese_word_segment/process/cws_processor.py → reproduction/Chinese_word_segmentation/process/cws_processor.py View File

@@ -4,7 +4,7 @@ import re
from fastNLP.api.processor import Processor
from fastNLP.core.dataset import DataSet
from fastNLP.core.vocabulary import Vocabulary
from reproduction.chinese_word_segment.process.span_converter import SpanConverter
from reproduction.Chinese_word_segmentation.process.span_converter import SpanConverter

_SPECIAL_TAG_PATTERN = '<[a-zA-Z]+>'

@@ -226,109 +226,6 @@ class Pre2Post2BigramProcessor(BigramProcessor):
return bigrams


# A vocabulary needs to be built here, but there is a problem:
# (1) if we follow the Processor approach, what gets returned in this case is not a DataSet. So building the
# vocabulary is implemented in a different way, without reusing Processor.
# TODO how to unify the two steps of building the vocab and indexing?

class VocabIndexerProcessor(Processor):
"""
Build a Vocabulary from the DataSet and index the field with integers. The newly generated index field is stored in
new_added_filed_name; if new_added_field_name is not provided, the original field_name is overwritten.

"""
def __init__(self, field_name, new_added_filed_name=None, min_freq=1, max_size=None,
verbose=0, is_input=True):
"""

:param field_name: the field to build the vocabulary from, and the field to index.
:param new_added_filed_name: name of the generated index field; if not given, field_name is overwritten.
:param min_freq: minimum word frequency allowed in the created Vocabulary.
:param max_size: maximum number of words allowed in the created Vocabulary.
:param verbose: 0, print nothing; 1, print information.
:param bool is_input:
"""
super(VocabIndexerProcessor, self).__init__(field_name, new_added_filed_name)
self.min_freq = min_freq
self.max_size = max_size

self.verbose =verbose
self.is_input = is_input

def construct_vocab(self, *datasets):
"""
Build the vocabulary from the given DataSets.

:param datasets: data of type DataSet, used to build the vocabulary.
:return:
"""
self.vocab = Vocabulary(min_freq=self.min_freq, max_size=self.max_size)
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only Dataset class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: self.vocab.update(ins[self.field_name]))
self.vocab.build_vocab()
if self.verbose:
print("Vocabulary Constructed, has {} items.".format(len(self.vocab)))

def process(self, *datasets, only_index_dataset=None):
"""
If no Vocabulary has been built yet, build one from the DataSets passed in datasets; if a vocabulary already exists, reuse it.
Once the vocabulary is available, both datasets and only_index_dataset are indexed.

:param datasets: data of type DataSet.
:param only_index_dataset: DataSet, or list of DataSet. Its content is only indexed and is never used to build the vocabulary.
:return:
"""
if len(datasets)==0 and not hasattr(self,'vocab'):
raise RuntimeError("You have to construct vocabulary first. Or you have to pass datasets to construct it.")
if not hasattr(self, 'vocab'):
self.construct_vocab(*datasets)
else:
if self.verbose:
print("Using constructed vocabulary with {} items.".format(len(self.vocab)))
to_index_datasets = []
if len(datasets)!=0:
for dataset in datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)

if not (only_index_dataset is None):
if isinstance(only_index_dataset, list):
for dataset in only_index_dataset:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
to_index_datasets.append(dataset)
elif isinstance(only_index_dataset, DataSet):
to_index_datasets.append(only_index_dataset)
else:
raise TypeError('Only DataSet or list of DataSet is allowed, not {}.'.format(type(only_index_dataset)))

for dataset in to_index_datasets:
assert isinstance(dataset, DataSet), "Only DataSet class is allowed, not {}.".format(type(dataset))
dataset.apply(lambda ins: [self.vocab.to_index(token) for token in ins[self.field_name]],
new_field_name=self.new_added_field_name, is_input=self.is_input)
# return only one dataset, to stay consistent with the other processors at inference time
if len(to_index_datasets) == 1:
return to_index_datasets[0]

def set_vocab(self, vocab):
assert isinstance(vocab, Vocabulary), "Only fastNLP.core.Vocabulary is allowed, not {}.".format(type(vocab))
self.vocab = vocab

def delete_vocab(self):
del self.vocab

def get_vocab_size(self):
return len(self.vocab)

def set_verbose(self, verbose):
"""
Set the verbosity of the processor.

:param verbose: int, 0, print nothing; 1, print vocab information.
:return:
"""
self.verbose = verbose

class VocabProcessor(Processor):
def __init__(self, field_name, min_freq=1, max_size=None):


reproduction/chinese_word_segment/process/span_converter.py → reproduction/Chinese_word_segmentation/process/span_converter.py View File


reproduction/chinese_word_segment/utils.py → reproduction/Chinese_word_segmentation/utils.py View File


+ 1
- 1
reproduction/LSTM+self_attention_sentiment_analysis/main.py View File

@@ -4,7 +4,7 @@ from fastNLP.core.trainer import ClassificationTrainer
from fastNLP.core.utils import ClassPreprocess as Preprocess
from fastNLP.io.config_io import ConfigLoader
from fastNLP.io.config_io import ConfigSection
from fastNLP.io.dataset_loader import ClassDataSetLoader as Dataset_loader
from fastNLP.io.dataset_loader import DummyClassificationReader as Dataset_loader
from fastNLP.models.base_model import BaseModel
from fastNLP.modules.aggregator.self_attention import SelfAttention
from fastNLP.modules.decoder.MLP import MLP


reproduction/pos_tag_model/pos_processor.py → reproduction/POS_tagging/pos_processor.py View File


+ 29
- 0
reproduction/POS_tagging/pos_reader.py View File

@@ -0,0 +1,29 @@
from fastNLP.io.dataset_loader import ZhConllPOSReader


def cut_long_sentence(sent, max_sample_length=200):
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence
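# An illustrative call (not part of the patch): once the space-free length of the
# sentence exceeds max_sample_length, it is cut, and only at spaces, e.g.
#   cut_long_sentence('aa bb cc dd', max_sample_length=5)  # -> ['aa bb cc', 'dd']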


if __name__ == '__main__':
reader = ZhConllPOSReader()
d = reader.load('/home/hyan/train.conllx')
print(d)

reproduction/pos_tag_model/pos_tag.cfg → reproduction/POS_tagging/pos_tag.cfg View File

@@ -10,7 +10,7 @@ eval_sort_key = 'accuracy'

[model]
rnn_hidden_units = 300
word_emb_dim = 100
word_emb_dim = 300
dropout = 0.5
use_crf = true
print_every_step = 10

+ 163
- 0
reproduction/POS_tagging/train_pos_tag.py View File

@@ -0,0 +1,163 @@
import argparse
import os
import pickle
import sys

import torch

# in order to run fastNLP without installation
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import SeqLenProcessor, VocabIndexerProcessor, SetInputProcessor, IndexerProcessor
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.core.trainer import Trainer
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.io.dataset_loader import ConllxDataLoader
from fastNLP.api.processor import ModelProcessor, Index2WordProcessor


cfgfile = './pos_tag.cfg'
pickle_path = "save"


def load_tencent_embed(embed_path, word2id):
hit = 0
with open(embed_path, "rb") as f:
embed_dict = pickle.load(f)
embedding_tensor = torch.randn(len(word2id), 200)
for key in word2id:
if key in embed_dict:
embedding_tensor[word2id[key]] = torch.Tensor(embed_dict[key])
hit += 1
print("vocab_size={} hit={} hit/vocab_size={}".format(len(word2id), hit, hit / len(word2id)))
return embedding_tensor


def train(train_data_path, dev_data_path, checkpoint=None, save=None):
# load config
train_param = ConfigSection()
model_param = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
print("config loaded")

# Data Loader
print("loading training set...")
dataset = ConllxDataLoader().load(train_data_path, return_dataset=True)
print("loading dev set...")
dev_data = ConllxDataLoader().load(dev_data_path, return_dataset=True)
print(dataset)
print("================= dataset ready =====================")

dataset.rename_field("tag", "truth")
dev_data.rename_field("tag", "truth")

vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
tag_proc = VocabIndexerProcessor("truth", is_input=True)
seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len", is_input=True)
set_input_proc = SetInputProcessor("word_seq", "word_seq_origin_len")

vocab_proc(dataset)
tag_proc(dataset)
seq_len_proc(dataset)

# index dev set
word_vocab, tag_vocab = vocab_proc.vocab, tag_proc.vocab
dev_data.apply(lambda ins: [word_vocab.to_index(w) for w in ins["words"]], new_field_name="word_seq")
dev_data.apply(lambda ins: [tag_vocab.to_index(w) for w in ins["truth"]], new_field_name="truth")
dev_data.apply(lambda ins: len(ins["word_seq"]), new_field_name="word_seq_origin_len")

# set input & target
dataset.set_input("word_seq", "word_seq_origin_len", "truth")
dev_data.set_input("word_seq", "word_seq_origin_len", "truth")
dataset.set_target("truth", "word_seq_origin_len")
dev_data.set_target("truth", "word_seq_origin_len")

# dataset.set_is_target(tag_ids=True)
model_param["vocab_size"] = vocab_proc.get_vocab_size()
model_param["num_classes"] = tag_proc.get_vocab_size()
print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))

# define a model
if checkpoint is None:
# pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
pre_trained = None
model = AdvSeqLabel(model_param, id2words=None, emb=pre_trained)
print(model)
else:
model = torch.load(checkpoint)

# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dev_data, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=10, n_epochs=20, save_path=save)
trainer.train(load_best_model=True)

# save model & pipeline
model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

pp = Pipeline([vocab_proc, seq_len_proc, set_input_proc, model_proc, id2tag])
save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
torch.save(save_dict, os.path.join(save, "model_pp.pkl"))
print("pipeline saved")


def run_test(test_path):
test_data = ConllxDataLoader().load(test_path, return_dataset=True)

with open("model_pp_0117.pkl", "rb") as f:
save_dict = torch.load(f)
tag_vocab = save_dict["tag_vocab"]
pipeline = save_dict["pipeline"]
index_tag = IndexerProcessor(vocab=tag_vocab, field_name="tag", new_added_field_name="truth", is_input=False)
pipeline.pipeline = [index_tag] + pipeline.pipeline

pipeline(test_data)
test_data.set_target("truth")
prediction = test_data.field_arrays["predict"].content
truth = test_data.field_arrays["truth"].content
seq_len = test_data.field_arrays["word_seq_origin_len"].content

# padding by hand
max_length = max([len(seq) for seq in prediction])
for idx in range(len(prediction)):
prediction[idx] = list(prediction[idx]) + ([0] * (max_length - len(prediction[idx])))
truth[idx] = list(truth[idx]) + ([0] * (max_length - len(truth[idx])))
evaluator = SpanFPreRecMetric(tag_vocab=tag_vocab, pred="predict", target="truth",
seq_lens="word_seq_origin_len")
evaluator({"predict": torch.Tensor(prediction), "word_seq_origin_len": torch.Tensor(seq_len)},
{"truth": torch.Tensor(truth)})
test_result = evaluator.get_metric()
f1 = round(test_result['f'] * 100, 2)
pre = round(test_result['pre'] * 100, 2)
rec = round(test_result['rec'] * 100, 2)

return {"F1": f1, "precision": pre, "recall": rec}


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--train", type=str, help="training conll file", default="/home/zyfeng/data/sample.conllx")
parser.add_argument("--dev", type=str, help="dev conll file", default="/home/zyfeng/data/sample.conllx")
parser.add_argument("--test", type=str, help="test conll file", default=None)
parser.add_argument("--save", type=str, help="path to save", default=None)

parser.add_argument("-c", "--restart", action="store_true", help="whether to continue training")
parser.add_argument("-cp", "--checkpoint", type=str, help="checkpoint of the trained model")
args = parser.parse_args()

if args.test is not None:
print(run_test(args.test))
else:
if args.restart is True:
# resume training: python train_pos_tag.py -c -cp ./save/best_model.pkl
if args.checkpoint is None:
raise RuntimeError("Please provide the checkpoint. -cp ")
train(args.train, args.dev, args.checkpoint, save=args.save)
else:
# train from scratch: python train_pos_tag.py
train(args.train, args.dev, save=args.save)

reproduction/pos_tag_model/utils.py → reproduction/POS_tagging/utils.py View File


+ 0
- 0
reproduction/__init__.py View File


+ 0
- 197
reproduction/chinese_word_segment/cws_io/cws_reader.py View File

@@ -1,197 +0,0 @@


from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.io.dataset_loader import DataSetLoader


def cut_long_sentence(sent, max_sample_length=200):
"""
Split a sentence longer than max_sample_length into several pieces. Cuts only happen at spaces, so the resulting pieces may end up longer or shorter than max_sample_length.

:param sent: str.
:param max_sample_length: int.
:return: list of str.
"""
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence

class NaiveCWSReader(DataSetLoader):
"""
This reader assumes the segmentation dataset has the following form, i.e. the content is already separated by spaces:
这是 fastNLP , 一个 非常 good 的 包 .
or, each part is additionally followed by a POS tag:
也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
"""
def __init__(self, in_word_splitter=None):
super().__init__()

self.in_word_splitter = in_word_splitter

def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
"""
The supported formats are (by default \t or a space is used as the separator):
这是 fastNLP , 一个 非常 good 的 包 .
也/D 在/P 團員/Na 之中/Ng ,/COMMACATEGORY
If the splitter is not None, the second format is assumed; "也/D" is then split by the splitter and the first part is kept, e.g. "也/D".split('/')[0].
:param filepath:
:param in_word_splitter:
:return:
"""
if in_word_splitter == None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
for line in f:
line = line.strip()
if len(line.replace(' ', ''))==0: # empty lines are not accepted
continue

if not in_word_splitter is None:
words = []
for part in line.split():
word = part.split(in_word_splitter)[0]
words.append(word)
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)

return dataset


class POSCWSReader(DataSetLoader):
"""
Supports reading the following format, i.e. one word per line, with an empty line as the boundary between two sentences.
迈 N
向 N
充 N
...
泽 I-PER
民 I-PER

( N
一 N
九 N
...


:param filepath:
:return:
"""
def __init__(self, in_word_splitter=None):
super().__init__()
self.in_word_splitter = in_word_splitter

def load(self, filepath, in_word_splitter=None, cut_long_sent=False):
if in_word_splitter is None:
in_word_splitter = self.in_word_splitter
dataset = DataSet()
with open(filepath, 'r') as f:
words = []
for line in f:
line = line.strip()
if len(line) == 0: # new line
if len(words)==0: # empty lines are not accepted
continue
line = ' '.join(words)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for sent in sents:
instance = Instance(raw_sentence=sent)
dataset.append(instance)
words = []
else:
line = line.split()[0]
if in_word_splitter is None:
words.append(line)
else:
words.append(line.split(in_word_splitter)[0])
return dataset


class ConllCWSReader(object):
def __init__(self):
pass

def load(self, path, cut_long_sent=False):
"""
The returned DataSet only contains the field raw_sentence, whose content is a str.
The input is assumed to be in conll format, with two sentences separated by an empty line and 7 columns per line, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_char_lst(sample)
if res is None:
continue
line = ' '.join(res)
if cut_long_sent:
sents = cut_long_sentence(line)
else:
sents = [line]
for raw_sentence in sents:
ds.append(Instance(raw_sentence=raw_sentence))

return ds

def get_char_lst(self, sample):
if len(sample)==0:
return None
text = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
return text


+ 0
- 151
reproduction/chinese_word_segment/run.py View File

@@ -1,151 +0,0 @@
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.core.trainer import SeqLabelTrainer
from fastNLP.io.dataset_loader import BaseLoader, TokenizeDataSetLoader
from fastNLP.core.utils import load_pickle
from fastNLP.io.model_io import ModelLoader, ModelSaver
from fastNLP.core.tester import SeqLabelTester
from fastNLP.models.sequence_modeling import AdvSeqLabel
from fastNLP.core.predictor import SeqLabelInfer
from fastNLP.core.utils import save_pickle
from fastNLP.core.metrics import SeqLabelEvaluator

# not in the file's dir
if len(os.path.dirname(__file__)) != 0:
os.chdir(os.path.dirname(__file__))
datadir = "/home/zyfeng/data/"
cfgfile = './cws.cfg'

cws_data_path = os.path.join(datadir, "pku_training.utf8")
pickle_path = "save"
data_infer_path = os.path.join(datadir, "infer.utf8")


def infer():
# Config Loader
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)

# Define the same model
model = AdvSeqLabel(test_args)

try:
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print('model loaded!')
except Exception as e:
print('cannot load model!')
raise

# Data Loader
infer_data = SeqLabelDataSet(load_func=BaseLoader.load_lines)
infer_data.load(data_infer_path, vocabs={"word_vocab": word2index}, infer=True)
print('data loaded')

# Inference interface
infer = SeqLabelInfer(pickle_path)
results = infer.predict(model, infer_data)

print(results)
print("Inference finished!")


def train():
# Config Loader
train_args = ConfigSection()
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_args, "test": test_args})

print("loading data set...")
data = SeqLabelDataSet(load_func=TokenizeDataSetLoader.load)
data.load(cws_data_path)
data_train, data_dev = data.split(ratio=0.3)
train_args["vocab_size"] = len(data.word_vocab)
train_args["num_classes"] = len(data.label_vocab)
print("vocab size={}, num_classes={}".format(len(data.word_vocab), len(data.label_vocab)))

change_field_is_target(data_dev, "truth", True)
save_pickle(data_dev, "./save/", "data_dev.pkl")
save_pickle(data.word_vocab, "./save/", "word2id.pkl")
save_pickle(data.label_vocab, "./save/", "label2id.pkl")

# Trainer
trainer = SeqLabelTrainer(epochs=train_args["epochs"], batch_size=train_args["batch_size"],
validate=train_args["validate"],
use_cuda=train_args["use_cuda"], pickle_path=train_args["pickle_path"],
save_best_dev=True, print_every_step=10, model_name="trained_model.pkl",
evaluator=SeqLabelEvaluator())

# Model
model = AdvSeqLabel(train_args)
try:
ModelLoader.load_pytorch(model, "./save/saved_model.pkl")
print('model parameter loaded!')
except Exception as e:
print("No saved model. Continue.")
pass

# Start training
trainer.train(model, data_train, data_dev)
print("Training finished!")

# Saver
saver = ModelSaver("./save/trained_model.pkl")
saver.save_pytorch(model)
print("Model saved!")


def predict():
# Config Loader
test_args = ConfigSection()
ConfigLoader().load_config(cfgfile, {"POS_test": test_args})

# fetch dictionary size and number of labels from pickle files
word2index = load_pickle(pickle_path, "word2id.pkl")
test_args["vocab_size"] = len(word2index)
index2label = load_pickle(pickle_path, "label2id.pkl")
test_args["num_classes"] = len(index2label)

# load dev data
dev_data = load_pickle(pickle_path, "data_dev.pkl")

# Define the same model
model = AdvSeqLabel(test_args)

# Dump trained parameters into the model
ModelLoader.load_pytorch(model, "./save/trained_model.pkl")
print("model loaded!")

# Tester
test_args["evaluator"] = SeqLabelEvaluator()
tester = SeqLabelTester(**test_args.data)

# Start testing
tester.test(model, dev_data)


if __name__ == "__main__":

import argparse

parser = argparse.ArgumentParser(description='Run a chinese word segmentation model')
parser.add_argument('--mode', help='set the model\'s model', choices=['train', 'test', 'infer'])
args = parser.parse_args()
if args.mode == 'train':
train()
elif args.mode == 'test':
predict()
elif args.mode == 'infer':
infer()
else:
print('no mode specified for model!')
parser.print_help()

+ 0
- 153
reproduction/pos_tag_model/pos_reader.py View File

@@ -1,153 +0,0 @@

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

def cut_long_sentence(sent, max_sample_length=200):
sent_no_space = sent.replace(' ', '')
cutted_sentence = []
if len(sent_no_space) > max_sample_length:
parts = sent.strip().split()
new_line = ''
length = 0
for part in parts:
length += len(part)
new_line += part + ' '
if length > max_sample_length:
new_line = new_line[:-1]
cutted_sentence.append(new_line)
length = 0
new_line = ''
if new_line != '':
cutted_sentence.append(new_line[:-1])
else:
cutted_sentence.append(sent)
return cutted_sentence


class ConllPOSReader(object):
# The returned DataSet contains two fields: words (list of list, where the inner list holds characters) and tag (list of str, where each str is a BIO-style tag).
def __init__(self):
pass

def load(self, path):
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
if len(word)==1:
char_seq.append(word)
pos_seq.append('S-{}'.format(tag))
elif len(word)>1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word)-2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
char_seq.extend(list(word))
else:
raise ValueError("Zero length of word detected.")

ds.append(Instance(words=char_seq,
tag=pos_seq))

return ds



class ZhConllPOSReader(object):
# reader for the Chinese conll format
def __init__(self):
pass

def load(self, path):
"""
The returned DataSet contains the following fields:
words: list of str,
tag: list of str, with BMES tags added; e.g. an original sequence ['VP', 'NN', 'NN', ..] is turned into ["S-VP", "B-NN", "M-NN", ..]
The input is assumed to be in conll format, with two sentences separated by an empty line and 7 columns per line, e.g.
1 编者按 编者按 NN O 11 nmod:topic
2 : : PU O 11 punct
3 7月 7月 NT DATE 4 compound:nn
4 12日 12日 NT DATE 11 nmod:tmod
5 , , PU O 11 punct

1 这 这 DT O 3 det
2 款 款 M O 1 mark:clf
3 飞行 飞行 NN O 8 nsubj
4 从 从 P O 5 case
5 外型 外型 NN O 8 nmod:prep
"""
datalist = []
with open(path, 'r', encoding='utf-8') as f:
sample = []
for line in f:
if line.startswith('\n'):
datalist.append(sample)
sample = []
elif line.startswith('#'):
continue
else:
sample.append(line.split('\t'))
if len(sample) > 0:
datalist.append(sample)

ds = DataSet()
for sample in datalist:
# print(sample)
res = self.get_one(sample)
if res is None:
continue
char_seq = []
pos_seq = []
for word, tag in zip(res[0], res[1]):
char_seq.extend(list(word))
if len(word)==1:
pos_seq.append('S-{}'.format(tag))
elif len(word)>1:
pos_seq.append('B-{}'.format(tag))
for _ in range(len(word)-2):
pos_seq.append('M-{}'.format(tag))
pos_seq.append('E-{}'.format(tag))
else:
raise ValueError("Zero length of word detected.")

ds.append(Instance(words=char_seq,
tag=pos_seq))

return ds

def get_one(self, sample):
if len(sample)==0:
return None
text = []
pos_tags = []
for w in sample:
t1, t2, t3, t4 = w[1], w[3], w[6], w[7]
if t3 == '_':
return None
text.append(t1)
pos_tags.append(t2)
return text, pos_tags

if __name__ == '__main__':
reader = ZhConllPOSReader()
d = reader.load('/home/hyan/train.conllx')
print(d)

+ 0
- 113
reproduction/pos_tag_model/train_pos_tag.py View File

@@ -1,113 +0,0 @@
import argparse
import os
import pickle
import sys

import torch

# in order to run fastNLP without installation
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))


from fastNLP.api.pipeline import Pipeline
from fastNLP.api.processor import SeqLenProcessor
from fastNLP.core.metrics import SpanFPreRecMetric
from fastNLP.core.trainer import Trainer
from fastNLP.io.config_io import ConfigLoader, ConfigSection
from fastNLP.models.sequence_modeling import AdvSeqLabel
from reproduction.chinese_word_segment.process.cws_processor import VocabIndexerProcessor
from reproduction.pos_tag_model.pos_reader import ZhConllPOSReader
from fastNLP.api.processor import ModelProcessor, Index2WordProcessor

cfgfile = './pos_tag.cfg'
pickle_path = "save"


def load_tencent_embed(embed_path, word2id):
hit = 0
with open(embed_path, "rb") as f:
embed_dict = pickle.load(f)
embedding_tensor = torch.randn(len(word2id), 200)
for key in word2id:
if key in embed_dict:
embedding_tensor[word2id[key]] = torch.Tensor(embed_dict[key])
hit += 1
print("vocab_size={} hit={} hit/vocab_size={}".format(len(word2id), hit, hit / len(word2id)))
return embedding_tensor


def train(checkpoint=None):
# load config
train_param = ConfigSection()
model_param = ConfigSection()
ConfigLoader().load_config(cfgfile, {"train": train_param, "model": model_param})
print("config loaded")

# Data Loader
dataset = ZhConllPOSReader().load("/home/hyan/train.conllx")
print(dataset)
print("dataset transformed")

dataset.rename_field("tag", "truth")

vocab_proc = VocabIndexerProcessor("words", new_added_filed_name="word_seq")
tag_proc = VocabIndexerProcessor("truth")
seq_len_proc = SeqLenProcessor(field_name="word_seq", new_added_field_name="word_seq_origin_len", is_input=True)

vocab_proc(dataset)
tag_proc(dataset)
seq_len_proc(dataset)

dataset.set_input("word_seq", "word_seq_origin_len", "truth")
dataset.set_target("truth", "word_seq_origin_len")

print("processors defined")

# dataset.set_is_target(tag_ids=True)
model_param["vocab_size"] = vocab_proc.get_vocab_size()
model_param["num_classes"] = tag_proc.get_vocab_size()
print("vocab_size={} num_classes={}".format(model_param["vocab_size"], model_param["num_classes"]))

# define a model
if checkpoint is None:
# pre_trained = load_tencent_embed("/home/zyfeng/data/char_tencent_embedding.pkl", vocab_proc.vocab.word2idx)
pre_trained = None
model = AdvSeqLabel(model_param, id2words=tag_proc.vocab.idx2word, emb=pre_trained)
print(model)
else:
model = torch.load(checkpoint)

# call trainer to train
trainer = Trainer(dataset, model, loss=None, metrics=SpanFPreRecMetric(tag_proc.vocab, pred="predict",
target="truth",
seq_lens="word_seq_origin_len"),
dev_data=dataset, metric_key="f",
use_tqdm=True, use_cuda=True, print_every=5, n_epochs=6, save_path="./save")
trainer.train(load_best_model=True)

# save model & pipeline
model_proc = ModelProcessor(model, seq_len_field_name="word_seq_origin_len")
id2tag = Index2WordProcessor(tag_proc.vocab, "predict", "tag")

pp = Pipeline([vocab_proc, seq_len_proc, model_proc, id2tag])
save_dict = {"pipeline": pp, "model": model, "tag_vocab": tag_proc.vocab}
torch.save(save_dict, "model_pp.pkl")
print("pipeline saved")

torch.save(model, "./save/best_model.pkl")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--restart", action="store_true", help="whether to continue training")
parser.add_argument("-cp", "--checkpoint", type=str, help="checkpoint of the trained model")
args = parser.parse_args()

if args.restart is True:
# resume training: python train_pos_tag.py -c -cp ./save/best_model.pkl
if args.checkpoint is None:
raise RuntimeError("Please provide the checkpoint. -cp ")
train(args.checkpoint)
else:
# train from scratch: python train_pos_tag.py
train()

+ 48
- 2
test/api/test_processor.py View File

@@ -1,9 +1,12 @@
import random
import unittest

from fastNLP import Vocabulary
import numpy as np

from fastNLP import Vocabulary, Instance
from fastNLP.api.processor import FullSpaceToHalfSpaceProcessor, PreAppendProcessor, SliceProcessor, Num2TagProcessor, \
IndexerProcessor, VocabProcessor, SeqLenProcessor
IndexerProcessor, VocabProcessor, SeqLenProcessor, ModelProcessor, Index2WordProcessor, SetTargetProcessor, \
SetInputProcessor, VocabIndexerProcessor
from fastNLP.core.dataset import DataSet


@@ -53,3 +56,46 @@ class TestProcessor(unittest.TestCase):
ds = proc(ds)
for data in ds.field_arrays["len"].content:
self.assertEqual(data, 30)

def test_ModelProcessor(self):
from fastNLP.models.cnn_text_classification import CNNText
model = CNNText(100, 100, 5)
ins_list = []
for _ in range(64):
seq_len = np.random.randint(5, 30)
ins_list.append(Instance(word_seq=[np.random.randint(0, 100) for _ in range(seq_len)], seq_lens=seq_len))
data_set = DataSet(ins_list)
data_set.set_input("word_seq", "seq_lens")
proc = ModelProcessor(model)
data_set = proc(data_set)
self.assertTrue("pred" in data_set)

def test_Index2WordProcessor(self):
vocab = Vocabulary()
vocab.add_word_lst(["a", "b", "c", "d", "e"])
proc = Index2WordProcessor(vocab, "tag_id", "tag")
data_set = DataSet([Instance(tag_id=[np.random.randint(0, 7) for _ in range(32)])])
data_set = proc(data_set)
self.assertTrue("tag" in data_set)

def test_SetTargetProcessor(self):
proc = SetTargetProcessor("a", "b", "c")
data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
data_set = proc(data_set)
self.assertTrue(data_set["a"].is_target)
self.assertTrue(data_set["b"].is_target)
self.assertTrue(data_set["c"].is_target)

def test_SetInputProcessor(self):
proc = SetInputProcessor("a", "b", "c")
data_set = DataSet({"a": [1, 2, 3], "b": [1, 2, 3], "c": [1, 2, 3]})
data_set = proc(data_set)
self.assertTrue(data_set["a"].is_input)
self.assertTrue(data_set["b"].is_input)
self.assertTrue(data_set["c"].is_input)

def test_VocabIndexerProcessor(self):
proc = VocabIndexerProcessor("word_seq", "word_ids")
data_set = DataSet([Instance(word_seq=["a", "b", "c", "d", "e"])])
data_set = proc(data_set)
self.assertTrue("word_ids" in data_set)

+ 144
- 0
test/core/test_batch.py View File

@@ -1,13 +1,44 @@
import time
import unittest

import numpy as np
import torch

from fastNLP.core.batch import Batch
from fastNLP.core.dataset import DataSet
from fastNLP.core.dataset import construct_dataset
from fastNLP.core.instance import Instance
from fastNLP.core.sampler import SequentialSampler


def generate_fake_dataset(num_samples=1000):
"""
The generated DataSet contains the fields {'1': [], '2': [], '3': [], '4': []}
:param num_samples: number of samples
:return:
"""

max_len = 50
min_len = 10
num_features = 4

data_dict = {}
for i in range(num_features):
data = []
lengths = np.random.randint(min_len, max_len, size=(num_samples))
for length in lengths:
data.append(np.random.randint(100, size=length))
data_dict[str(i)] = data

dataset = DataSet(data_dict)

for i in range(num_features):
if np.random.randint(2) == 0:
dataset.set_input(str(i))
else:
dataset.set_target(str(i))
return dataset

class TestCase1(unittest.TestCase):
def test_simple(self):
dataset = construct_dataset(
@@ -31,3 +62,116 @@ class TestCase1(unittest.TestCase):
self.assertEqual(len(y["y"]), 4)
self.assertListEqual(list(x["x"][-1]), [1, 2, 3, 4])
self.assertListEqual(list(y["y"][-1]), [5, 6])

def test_list_padding(self):
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))

def test_numpy_padding(self):
ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=True)
for x, y in iter:
self.assertEqual(x["x"].shape, (4, 4))
self.assertEqual(y["y"].shape, (4, 4))

def test_list_to_tensor(self):
ds = DataSet({"x": [[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10,
"y": [[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))

def test_numpy_to_tensor(self):
ds = DataSet({"x": np.array([[1], [1, 2], [1, 2, 3], [1, 2, 3, 4]] * 10),
"y": np.array([[4, 3, 2, 1], [3, 2, 1], [2, 1], [1]] * 10)})
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))

def test_list_of_list_to_tensor(self):
ds = DataSet([Instance(x=[1, 2], y=[3, 4]) for _ in range(2)] +
[Instance(x=[1, 2, 3, 4], y=[3, 4, 5, 6]) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
self.assertTrue(isinstance(x["x"], torch.Tensor))
self.assertEqual(tuple(x["x"].shape), (4, 4))
self.assertTrue(isinstance(y["y"], torch.Tensor))
self.assertEqual(tuple(y["y"].shape), (4, 4))

def test_list_of_numpy_to_tensor(self):
ds = DataSet([Instance(x=np.array([1, 2]), y=np.array([3, 4])) for _ in range(2)] +
[Instance(x=np.array([1, 2, 3, 4]), y=np.array([3, 4, 5, 6])) for _ in range(2)])
ds.set_input("x")
ds.set_target("y")
iter = Batch(ds, batch_size=4, sampler=SequentialSampler(), as_numpy=False)
for x, y in iter:
print(x, y)

def test_sequential_batch(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)

batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler())
for batch_x, batch_y in batch:
time.sleep(pause_seconds)

"""
def test_multi_workers_batch(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)

num_workers = 1
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
for batch_x, batch_y in batch:
time.sleep(pause_seconds)

num_workers = 2
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers)
end1 = time.time()
for batch_x, batch_y in batch:
time.sleep(pause_seconds)
"""
"""
def test_pin_memory(self):
batch_size = 32
pause_seconds = 0.01
num_samples = 1000
dataset = generate_fake_dataset(num_samples)

batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), pin_memory=True)
# OOM occurs here
# for batch_x, batch_y in batch:
# time.sleep(pause_seconds)

num_workers = 2
batch = Batch(dataset, batch_size=batch_size, sampler=SequentialSampler(), num_workers=num_workers,
pin_memory=True)
# OOM occurs here
# for batch_x, batch_y in batch:
# time.sleep(pause_seconds)
"""

+ 112
- 18
test/core/test_callbacks.py View File

@@ -1,40 +1,47 @@
import unittest

import numpy as np
import torch

from fastNLP.core.callback import EchoCallback
from fastNLP.core.callback import EchoCallback, EarlyStopCallback, GradientClipCallback, LRScheduler, ControlC, \
LRFinder, \
TensorboardCallback
from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance
from fastNLP.core.losses import BCELoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.optimizer import SGD
from fastNLP.core.trainer import Trainer
from fastNLP.models.base_model import NaiveClassifier


class TestCallback(unittest.TestCase):
def test_case(self):
def prepare_fake_dataset():
mean = np.array([-3, -3])
cov = np.array([[1, 0], [0, 1]])
class_A = np.random.multivariate_normal(mean, cov, size=(1000,))
def prepare_env():
def prepare_fake_dataset():
mean = np.array([-3, -3])
cov = np.array([[1, 0], [0, 1]])
class_A = np.random.multivariate_normal(mean, cov, size=(1000,))

mean = np.array([3, 3])
cov = np.array([[1, 0], [0, 1]])
class_B = np.random.multivariate_normal(mean, cov, size=(1000,))
mean = np.array([3, 3])
cov = np.array([[1, 0], [0, 1]])
class_B = np.random.multivariate_normal(mean, cov, size=(1000,))

data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
[Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
return data_set
data_set = DataSet([Instance(x=[float(item[0]), float(item[1])], y=[0.0]) for item in class_A] +
[Instance(x=[float(item[0]), float(item[1])], y=[1.0]) for item in class_B])
return data_set

data_set = prepare_fake_dataset()
data_set.set_input("x")
data_set.set_target("y")
data_set = prepare_fake_dataset()
data_set.set_input("x")
data_set.set_target("y")
model = NaiveClassifier(2, 1)
return data_set, model

model = NaiveClassifier(2, 1)

class TestCallback(unittest.TestCase):
def test_echo_callback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=1,
n_epochs=2,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
@@ -42,3 +49,90 @@ class TestCallback(unittest.TestCase):
use_tqdm=False,
callbacks=[EchoCallback()])
trainer.train()

def test_gradient_clip(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[GradientClipCallback(model.parameters(), clip_value=2)])
trainer.train()

def test_early_stop(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=20,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.01),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[EarlyStopCallback(5)])
trainer.train()

def test_lr_scheduler(self):
data_set, model = prepare_env()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=optimizer,
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))])
trainer.train()

def test_KeyBoardInterrupt(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[ControlC(False)])
trainer.train()

def test_LRFinder(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
callbacks=[LRFinder(len(data_set) // 32)])
trainer.train()

def test_TensorboardCallback(self):
data_set, model = prepare_env()
trainer = Trainer(data_set, model,
loss=BCELoss(pred="predict", target="y"),
n_epochs=5,
batch_size=32,
print_every=50,
optimizer=SGD(lr=0.1),
check_code_level=2,
use_tqdm=False,
dev_data=data_set,
metrics=AccuracyMetric(pred="predict", target="y"),
callbacks=[TensorboardCallback("loss", "metric")])
trainer.train()
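
Each test above exercises one callback at a time. As a usage sketch (not part of the test suite itself), several of these callbacks can be combined in a single callbacks list, reusing prepare_env() and the same Trainer arguments shown above:

import torch
from fastNLP.core.callback import EarlyStopCallback, GradientClipCallback, LRScheduler
from fastNLP.core.losses import BCELoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.core.trainer import Trainer

data_set, model = prepare_env()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
trainer = Trainer(data_set, model,
                  loss=BCELoss(pred="predict", target="y"),
                  n_epochs=20, batch_size=32, print_every=50,
                  optimizer=optimizer, check_code_level=2, use_tqdm=False,
                  dev_data=data_set,
                  metrics=AccuracyMetric(pred="predict", target="y"),
                  callbacks=[GradientClipCallback(model.parameters(), clip_value=2),
                             EarlyStopCallback(5),
                             LRScheduler(torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1))])
trainer.train()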

+ 18
- 2
test/core/test_dataset.py

@@ -6,15 +6,29 @@ from fastNLP.core.fieldarray import FieldArray
from fastNLP.core.instance import Instance


class TestDataSet(unittest.TestCase):

class TestDataSetInit(unittest.TestCase):
"""There are several ways to initialize a DataSet:
1) from a dict:
1.1) 2-D list DataSet({"x": [[1, 2], [3, 4]]})
1.2) 2-D array DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 3-D list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
2) from a list of Instance:
2.1) 1-D list DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 1-D array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 2-D list DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 2-D array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])

Only plain lists, or an ndarray at the outermost level, are accepted.
"""
def test_init_v1(self):
# 1-D list
ds = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)
self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
self.assertEqual(ds.field_arrays["y"].content, [[5, 6], ] * 40)

def test_init_v2(self):
# from a dict
ds = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})
self.assertTrue("x" in ds.field_arrays and "y" in ds.field_arrays)
self.assertEqual(ds.field_arrays["x"].content, [[1, 2, 3, 4], ] * 40)
@@ -28,6 +42,8 @@ class TestDataSet(unittest.TestCase):
with self.assertRaises(ValueError):
_ = DataSet(0.00001)


class TestDataSetMethods(unittest.TestCase):
def test_append(self):
dd = DataSet()
for _ in range(3):
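
A standalone sketch of the two initialization styles enumerated in the TestDataSetInit docstring above (a dict of field lists vs. a list of Instance objects); it only restates what test_init_v1 and test_init_v2 assert:

from fastNLP.core.dataset import DataSet
from fastNLP.core.instance import Instance

ds_from_dict = DataSet({"x": [[1, 2, 3, 4]] * 40, "y": [[5, 6]] * 40})    # dict: field name -> list of values
ds_from_instances = DataSet([Instance(x=[1, 2, 3, 4], y=[5, 6])] * 40)    # list of Instance objects
assert ds_from_dict.field_arrays["x"].content == ds_from_instances.field_arrays["x"].content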


+ 125
- 6
test/core/test_fieldarray.py

@@ -5,8 +5,65 @@ import numpy as np
from fastNLP.core.fieldarray import FieldArray


class TestFieldArrayInit(unittest.TestCase):
"""
1) If the DataSet is initialized from a dict, add_field builds the FieldArray:
1.1) 2-D list DataSet({"x": [[1, 2], [3, 4]]})
1.2) 2-D array DataSet({"x": np.array([[1, 2], [3, 4]])})
1.3) 3-D list DataSet({"x": [[[1, 2], [3, 4]], [[1, 2], [3, 4]]]})
2) If the DataSet is initialized from a list of Instance, the first sample initializes the FieldArray in append;
the remaining samples are then added with FieldArray.append.
2.1) 1-D list DataSet([Instance(x=[1, 2, 3, 4])])
2.2) 1-D array DataSet([Instance(x=np.array([1, 2, 3, 4]))])
2.3) 2-D list DataSet([Instance(x=[[1, 2], [3, 4]])])
2.4) 2-D array DataSet([Instance(x=np.array([[1, 2], [3, 4]]))])
"""

def test_init_v1(self):
# 2-D list
fa = FieldArray("x", [[1, 2], [3, 4]] * 5, is_input=True)

def test_init_v2(self):
# 2-D array
fa = FieldArray("x", np.array([[1, 2], [3, 4]] * 5), is_input=True)

def test_init_v3(self):
# 3-D list
fa = FieldArray("x", [[[1, 2], [3, 4]], [[1, 2], [3, 4]]], is_input=True)

def test_init_v7(self):
# list of array
fa = FieldArray("x", [np.array([[1, 2], [3, 4]]), np.array([[1, 2], [3, 4]])], is_input=True)
self.assertEqual(fa.pytype, int)
self.assertEqual(fa.dtype, np.int)

def test_init_v4(self):
# 1-D list
val = [1, 2, 3, 4]
fa = FieldArray("x", [val], is_input=True)
fa.append(val)

def test_init_v5(self):
# 1-D array
val = np.array([1, 2, 3, 4])
fa = FieldArray("x", [val], is_input=True)
fa.append(val)

def test_init_v6(self):
# 2-D list
val = [[1, 2], [3, 4]]
fa = FieldArray("x", [val], is_input=True)
fa.append(val)

def test_init_v8(self):
# 2-D array
val = np.array([[1, 2], [3, 4]])
fa = FieldArray("x", [val], is_input=True)
fa.append(val)


class TestFieldArray(unittest.TestCase):
def test(self):
def test_main(self):
fa = FieldArray("x", [1, 2, 3, 4, 5], is_input=True)
self.assertEqual(len(fa), 5)
fa.append(6)
@@ -42,13 +99,13 @@ class TestFieldArray(unittest.TestCase):
self.assertEqual(fa.pytype, str)

def test_support_np_array(self):
fa = FieldArray("y", [np.array([1.1, 2.2, 3.3, 4.4, 5.5])], is_input=True)
self.assertEqual(fa.dtype, np.ndarray)
self.assertEqual(fa.pytype, np.ndarray)
fa = FieldArray("y", np.array([[1.1, 2.2, 3.3, 4.4, 5.5]]), is_input=True)
self.assertEqual(fa.dtype, np.float64)
self.assertEqual(fa.pytype, float)

fa.append(np.array([1.1, 2.2, 3.3, 4.4, 5.5]))
self.assertEqual(fa.dtype, np.ndarray)
self.assertEqual(fa.pytype, np.ndarray)
self.assertEqual(fa.dtype, np.float64)
self.assertEqual(fa.pytype, float)

fa = FieldArray("my_field", np.random.rand(3, 5), is_input=True)
# in this case, pytype is actually a float. We do not care about it.
@@ -97,3 +154,65 @@ class TestFieldArray(unittest.TestCase):
fa.append([1.2, 2.3, 3.4, 4.5, 5.6])
self.assertEqual(len(fa), 3)
self.assertEqual(fa[2], [1.2, 2.3, 3.4, 4.5, 5.6])


class TestPadder(unittest.TestCase):

def test01(self):
"""
Check that AutoPadder works as expected.
"""
from fastNLP.core.fieldarray import AutoPadder
padder = AutoPadder()
content = ['This is a str', 'this is another str']
self.assertListEqual(content, padder(content, None, np.str).tolist())

content = [1, 2]
self.assertListEqual(content, padder(content, None, np.int64).tolist())

content = [[1, 2], [3], [4]]
self.assertListEqual([[1, 2], [3, 0], [4, 0]],
padder(content, None, np.int64).tolist())

content = [
[[1, 2, 3], [4, 5], [7, 8, 9, 10]],
[[1]]
]
self.assertListEqual(content,
padder(content, None, np.int64).tolist())

def test02(self):
"""
Check that EngChar2DPadder can be used correctly.
"""
from fastNLP.core.fieldarray import EngChar2DPadder
padder = EngChar2DPadder(pad_length=0)

contents = [1, 2]
# 1-D input is rejected
with self.assertRaises(ValueError):
padder(contents, None, np.int64)
contents = [[1, 2]]
# 2-D input is rejected
with self.assertRaises(ValueError):
padder(contents, None, np.int64)
contents = [[[[1, 2]]]]
# input with more than 3 dimensions is rejected
with self.assertRaises(ValueError):
padder(contents, None, np.int64)

contents = [
[[1, 2, 3], [4, 5], [7, 8, 9, 10]],
[[1]]
]
self.assertListEqual([[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]],
padder(contents, None, np.int64).tolist())

padder = EngChar2DPadder(pad_length=5, pad_val=-100)
self.assertListEqual(
[[[1, 2, 3, -100, -100], [4, 5, -100, -100, -100], [7, 8, 9, 10, -100]],
[[1, -100, -100, -100, -100], [-100, -100, -100, -100, -100], [-100, -100, -100, -100, -100]]],
padder(contents, None, np.int64).tolist()
)
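
A condensed sketch of the two padders exercised by TestPadder above; the inputs and expected outputs are taken directly from those tests (a padder is called with the content, the field name, and the element dtype):

import numpy as np
from fastNLP.core.fieldarray import AutoPadder, EngChar2DPadder

auto_padder = AutoPadder()
# variable-length 1-D content is padded with 0; more deeply nested content is returned unchanged
print(auto_padder([[1, 2], [3], [4]], None, np.int64).tolist())    # [[1, 2], [3, 0], [4, 0]]

char_padder = EngChar2DPadder(pad_length=0)    # pad_length=0 appears to mean: pad to the longest word
chars = [[[1, 2, 3], [4, 5], [7, 8, 9, 10]], [[1]]]    # batch x words x characters
print(char_padder(chars, None, np.int64).tolist())
# [[[1, 2, 3, 0], [4, 5, 0, 0], [7, 8, 9, 10]], [[1, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]]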

+ 21
- 6
test/core/test_predictor.py

@@ -1,4 +1,5 @@
import unittest
from collections import defaultdict

import numpy as np
import torch
@@ -23,12 +24,26 @@ def prepare_fake_dataset():
return data_set


class LinearModel(torch.nn.Module):
def __init__(self):
super(LinearModel, self).__init__()
self.linear = Linear(2, 1)

def forward(self, x):
return {"predict": self.linear(x)}


class TestPredictor(unittest.TestCase):
def test(self):
predictor = Predictor()
model = Linear(2, 1)
def test_simple(self):
model = LinearModel()
predictor = Predictor(model)
data = prepare_fake_dataset()
data.set_input("x")
ans = predictor.predict(model, data)
self.assertEqual(len(ans), 2000)
self.assertTrue(isinstance(ans[0], torch.Tensor))
ans = predictor.predict(data)
self.assertTrue(isinstance(ans, defaultdict))
self.assertTrue("predict" in ans)
self.assertTrue(isinstance(ans["predict"], list))

def test_sequence(self):
# test sequence input/output
pass
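
A short sketch of the Predictor usage pattern the test above establishes: the model is now passed to the constructor rather than to predict(), and the result is a defaultdict keyed by the model's output names. LinearModel and prepare_fake_dataset refer to the definitions in test_predictor.py above; the import path is assumed to be fastNLP.core.predictor:

from fastNLP.core.predictor import Predictor

model = LinearModel()              # its forward() returns {"predict": ...}
data = prepare_fake_dataset()
data.set_input("x")

predictor = Predictor(model)
ans = predictor.predict(data)
predictions = ans["predict"]       # a list of model outputs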

+ 29
- 3
test/core/test_trainer.py

@@ -237,6 +237,32 @@ class TrainerTestGround(unittest.TestCase):
use_tqdm=False,
print_every=2)

def test_case2(self):
# check handling of wrong metrics
data_set = prepare_fake_dataset2('x1', 'x2')
"""
def test_trainer_multiprocess(self):
dataset = prepare_fake_dataset2('x1', 'x2')
dataset.set_input('x1', 'x2', 'y', flag=True)

class Model(nn.Module):
def __init__(self):
super().__init__()
self.fc = nn.Linear(5, 4)

def forward(self, x1, x2, y):
x1 = self.fc(x1)
x2 = self.fc(x2)
x = x1 + x2
loss = F.cross_entropy(x, y)
return {'loss': loss}

model = Model()
trainer = Trainer(
train_data=dataset,
model=model,
use_tqdm=True,
print_every=2,
num_workers=2,
pin_memory=False,
timeout=0,
)
trainer.train()
"""

+ 0
- 3370
test/data_for_tests/charlm.txt
File diff suppressed because it is too large


+ 0
- 2
test/data_for_tests/people_infer.txt

@@ -1,2 +0,0 @@
迈向充满希望的新世纪——一九九八年新年讲话
(附图片1张)

+ 100
- 0
test/data_for_tests/zh_sample.conllx

@@ -0,0 +1,100 @@
1 上海 _ NR NR _ 3 nsubj _ _
2 积极 _ AD AD _ 3 advmod _ _
3 准备 _ VV VV _ 0 root _ _
4 迎接 _ VV VV _ 3 ccomp _ _
5 欧元 _ NN NN _ 6 nn _ _
6 诞生 _ NN NN _ 4 dobj _ _

1 新华社 _ NR NR _ 7 dep _ _
2 上海 _ NR NR _ 7 dep _ _
3 十二月 _ NT NT _ 7 dep _ _
4 三十日 _ NT NT _ 7 dep _ _
5 电 _ NN NN _ 7 dep _ _
6 ( _ PU PU _ 7 punct _ _
7 记者 _ NN NN _ 0 root _ _
8 潘清 _ NR NR _ 7 dep _ _
9 ) _ PU PU _ 7 punct _ _

1 即将 _ AD AD _ 2 advmod _ _
2 诞生 _ VV VV _ 4 rcmod _ _
3 的 _ DEC DEC _ 2 cpm _ _
4 欧元 _ NN NN _ 6 nsubj _ _
5 , _ PU PU _ 6 punct _ _
6 引起 _ VV VV _ 0 root _ _
7 了 _ AS AS _ 6 asp _ _
8 上海 _ NR NR _ 14 nn _ _
9 这 _ DT DT _ 14 det _ _
10 个 _ M M _ 9 clf _ _
11 中国 _ NR NR _ 13 nn _ _
12 金融 _ NN NN _ 13 nn _ _
13 中心 _ NN NN _ 14 nn _ _
14 城市 _ NN NN _ 16 assmod _ _
15 的 _ DEG DEG _ 14 assm _ _
16 关注 _ NN NN _ 6 dobj _ _
17 。 _ PU PU _ 6 punct _ _

1 上海 _ NR NR _ 2 nn _ _
2 银行界 _ NN NN _ 4 nsubj _ _
3 纷纷 _ AD AD _ 4 advmod _ _
4 推出 _ VV VV _ 0 root _ _
5 了 _ AS AS _ 4 asp _ _
6 与 _ P P _ 8 prep _ _
7 之 _ PN PN _ 6 pobj _ _
8 相关 _ VA VA _ 15 rcmod _ _
9 的 _ DEC DEC _ 8 cpm _ _
10 外汇 _ NN NN _ 15 nn _ _
11 业务 _ NN NN _ 15 nn _ _
12 品种 _ NN NN _ 15 conj _ _
13 和 _ CC CC _ 15 cc _ _
14 服务 _ NN NN _ 15 nn _ _
15 举措 _ NN NN _ 4 dobj _ _
16 , _ PU PU _ 4 punct _ _
17 积极 _ AD AD _ 18 advmod _ _
18 准备 _ VV VV _ 4 dep _ _
19 启动 _ VV VV _ 18 ccomp _ _
20 欧元 _ NN NN _ 21 nn _ _
21 业务 _ NN NN _ 19 dobj _ _
22 。 _ PU PU _ 4 punct _ _

1 一些 _ CD CD _ 8 nummod _ _
2 热衷于 _ VV VV _ 8 rcmod _ _
3 个人 _ NN NN _ 5 nn _ _
4 外汇 _ NN NN _ 5 nn _ _
5 交易 _ NN NN _ 2 dobj _ _
6 的 _ DEC DEC _ 2 cpm _ _
7 上海 _ NR NR _ 8 nn _ _
8 市民 _ NN NN _ 13 nsubj _ _
9 , _ PU PU _ 13 punct _ _
10 也 _ AD AD _ 13 advmod _ _
11 对 _ P P _ 13 prep _ _
12 欧元 _ NN NN _ 11 pobj _ _
13 表示 _ VV VV _ 0 root _ _
14 出 _ VV VV _ 13 rcomp _ _
15 极 _ AD AD _ 16 advmod _ _
16 大 _ VA VA _ 18 rcmod _ _
17 的 _ DEC DEC _ 16 cpm _ _
18 兴趣 _ NN NN _ 13 dobj _ _
19 。 _ PU PU _ 13 punct _ _

1 继 _ P P _ 38 prep _ _
2 上海 _ NR NR _ 6 nn _ _
3 大众 _ NR NR _ 6 nn _ _
4 汽车 _ NN NN _ 6 nn _ _
5 有限 _ JJ JJ _ 6 amod _ _
6 公司 _ NN NN _ 13 nsubj _ _
7 十八日 _ NT NT _ 13 tmod _ _
8 在 _ P P _ 13 prep _ _
9 中国 _ NR NR _ 10 nn _ _
10 银行 _ NN NN _ 12 nn _ _
11 上海 _ NR NR _ 12 nn _ _
12 分行 _ NN NN _ 8 pobj _ _
13 开立 _ VV VV _ 19 lccomp _ _
14 上海 _ NR NR _ 16 dep _ _
15 第一 _ OD OD _ 16 ordmod _ _
16 个 _ M M _ 18 clf _ _
17 欧元 _ NN NN _ 18 nn _ _
18 帐户 _ NN NN _ 13 dobj _ _
19 后 _ LC LC _ 1 plmod _ _
20 , _ PU PU _ 38 punct _ _
21 工商 _ NN NN _ 28 nn _ _
22 银行 _ NN NN _ 28 conj _ _
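
The zh_sample.conllx fixture above uses the CoNLL-X layout: one token per line with whitespace-separated columns (ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL, ...) and a blank line between sentences. The test suite reads it through fastNLP's ConllxDataLoader (see below); the following standalone reader is only an illustration of the layout:

def read_conllx(path):
    """Return a list of sentences; each token is (id, form, pos, head, deprel)."""
    sentences, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:                 # a blank line closes the current sentence
                if current:
                    sentences.append(current)
                    current = []
                continue
            cols = line.split()
            current.append((int(cols[0]), cols[1], cols[4], int(cols[6]), cols[7]))
    if current:
        sentences.append(current)
    return sentences

sents = read_conllx("test/data_for_tests/zh_sample.conllx")
print(len(sents), sents[0][:3])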

+ 16
- 13
test/io/test_dataset_loader.py

@@ -1,24 +1,27 @@
import unittest

from fastNLP.io.dataset_loader import Conll2003Loader
from fastNLP.io.dataset_loader import Conll2003Loader, PeopleDailyCorpusLoader, ConllCWSReader, \
ZhConllPOSReader, ConllxDataLoader


class TestDatasetLoader(unittest.TestCase):

def test_case_1(self):
'''
def test_Conll2003Loader(self):
"""
Test the loader of the Conll2003 dataset
'''

"""
dataset_path = "test/data_for_tests/conll_2003_example.txt"
loader = Conll2003Loader()
dataset_2003 = loader.load(dataset_path)

for item in dataset_2003:
len0 = len(item["label0_list"])
len1 = len(item["label1_list"])
len2 = len(item["label2_list"])
lentoken = len(item["token_list"])
self.assertNotEqual(len0, 0)
self.assertEqual(len0, len1)
self.assertEqual(len1, len2)
def test_PeopleDailyCorpusLoader(self):
data_set = PeopleDailyCorpusLoader().load("test/data_for_tests/people_daily_raw.txt")

def test_ConllCWSReader(self):
dataset = ConllCWSReader().load("test/data_for_tests/conll_example.txt")

def test_ZhConllPOSReader(self):
dataset = ZhConllPOSReader().load("test/data_for_tests/zh_sample.conllx")

def test_ConllxDataLoader(self):
dataset = ConllxDataLoader().load("test/data_for_tests/zh_sample.conllx")

+ 21
- 0
test/models/test_bert.py

@@ -0,0 +1,21 @@
import unittest

import torch

from fastNLP.models.bert import BertModel


class TestBert(unittest.TestCase):
def test_bert_1(self):
# model = BertModel.from_pretrained("/home/zyfeng/data/bert-base-chinese")
model = BertModel(vocab_size=32000, hidden_size=768,
num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072)

input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])

all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
for layer in all_encoder_layers:
self.assertEqual(tuple(layer.shape), (2, 3, 768))
self.assertEqual(tuple(pooled_output.shape), (2, 768))
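
The test above builds a randomly initialized BertModel from explicit hyper-parameters; the commented-out first line suggests that the new fastNLP.models.bert module can also load pretrained weights through BertModel.from_pretrained. A hedged sketch of that path (the checkpoint directory is hypothetical and must already contain a downloaded bert-base-chinese model):

import torch
from fastNLP.models.bert import BertModel

model = BertModel.from_pretrained("/path/to/bert-base-chinese")    # hypothetical local directory
input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
input_mask = torch.LongTensor([[1, 1, 1], [1, 1, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1], [0, 1, 0]])
all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)
print(all_encoder_layers[-1].shape, pooled_output.shape)           # (2, 3, 768) and (2, 768)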

+ 11
- 7
test/models/test_biaffine_parser.py

@@ -1,8 +1,8 @@
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric
import fastNLP

import unittest

import fastNLP
from fastNLP.models.biaffine_parser import BiaffineParser, ParserLoss, ParserMetric

data_file = """
1 The _ DET DT _ 3 det _ _
2 new _ ADJ JJ _ 3 amod _ _
@@ -41,6 +41,7 @@ data_file = """

"""


def init_data():
ds = fastNLP.DataSet()
v = {'word_seq': fastNLP.Vocabulary(),
@@ -60,28 +61,31 @@ def init_data():
data.append(line)

for name in ['word_seq', 'pos_seq', 'label_true']:
ds.apply(lambda x: ['<st>']+list(x[name]), new_field_name=name)
ds.apply(lambda x: ['<st>'] + list(x[name]), new_field_name=name)
ds.apply(lambda x: v[name].add_word_lst(x[name]))

for name in ['word_seq', 'pos_seq', 'label_true']:
ds.apply(lambda x: [v[name].to_index(w) for w in x[name]], new_field_name=name)

ds.apply(lambda x: [0]+list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: [0] + list(map(int, x['arc_true'])), new_field_name='arc_true')
ds.apply(lambda x: len(x['word_seq']), new_field_name='seq_lens')
ds.set_input('word_seq', 'pos_seq', 'seq_lens', flag=True)
ds.set_target('arc_true', 'label_true', 'seq_lens', flag=True)
return ds, v['word_seq'], v['pos_seq'], v['label_true']


class TestBiaffineParser(unittest.TestCase):
def test_train(self):
ds, v1, v2, v3 = init_data()
model = BiaffineParser(word_vocab_size=len(v1), word_emb_dim=30,
pos_vocab_size=len(v2), pos_emb_dim=30,
num_label=len(v3), use_var_lstm=True)
num_label=len(v3), encoder='var-lstm')
trainer = fastNLP.Trainer(model=model, train_data=ds, dev_data=ds,
loss=ParserLoss(), metrics=ParserMetric(), metric_key='UAS',
batch_size=1, validate_every=10,
n_epochs=10, use_cuda=False, use_tqdm=False)
trainer.train(load_best_model=False)


if __name__ == '__main__':
unittest.main()
unittest.main()

+ 0
- 91
test/test_tutorial.py

@@ -1,91 +0,0 @@
import unittest

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Tester
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric
from fastNLP.models import CNNText


class TestTutorial(unittest.TestCase):
def test_tutorial(self):
# Read the csv data into a DataSet
sample_path = "test/data_for_tests/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])

dataset.append(Instance(raw_sentence='fake data', label='0'))
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
dataset.apply(lambda x: int(x['label']), new_field_name='label')

# split sentences on whitespace
def split_sent(ins):
return ins['raw_sentence'].split()

dataset.apply(split_sent, new_field_name='words')
# add length information
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])

# filter out samples with DataSet.drop(func)
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))

# decide which DataSet fields will be converted to tensors
# set target: the gold fields used when computing the loss and evaluating the model
dataset.set_target("label")
# set input: the fields fed to the model's forward()
dataset.set_input("words")

# split into test and training sets
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))

# build the vocabulary with Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()

# index the sentences with Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])

model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer
from copy import deepcopy

# Rename the DataSet fields so that they match the parameter names of the model's forward()
train_data.rename_field('words', 'word_seq')  # the input field must match the forward parameter
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')

# Instantiate a Trainer with the model and the data, then train
copy_model = deepcopy(model)
overfit_trainer = Trainer(train_data=test_data, model=copy_model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
overfit_trainer.train()

trainer = Trainer(train_data=train_data, model=model,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"), n_epochs=10, batch_size=4,
dev_data=test_data, save_path="./save")
trainer.train()
print('Train finished!')

# Use fastNLP's Tester to evaluate the model
tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)

+ 432
- 0
test/test_tutorials.py

@@ -0,0 +1,432 @@
import unittest

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP.core.losses import CrossEntropyLoss
from fastNLP.core.metrics import AccuracyMetric


class TestTutorial(unittest.TestCase):
def test_fastnlp_10min_tutorial(self):
# Read the csv data into a DataSet
sample_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
dataset = DataSet.read_csv(sample_path, headers=('raw_sentence', 'label'),
sep='\t')
print(len(dataset))
print(dataset[0])
print(dataset[-3])

dataset.append(Instance(raw_sentence='fake data', label='0'))
# lowercase the raw sentences
dataset.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
dataset.apply(lambda x: int(x['label']), new_field_name='label')

# split sentences on whitespace
def split_sent(ins):
return ins['raw_sentence'].split()

dataset.apply(split_sent, new_field_name='words')

# add length information
dataset.apply(lambda x: len(x['words']), new_field_name='seq_len')
print(len(dataset))
print(dataset[0])

# filter out samples with DataSet.drop(func)
dataset.drop(lambda x: x['seq_len'] <= 3)
print(len(dataset))

# decide which DataSet fields will be converted to tensors
# set target: the gold fields used when computing the loss and evaluating the model
dataset.set_target("label")
# set input: the fields fed to the model's forward()
dataset.set_input("words", "seq_len")

# split into test and training sets
test_data, train_data = dataset.split(0.5)
print(len(test_data))
print(len(train_data))

# build the vocabulary with Vocabulary.add(word)
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])
vocab.build_vocab()

# index the sentences with Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='words')
print(test_data[0])

# These preprocessing utilities can also be used for projects such as reinforcement learning or GANs
from fastNLP.core.batch import Batch
from fastNLP.core.sampler import RandomSampler

batch_iterator = Batch(dataset=train_data, batch_size=2, sampler=RandomSampler())
for batch_x, batch_y in batch_iterator:
print("batch_x has: ", batch_x)
print("batch_y has: ", batch_y)
break

from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer
from copy import deepcopy

# Rename the DataSet fields so that they match the parameter names of the model's forward()
train_data.rename_field('words', 'word_seq')  # the input field must match the forward parameter
train_data.rename_field('label', 'label_seq')
test_data.rename_field('words', 'word_seq')
test_data.rename_field('label', 'label_seq')

loss = CrossEntropyLoss(pred="output", target="label_seq")
metric = AccuracyMetric(pred="predict", target="label_seq")

# Instantiate a Trainer with the model and the data, then train
# First fit on test_data (to make sure the model implementation is correct)
copy_model = deepcopy(model)
overfit_trainer = Trainer(model=copy_model, train_data=test_data, dev_data=test_data,
loss=loss,
metrics=metric,
save_path=None,
batch_size=32,
n_epochs=5)
overfit_trainer.train()

# Train on train_data, validate on test_data
trainer = Trainer(model=model, train_data=train_data, dev_data=test_data,
loss=CrossEntropyLoss(pred="output", target="label_seq"),
metrics=AccuracyMetric(pred="predict", target="label_seq"),
save_path=None,
batch_size=32,
n_epochs=5)
trainer.train()
print('Train finished!')

# Use Tester to evaluate on test_data
from fastNLP import Tester

tester = Tester(data=test_data, model=model, metrics=AccuracyMetric(pred="predict", target="label_seq"),
batch_size=4)
acc = tester.test()
print(acc)

def test_fastnlp_1min_tutorial(self):
# tutorials/fastnlp_1min_tutorial.ipynb
data_path = "tutorials/sample_data/tutorial_sample_dataset.csv"
ds = DataSet.read_csv(data_path, headers=('raw_sentence', 'label'), sep='\t')
print(ds[1])

# lowercase the raw sentences
ds.apply(lambda x: x['raw_sentence'].lower(), new_field_name='raw_sentence')
# convert label to int
ds.apply(lambda x: int(x['label']), new_field_name='label_seq', is_target=True)

def split_sent(ins):
return ins['raw_sentence'].split()

ds.apply(split_sent, new_field_name='words', is_input=True)

# split into training/validation sets
train_data, dev_data = ds.split(0.3)
print("Train size: ", len(train_data))
print("Test size: ", len(dev_data))

from fastNLP import Vocabulary
vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(word) for word in x['words']])

# index the sentences with Vocabulary.to_index(word)
train_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['words']], new_field_name='word_seq',
is_input=True)

from fastNLP.models import CNNText
model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)

from fastNLP import Trainer, CrossEntropyLoss, AccuracyMetric
trainer = Trainer(model=model,
train_data=train_data,
dev_data=dev_data,
loss=CrossEntropyLoss(),
metrics=AccuracyMetric()
)
trainer.train()
print('Train finished!')

def test_fastnlp_advanced_tutorial(self):
import os
os.chdir("tutorials/fastnlp_advanced_tutorial")

from fastNLP import DataSet
from fastNLP import Instance
from fastNLP import Vocabulary
from fastNLP import Trainer
from fastNLP import Tester

# ### Instance
# An Instance represents one sample, made up of one or more fields (attributes/features); each field has its own name and value
# The fields of an Instance are declared at construction time, using the "field_name=field_value" syntax

# In[2]:

# Build an Instance consisting of three fields: premise, hypothesis and label
instance = Instance(premise='an premise example .', hypothesis='an hypothesis example.', label=1)
instance

# In[3]:

data_set = DataSet([instance] * 5)
data_set.append(instance)
data_set[-2:]

# In[4]:

# An instance can still be appended even if one of its fields has a different type from the dataset's corresponding field
instance2 = Instance(premise='the second premise example .', hypothesis='the second hypothesis example.',
label='1')
try:
data_set.append(instance2)
except:
pass
data_set[-2:]

# In[5]:

# If a field name does not match, the instance cannot be appended to the dataset
instance3 = Instance(premises='the third premise example .', hypothesis='the third hypothesis example.',
label=1)
try:
data_set.append(instance3)
except:
print('cannot append instance')
pass
data_set[-2:]

# In[6]:

# Besides text, a tensor can also be the value of a field
import torch
tensor_ins = Instance(image=torch.randn(5, 5), label=0)
ds = DataSet()
ds.append(tensor_ins)
ds

from fastNLP import DataSet
from fastNLP import Instance

# Read csv data into a DataSet
# Any csv-like file (one example per line) can be read this way
dataset = DataSet.read_csv('tutorial_sample_dataset.csv', headers=('raw_sentence', 'label'), sep='\t')
# check the size of the DataSet
len(dataset)

# In[8]:

# index with an integer [k] to get the k-th sample
dataset[0]

# In[9]:

# the returned sample is an Instance
type(dataset[0])

# In[10]:

# index with a slice [a: b] to get samples a through b
dataset[0: 3]

# In[11]:

# indices may also be negative
dataset[-1]

data_path = ['premise', 'hypothesis', 'label']

# read in the files
with open(data_path[0]) as f:
premise = f.readlines()

with open(data_path[1]) as f:
hypothesis = f.readlines()

with open(data_path[2]) as f:
label = f.readlines()

assert len(premise) == len(hypothesis) and len(hypothesis) == len(label)

# assemble the DataSet
data_set = DataSet()
for p, h, l in zip(premise, hypothesis, label):
p = p.strip()  # strip trailing whitespace
h = h.strip()  # strip trailing whitespace
data_set.append(Instance(premise=p, hypothesis=h, truth=l))

data_set[0]

# ### Other DataSet operations
# After a DataSet has been built, its contents can still be manipulated via DataSet.apply()

# In[13]:

# lowercase all text in the premise field
data_set.apply(lambda x: x['premise'].lower(), new_field_name='premise')
data_set[-2:]

# In[14]:

# convert label to int
data_set.apply(lambda x: int(x['truth']), new_field_name='truth')
data_set[-2:]

# In[15]:

# split sentences on whitespace
def split_sent(ins):
return ins['premise'].split()

data_set.apply(split_sent, new_field_name='premise')
data_set.apply(lambda x: x['hypothesis'].split(), new_field_name='hypothesis')
data_set[-2:]

# In[16]:

# filter the data
origin_data_set_len = len(data_set)
data_set.drop(lambda x: len(x['premise']) <= 6)
origin_data_set_len, len(data_set)

# In[17]:

# add length information
data_set.apply(lambda x: [1] * len(x['premise']), new_field_name='premise_len')
data_set.apply(lambda x: [1] * len(x['hypothesis']), new_field_name='hypothesis_len')
data_set[-1]

# In[18]:

# set the input (feature) fields and the target (label) field
data_set.set_input("premise", "premise_len", "hypothesis", "hypothesis_len")
data_set.set_target("truth")

# In[19]:

# rename a field
data_set.rename_field('truth', 'label')
data_set[-1]

# In[20]:

# split into training, dev and test sets
train_data, vad_data = data_set.split(0.5)
dev_data, test_data = vad_data.split(0.4)
len(train_data), len(dev_data), len(test_data)

# In[21]:

# make deep copies of the datasets
import copy
train_data_2, dev_data_2 = copy.deepcopy(train_data), copy.deepcopy(dev_data)
del copy

# Initialize the vocabulary: at most 10000 entries, minimum word frequency 2; '<unk>' marks unknown words and '<pad>' marks padding
# Vocabulary's default arguments are max_size=None, min_freq=None, unknown='<unk>', padding='<pad>'
vocab = Vocabulary(max_size=10000, min_freq=2, unknown='<unk>', padding='<pad>')

# build the vocabulary
train_data.apply(lambda x: [vocab.add(word) for word in x['premise']])
train_data.apply(lambda x: [vocab.add(word) for word in x['hypothesis']])
vocab.build_vocab()

# In[23]:

# index the sentences with the vocabulary
train_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
train_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['premise']], new_field_name='premise')
test_data.apply(lambda x: [vocab.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data[-1], dev_data[-1], test_data[-1]

# read in the vocab file
with open('vocab.txt') as f:
lines = f.readlines()
vocabs = []
for line in lines:
vocabs.append(line.strip())

# instantiate a Vocabulary
vocab_bert = Vocabulary(unknown=None, padding=None)
# add the vocabs list to the Vocabulary
vocab_bert.add_word_lst(vocabs)
# build the vocabulary
vocab_bert.build_vocab()
# update the token strings for unknown and padding
vocab_bert.unknown = '[UNK]'
vocab_bert.padding = '[PAD]'

# In[25]:

# index the sentences with the vocabulary
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
train_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']],
new_field_name='hypothesis')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['premise']], new_field_name='premise')
dev_data_2.apply(lambda x: [vocab_bert.to_index(word) for word in x['hypothesis']], new_field_name='hypothesis')
train_data_2[-1], dev_data_2[-1]

# step 1: load model settings from a config file (optional)
from fastNLP.io.config_io import ConfigSection, ConfigLoader
args = ConfigSection()
ConfigLoader().load_config("./data/config", {"esim_model": args})
args["vocab_size"] = len(vocab)
args.data

# In[27]:

# step 2: build the ESIM model
from fastNLP.models import ESIM
model = ESIM(**args.data)
model

# In[28]:

# another example: build the CNN text classification model
from fastNLP.models import CNNText
cnn_text_model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
cnn_text_model

from fastNLP import CrossEntropyLoss
from fastNLP import Adam
from fastNLP import AccuracyMetric
trainer = Trainer(
train_data=train_data,
model=model,
loss=CrossEntropyLoss(pred='pred', target='label'),
metrics=AccuracyMetric(),
n_epochs=3,
batch_size=16,
print_every=-1,
validate_every=-1,
dev_data=dev_data,
use_cuda=False,
optimizer=Adam(lr=1e-3, weight_decay=0),
check_code_level=-1,
metric_key='acc',
use_tqdm=False,
)
trainer.train()

tester = Tester(
data=test_data,
model=model,
metrics=AccuracyMetric(),
batch_size=args["batch_size"],
)
tester.test()

os.chdir("../..")
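
For reference, the 10-minute tutorial above condenses into one short script. The sketch below reuses only APIs and argument names that already appear in the tutorial (same sample csv, same CNNText and Trainer arguments); it is a recap, not an additional test:

from fastNLP import DataSet, Vocabulary, Trainer, CrossEntropyLoss, AccuracyMetric
from fastNLP.models import CNNText

dataset = DataSet.read_csv("tutorials/sample_data/tutorial_sample_dataset.csv",
                           headers=('raw_sentence', 'label'), sep='\t')
dataset.apply(lambda x: x['raw_sentence'].lower().split(), new_field_name='words')
dataset.apply(lambda x: int(x['label']), new_field_name='label')
dataset.set_input('words')
dataset.set_target('label')
test_data, train_data = dataset.split(0.5)

vocab = Vocabulary(min_freq=2)
train_data.apply(lambda x: [vocab.add(w) for w in x['words']])
vocab.build_vocab()
for ds in (train_data, test_data):
    ds.apply(lambda x: [vocab.to_index(w) for w in x['words']], new_field_name='words')
    ds.rename_field('words', 'word_seq')      # match CNNText's forward() parameter
    ds.rename_field('label', 'label_seq')

model = CNNText(embed_num=len(vocab), embed_dim=50, num_classes=5, padding=2, dropout=0.1)
Trainer(model=model, train_data=train_data, dev_data=test_data,
        loss=CrossEntropyLoss(pred="output", target="label_seq"),
        metrics=AccuracyMetric(pred="predict", target="label_seq"),
        save_path=None, batch_size=32, n_epochs=5).train()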

+ 370
- 0
tutorials/fastNLP_padding_tutorial.ipynb

@@ -0,0 +1,370 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/yh/miniconda2/envs/python3/lib/python3.6/site-packages/tqdm/autonotebook/__init__.py:14: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console)\n",
" \" (e.g. in jupyter console)\", TqdmExperimentalWarning)\n"
]
},
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list})"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 假设有以下的DataSet, 这里只是为了举例所以只选择了两个sample\n",
"import sys\n",
"import os\n",
"sys.path.append('/Users/yh/Desktop/fastNLP/fastNLP')\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Instance\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet()\n",
"dataset.append(Instance(raw_sent='This is a bad idea .', label=0))\n",
"dataset.append(Instance(raw_sent='It is great .', label=1))\n",
"\n",
"# 按照fastNLP_10min_tutorial.ipynb的步骤,对数据进行一些处理。这里为了演示padding操作,把field的名称做了一些改变\n",
"dataset.apply(lambda x:x['raw_sent'].lower(), new_field_name='raw_sent')\n",
"dataset.apply(lambda x:x['raw_sent'].split(), new_field_name='word_str_lst')\n",
"\n",
"# 建立Vocabulary\n",
"word_vocab = Vocabulary()\n",
"dataset.apply(lambda x:word_vocab.update(x['word_str_lst']))\n",
"dataset.apply(lambda x:[word_vocab.to_index(word) for word in x['word_str_lst']], new_field_name='words')\n",
"\n",
"# 检查以下是否得到我们想要的结果了\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[4, 2, 5, 6, 7, 3],\n",
" [8, 2, 9, 3, 0, 0]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\"\\n结果中\\n Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\\n padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\\n 两个维度的padding,不容易自动判断padding的形式。\\n'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 将field设置为input或者target\n",
"dataset.set_input('word_str_lst')\n",
"dataset.set_input('words')\n",
"dataset.set_target('label')\n",
"\n",
"# 使用Batch取出batch数据\n",
"from fastNLP.core.batch import Batch\n",
"from fastNLP.core.sampler import RandomSampler\n",
"\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\"\"\"\"\n",
"结果中\n",
" Batch会对元素类型(元素即最内层的数据,raw_sent为str,word_str_lst为str,words为int, label为int)为int或者float的数据进行默认\n",
" padding,而非int或float的则不进行padding。但若每个Instance中该field为二维数据,也不进行padding。因为二维数据的padding涉及到\n",
" 两个维度的padding,不容易自动判断padding的形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['it', 'is', 'great', '.']),\n",
" list(['this', 'is', 'a', 'bad', 'idea', '.'])], dtype=object), 'words': tensor([[ 8, 2, 9, 3, -100, -100],\n",
" [ 4, 2, 5, 6, 7, 3]])}\n",
"batch_y has: {'label': tensor([1, 0])}\n"
]
}
],
"source": [
"# 所有的pad_val都默认为0,如果需要修改某一个field的默认pad值,可以通过DataSet.set_pad_val(field_name, pad_val)进行修改\n",
"# 若需要将word的padding修改为-100\n",
"dataset.set_pad_val('words', pad_val=-100)\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"# pad的值修改为-100了"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"DataSet({'raw_sent': this is a bad idea . type=str,\n",
"'label': 0 type=int,\n",
"'word_str_lst': ['this', 'is', 'a', 'bad', 'idea', '.'] type=list,\n",
"'words': [4, 2, 5, 6, 7, 3] type=list,\n",
"'char_str_lst': [['t', 'h', 'i', 's'], ['i', 's'], ['a'], ['b', 'a', 'd'], ['i', 'd', 'e', 'a'], ['.']] type=list,\n",
"'chars': [[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]] type=list},\n",
"{'raw_sent': it is great . type=str,\n",
"'label': 1 type=int,\n",
"'word_str_lst': ['it', 'is', 'great', '.'] type=list,\n",
"'words': [8, 2, 9, 3] type=list,\n",
"'char_str_lst': [['i', 't'], ['i', 's'], ['g', 'r', 'e', 'a', 't'], ['.']] type=list,\n",
"'chars': [[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]] type=list})"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若需要使用二维padding或指定padding方式,可以通过设置该field的padder实现,下面以英文的character padding为例。在某些场景下,可能想要\n",
"# 使用英文word的character作为特征,character的padding为二维padding,fastNLP默认只会进行一维padding。\n",
"\n",
"dataset.apply(lambda x: [[c for c in word] for word in x['word_str_lst']], new_field_name='char_str_lst')\n",
"char_vocab = Vocabulary()\n",
"dataset.apply(lambda x:[char_vocab.update(chars) for chars in x['char_str_lst']])\n",
"dataset.apply(lambda x:[[char_vocab.to_index(c) for c in chars] for chars in x['char_str_lst']],new_field_name='chars')\n",
"dataset[:2]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': array([list([[4, 9, 2, 5], [2, 5], [3], [10, 3, 6], [2, 6, 7, 3], [8]]),\n",
" list([[2, 4], [2, 5], [11, 12, 7, 3, 4], [8]])], dtype=object)}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\\n'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果不针对二维的character指定padding方法\n",
"dataset.set_input('chars')\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" 其它field与之前的是相同的。chars因为存在两个维度需要padding,不能自动决定padding方式,所以直接输出了原始形式。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n chars被正确padding了\\n'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 若要使用二维padding,需要手动设置padding方式\n",
"from fastNLP.core.fieldarray import EngChar2DPadder\n",
"dataset.set_padder('chars', EngChar2DPadder())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
" \n",
"\"\"\"\n",
" chars被正确padding了\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"batch_x has: {'raw_sent': ['this is a bad idea .', 'it is great . '], 'word_str_lst': array([list(['this', 'is', 'a', 'bad', 'idea', '.']),\n",
" list(['it', 'is', 'great', '.'])], dtype=object), 'words': tensor([[ 4, 2, 5, 6, 7, 3],\n",
" [ 8, 2, 9, 3, -100, -100]]), 'chars': tensor([[[ 4, 9, 2, 5],\n",
" [ 2, 5, 0, 0],\n",
" [ 3, 0, 0, 0],\n",
" [10, 3, 6, 0],\n",
" [ 2, 6, 7, 3],\n",
" [ 8, 0, 0, 0]],\n",
"\n",
" [[ 2, 4, 0, 0],\n",
" [ 2, 5, 0, 0],\n",
" [11, 12, 7, 3],\n",
" [ 8, 0, 0, 0],\n",
" [ 0, 0, 0, 0],\n",
" [ 0, 0, 0, 0]]])}\n",
"batch_y has: {'label': tensor([0, 1])}\n"
]
},
{
"data": {
"text/plain": [
"'\\n raw_sent正确输出,对应内容也进行了pad。\\n'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# 如果AutoPad与EngChar2DPadder不能满足需要,可以自己实现Padder对象。这里举一个例子,比如需要把raw_sentence pad到一样长\n",
"from fastNLP.core.fieldarray import PadderBase\n",
"\n",
"class PadStr(PadderBase):\n",
" def __init__(self, pad_val=' '):\n",
" super().__init__(pad_val=pad_val) #让父类管理pad_val的值,这样可以通过DataSet.set_pad_val()修改到该值\n",
" \n",
" def __call__(self, contents, field_name, field_ele_dtype):\n",
" \"\"\"\n",
" 如果以上面的例子举例,在raw_sent这个field进行pad时,传入的\n",
" contents:\n",
" [\n",
" 'This is a bad idea .',\n",
" 'It is great .'\n",
" ]\n",
" field_name: 'raw_sent',当前field的名称,主要用于帮助debug。\n",
" field_ele_dtype: np.str. 这个参数基本都用不上,是该field中内部元素的类型\n",
" \"\"\"\n",
" max_len = max([len(str_) for str_ in contents])\n",
" pad_strs = []\n",
" for content in contents:\n",
" pad_strs.append(content + (max_len-len(content))*self.pad_val)\n",
" return pad_strs\n",
"\n",
"dataset.set_input('raw_sent')\n",
"dataset.set_padder('raw_sent', PadStr())\n",
"batch_iterator = Batch(dataset=dataset, batch_size=2, sampler=RandomSampler())\n",
"for batch_x, batch_y in batch_iterator:\n",
" print(\"batch_x has: \", batch_x)\n",
" print(\"batch_y has: \", batch_y)\n",
"\n",
"\"\"\"\n",
" raw_sent正确输出,对应内容也进行了pad。\n",
"\"\"\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

+ 97
- 0
tutorials/fastnlp_test_tutorial.ipynb

@@ -0,0 +1,97 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## fastNLP testing guide\n",
"### Test environment\n",
"fastNLP uses pytest for unit testing; the test code lives under the test folder and the data the tests need under test/data_for_tests\n",
"The tests mainly follow four steps: prepare the data, run the code, compare the results, and clean up the environment\n",
"Test files are named test_xxx.py; taking DataSet as an example, its tests live in test_dataset.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"import unittest # 单元测试需要用到unittest\n",
"\n",
"from fastNLP.core.dataset import DataSet\n",
"from fastNLP.core.fieldarray import FieldArray\n",
"from fastNLP.core.instance import Instance\n",
"# 在这个单元测试文件中,需要测试DataSet、FieldArray、以及Instance"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class TestDataSet(unittest.TestCase): # 类名字以Test打头,继承unittest.TestCase\n",
"\n",
" def test_init_v1(self): # 测试样例1, 函数名称以test_打头\n",
" # 该测试样例测试的是DataSet的初始化\n",
" ins = Instance(x=[1, 2, 3, 4], y=[5, 6]) # 准备数据\n",
" ds = DataSet([ins] * 40) # 执行测试(调用DataSet的初始化函数)\n",
" self.assertTrue(\"x\" in ds.field_arrays and \"y\" in ds.field_arrays) # 比对结果:'x'跟'y'都是ds的field\n",
" self.assertEqual(ds.field_arrays[\"x\"].content, [[1, 2, 3, 4], ] * 40) # 比对结果: field 'x'的内容正确\n",
" self.assertEqual(ds.field_arrays[\"y\"].content, [[5, 6], ] * 40) # 比对结果: field 'y'的内容正确\n",
" \n",
" def test_init_v2(self): # 测试样例2,该样例测试DataSet的另一种初始化方式\n",
" ds = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 40})\n",
" self.assertTrue(\"x\" in ds.field_arrays and \"y\" in ds.field_arrays)\n",
" self.assertEqual(ds.field_arrays[\"x\"].content, [[1, 2, 3, 4], ] * 40)\n",
" self.assertEqual(ds.field_arrays[\"y\"].content, [[5, 6], ] * 40)\n",
" \n",
" def test_init_assert(self): # 测试样例3,该样例测试不规范初始化DataSet时是否会报正确错误\n",
" with self.assertRaises(AssertionError):\n",
" _ = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 100})\n",
" with self.assertRaises(AssertionError):\n",
" _ = DataSet([[1, 2, 3, 4]] * 10)\n",
" with self.assertRaises(ValueError):\n",
" _ = DataSet(0.00001)\n",
" \n",
" def test_contains(self): # 测试样例4,该样例测试DataSet的contains函数,是功能测试\n",
" ds = DataSet({\"x\": [[1, 2, 3, 4]] * 40, \"y\": [[5, 6]] * 40})\n",
" self.assertTrue(\"x\" in ds)\n",
" self.assertTrue(\"y\" in ds)\n",
" self.assertFalse(\"z\" in ds)\n",
" \n",
" # 更多测试样例见test/core/test_dataset.py"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
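
The notebook above describes the project's unit-test conventions: pytest/unittest, files named test_xxx.py, classes named TestXxx, methods prefixed with test_, and the prepare / run / compare / clean-up steps. A minimal template following those conventions; my_tokenize is a hypothetical function used only to illustrate the structure:

import unittest

def my_tokenize(sentence):
    return sentence.split()

class TestMyTokenize(unittest.TestCase):
    def test_basic(self):
        sentence = "this is a test"                              # 1. prepare the data
        tokens = my_tokenize(sentence)                           # 2. run the code under test
        self.assertEqual(tokens, ["this", "is", "a", "test"])    # 3. compare the results
        # 4. nothing to clean up for a pure function

if __name__ == "__main__":
    unittest.main()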
