
update tutorial-23 lxr 220518

tags/v1.0.0alpha
lxr-tech 3 years ago
parent commit 0998b0a3ca
3 changed files with 329 additions and 108 deletions
  1. tutorials/fastnlp_tutorial_2.ipynb  +142 -17
  2. tutorials/fastnlp_tutorial_3.ipynb  +128 -91
  3. tutorials/fastnlp_tutorial_4.ipynb  +59 -0

+142 -17  tutorials/fastnlp_tutorial_2.ipynb

@@ -15,8 +15,8 @@
"    2.1   PreTrainedTokenizer 的概念\n",
"\n",
"    2.2   BertTokenizer 的基本使用\n",
" \n",
"    2.3   补充:GloVe 词嵌入的使用"
"<!-- \n",
"&emsp; &emsp; 2.3 &ensp; 补充:GloVe 词嵌入的使用 -->"
]
},
{
@@ -83,18 +83,18 @@
"+------------------------------------------+----------+\n",
"| text | label |\n",
"+------------------------------------------+----------+\n",
"| ['this', 'quiet', ',', 'introspective... | positive |\n",
"| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
"| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
"| ['a', 'series', 'of', 'escapades', 'd... | negative |\n",
"| ['this', 'quiet', ',', 'introspective... | positive |\n",
"| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
"| ['the', 'importance', 'of', 'being', ... | neutral |\n",
"+------------------------------------------+----------+\n",
"+------------------------------------------+----------+\n",
"| text | label |\n",
"+------------------------------------------+----------+\n",
"| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
"| ['the', 'importance', 'of', 'being', ... | neutral |\n",
"| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
"| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
"+------------------------------------------+----------+\n",
"{'<pad>': 0, '<unk>': 1, 'positive': 2, 'neutral': 3, 'negative': 4}\n"
"{'<pad>': 0, '<unk>': 1, 'negative': 2, 'positive': 3, 'neutral': 4}\n"
]
}
],
@@ -261,10 +261,10 @@
"+------------------------------+----------+-----+\n",
"| text | label | len |\n",
"+------------------------------+----------+-----+\n",
"| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
"| ['a', 'comedy-drama', 'of... | positive | 19 |\n",
"| ['a', 'positively', 'thri... | neutral | 26 |\n",
"| ['a', 'series', 'of', 'es... | negative | 37 |\n",
"| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
"| ['even', 'fans', 'of', 'i... | negative | 21 |\n",
"| ['the', 'importance', 'of... | neutral | 20 |\n",
"+------------------------------+----------+-----+\n"
]
}
@@ -282,7 +282,11 @@
"\n",
"### 2.1 PreTrainTokenizer 的提出\n",
"\n",
"为什么要放弃传统的GloVe词嵌入?\n",
"*词嵌入是什么,为什么不用了*\n",
"\n",
"*什么是字节对编码,BPE的提出*\n",
"\n",
"*以BERT模型为例,WordPiece的提出*\n",
"\n",
"在`fastNLP 0.8`中,**使用`PreTrainedTokenizer`模块来为数据集中的词语进行词向量的标注**\n",
"\n",
@@ -686,10 +690,10 @@
"+------------------+----------+-----+------------------+--------------------+--------------------+\n",
"| text | label | len | input_ids | token_type_ids | attention_mask |\n",
"+------------------+----------+-----+------------------+--------------------+--------------------+\n",
"| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['a', 'comedy... | positive | 19 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['a', 'positi... | neutral | 26 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['a', 'series... | negative | 37 | [101, 1037, 2... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['even', 'fan... | negative | 21 | [101, 2130, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"| ['the', 'impo... | neutral | 20 | [101, 1996, 5... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
"+------------------+----------+-----+------------------+--------------------+--------------------+\n"
]
}
@@ -713,9 +717,130 @@
}
},
"source": [
"### 2.3 补充:GloVe 词嵌入的使用\n",
"经过`tokenizer`的处理,原始数据集中的文本被替换为词素编号列表,此时,调用`databundle`模块的\n",
"\n",
"&emsp; **`set_pad`函数**,**将`databundle`的补零符编号`pad_val`和`tokenizer`补零符编号`pad_token_id`统一**\n",
"\n",
"&emsp; 该函数同时将`databundle`的`'input_ids'`字段添加到对应数据集的`collator`中(见`tutorial 3.`"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{}\n",
"{}\n",
"{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n",
"{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n"
]
}
],
"source": [
"print(data_bundle.get_dataset('train').collator.input_fields)\n",
"print(data_bundle.get_dataset('test').collator.input_fields)\n",
"data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
"print(data_bundle.get_dataset('train').collator.input_fields)\n",
"print(data_bundle.get_dataset('test').collator.input_fields)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"最后,使用`from_dataset`、`index_dataset`和`iter_datasets`方法,为处理数据集的`'label'`字段编码\n",
"\n",
"&emsp; 接着**通过`set_ignore`函数**,**指定`databundle`的部分字段**,如`'text'`等,**在划分`batch`时不再出现**"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
"| text | label | len | input_ids | token_type_ids | attention_mask | target |\n",
"+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
"| ['a', 'seri... | negative | 37 | [101, 1037,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
"| ['this', 'q... | positive | 11 | [101, 2023,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 1 |\n",
"| ['even', 'f... | negative | 21 | [101, 2130,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
"| ['the', 'im... | neutral | 20 | [101, 1996,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 2 |\n",
"+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n"
]
}
],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
"target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
" new_field_name='target')\n",
"\n",
"data_bundle.set_ignore('text', 'len', 'label') \n",
"print(data_bundle.datasets['train'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"以上就是使用`dataset`、`vocabulary`、`databundle`和`tokenizer`实现输入文本数据的读取\n",
"\n",
"&emsp; 分词标注、序列化的全部预处理过程,通过下方的代码梳理,相信你会有更详细的了解\n",
"\n",
"```python\n",
"# 首先,导入预训练的 BertTokenizer,这里使用 'bert-base-uncased' 版本\n",
"tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
"\n",
"# 接着,导入数据,先生成为 dataset 形式,再变成 dataset-dict,并转为 databundle 形式\n",
"datasets = DataSet.from_pandas(pd.read_csv('./data/test4dataset.tsv'))\n",
"train_ds, test_ds = datasets.split(ratio=0.7)\n",
"data_bundle = DataBundle(datasets={'train': train_ds, 'test': test_ds})\n",
"\n",
"# 然后,通过 tokenizer.encode_plus 函数,进行文本分词标注、修改并补充数据包内容\n",
"encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
" return_attention_mask=True)\n",
"data_bundle.apply_field_more(encode, field_name='text', progress_bar='tqdm')\n",
"\n",
"# 在修改好 'text' 字段的文本信息后,接着处理 'label' 字段的预测信息\n",
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
"target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
" new_field_name='target')\n",
"\n",
"# 最后,通过 data_bundle 的其他一些函数,完成善后内容\n",
"data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
"data_bundle.set_ignore('label', 'text') \n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"<!-- ### 2.3 补充:GloVe 词嵌入的使用\n",
"\n",
"如何使用传统的GloVe词嵌入\n",
"\n",
"from utils import get_from_cache\n",
"\n",
"filepath = get_from_cache(\"http://download.fastnlp.top/embedding/glove.6B.50d.zip\") -->\n",
"\n",
"在接下来的`tutorial 3.`中,将会介绍`fastNLP v0.8`中的`dataloader`模块,会涉及本章中\n",
"\n",
"如何使用传统的GloVe词嵌入"
"&emsp; 提到的`collator`模块,`fastNLP`的多框架适应以及完整的数据加载过程,敬请期待"
]
},
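The commented-out GloVe block above stops at downloading the archive. A minimal sketch of what using it could look like, assuming the standard `glove.6B.50d.txt` plain-text format inside the zip (the `load_glove` helper here is hypothetical, not part of fastNLP):

```python
import zipfile
import numpy as np

def load_glove(zip_path, member='glove.6B.50d.txt'):
    """Parse GloVe's plain-text format (a token followed by its float
    components, one entry per line) into a {token: vector} dict."""
    embeddings = {}
    with zipfile.ZipFile(zip_path) as zf:
        with zf.open(member) as f:
            for line in f:
                parts = line.decode('utf-8').rstrip().split(' ')
                embeddings[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings

# assuming filepath is the zip fetched via get_from_cache above
# vectors = load_glove(filepath)
# print(vectors['the'].shape)  # (50,)
```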
{


+128 -91  tutorials/fastnlp_tutorial_3.ipynb

@@ -7,33 +7,43 @@
"source": [
"# T3. dataloader 的内部结构和基本使用\n",
"\n",
"&emsp; 1 &ensp; \n",
"&emsp; 1 &ensp; fastNLP 中的 dataloader\n",
" \n",
"&emsp; &emsp; 1.1 &ensp; \n",
"&emsp; &emsp; 1.1 &ensp; dataloader 的职责描述\n",
"\n",
"&emsp; &emsp; 1.2 &ensp; \n",
"&emsp; &emsp; 1.2 &ensp; dataloader 的基本使用\n",
"\n",
"&emsp; 2 &ensp; \n",
"&emsp; 2 &ensp; fastNLP 中 dataloader 的延伸\n",
"\n",
"&emsp; &emsp; 2.1 &ensp; \n",
"&emsp; &emsp; 2.1 &ensp; collator 的概念与使用\n",
"\n",
"&emsp; &emsp; 2.2 &ensp; \n",
"&emsp; &emsp; 2.2 &ensp; sampler 的概念与使用"
]
},
{
"cell_type": "markdown",
"id": "85857115",
"metadata": {},
"source": [
"## 1. fastNLP 中的 dataloader\n",
"\n",
"&emsp; 3 &ensp; \n",
" \n",
"&emsp; &emsp; 3.1 &ensp; \n",
"### 1.1 dataloader 的职责描述\n",
"\n",
"在`fastNLP 0.8`中,在数据加载模块`DataLoader`之前,还存在其他的一些模块,负责例如对文本数据\n",
"\n",
"&emsp; 进行补零对齐,即 **核对器`collator`模块**,进行分词标注,即 **分词器`tokenizer`模块**\n",
"\n",
"&emsp; 本节将对`fastNLP`中的核对器`collator`等展开介绍,分词器`tokenizer`将在下一节中详细介绍\n",
"\n",
"&emsp; &emsp; 3.2 &ensp; "
"在`fastNLP 0.8`中,**核对器`collator`模块负责文本序列的补零对齐**,通过"
]
},
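Before the pipeline below, the padding idea itself can be shown in isolation. A generic illustration of what a collator does, in plain Python rather than fastNLP's `Collator` API (which section 2.1 demonstrates):

```python
def pad_batch(batch, pad_val=0):
    """Pad a batch of variable-length id lists to the longest list,
    which is essentially the collator's padding/alignment job."""
    max_len = max(len(ids) for ids in batch)
    return [ids + [pad_val] * (max_len - len(ids)) for ids in batch]

print(pad_batch([[101, 2023, 102], [101, 102]]))
# [[101, 2023, 102], [101, 102, 0]]
```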
{
"cell_type": "markdown",
"id": "d74d0523",
"id": "eb8fb51c",
"metadata": {},
"source": [
"## 3. fastNLP 中的 dataloader\n",
"\n",
"### 3.1 collator 的概念与使用\n",
"### 1.2 dataloader 的基本使用\n",
"\n",
"在`fastNLP 0.8`中,在数据加载模块`DataLoader`之前,还存在其他的一些模块,负责例如对文本数据\n",
"\n",
@@ -55,15 +65,44 @@
},
"outputs": [],
"source": [
"from fastNLP import Collator\n",
"import pandas as pd\n",
"from functools import partial\n",
"from fastNLP.transformers.torch import BertTokenizer\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"from fastNLP.io import DataBundle\n",
"\n",
"\n",
"class PipeDemo:\n",
" def __init__(self, tokenizer='bert-base-uncased', num_proc=1):\n",
" self.tokenizer = BertTokenizer.from_pretrained(tokenizer)\n",
" self.num_proc = num_proc\n",
"\n",
" def process_from_file(self, path='./data/test4dataset.tsv'):\n",
" datasets = DataSet.from_pandas(pd.read_csv(path))\n",
" train_ds, test_ds = datasets.split(ratio=0.7)\n",
" train_ds, dev_ds = train_ds.split(ratio=0.8)\n",
" data_bundle = DataBundle(datasets={'train': train_ds, 'dev': dev_ds, 'test': test_ds})\n",
"\n",
"collator = Collator()\n",
"# collator.set_pad(field_name='text', pad_val='<pad>')"
" encode = partial(self.tokenizer.encode_plus, max_length=100, truncation=True,\n",
" return_attention_mask=True)\n",
" data_bundle.apply_field_more(encode, field_name='text', num_proc=self.num_proc)\n",
"\n",
" target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
" target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
" target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
" new_field_name='target')\n",
"\n",
" data_bundle.set_pad('input_ids', pad_val=self.tokenizer.pad_token_id)\n",
" data_bundle.set_ignore('label', 'text') \n",
" return data_bundle"
]
},
{
"cell_type": "markdown",
"id": "51bf0878",
"id": "de53bff4",
"metadata": {},
"source": [
"&emsp; "
@@ -72,21 +111,74 @@
{
"cell_type": "code",
"execution_count": null,
"id": "3fd2486f",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"id": "57a29cb9",
"metadata": {},
"outputs": [],
"source": []
"source": [
"pipe = PipeDemo(tokenizer='bert-base-uncased', num_proc=4)\n",
"\n",
"data_bundle = pipe.process_from_file('./data/test4dataset.tsv')"
]
},
{
"cell_type": "markdown",
"id": "f9bbd9a7",
"id": "226bb081",
"metadata": {},
"source": [
"### 3.2 dataloader 的结构与使用"
"&emsp; "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7827557d",
"metadata": {},
"outputs": [],
"source": [
"from fastNLP import prepare_torch_dataloader\n",
"\n",
"dl_bundle = prepare_torch_dataloader(data_bundle, batch_size=arg.batch_size)"
]
},
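Assuming the `data_bundle` produced by `PipeDemo` above, each entry of `dl_bundle` should behave like an ordinary PyTorch dataloader; a hedged sketch of inspecting one batch (field names follow the earlier `set_pad`/`set_ignore` setup):

```python
for batch in dl_bundle['train']:
    # after set_ignore('label', 'text'), a batch should carry only the
    # tokenizer outputs plus the indexed 'target' field
    print(batch['input_ids'].shape)  # (batch_size, padded_seq_len)
    print(batch['target'])
    break
```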
{
"cell_type": "markdown",
"id": "d898cf40",
"metadata": {},
"source": [
"&emsp; \n",
"\n",
"```python\n",
"trainer = Trainer(\n",
" model=model,\n",
" train_dataloader=dl_bundle['train'],\n",
" optimizers=optimizer,\n",
"\t...\n",
"\tdriver=\"torch\",\n",
"\tdevice='cuda',\n",
"\t...\n",
" evaluate_dataloaders={'dev': dl_bundle['dev'], 'test': dl_bundle['test']}, \n",
" metrics={'acc': Accuracy()},\n",
"\t...\n",
")\n",
"```"
]
},
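A short follow-up, assuming `fastNLP 0.8`'s `Trainer.run()` entry point; only a sketch of how such a configured trainer would be launched:

```python
# launch the training loop; evaluation runs on the dataloaders passed
# via evaluate_dataloaders, reporting the configured 'acc' metric
trainer.run()
```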
{
"cell_type": "markdown",
"id": "d74d0523",
"metadata": {},
"source": [
"## 2. fastNLP 中 dataloader 的延伸\n",
"\n",
"### 2.1 collator 的概念与使用\n",
"\n",
"在`fastNLP 0.8`中,在数据加载模块`DataLoader`之前,还存在其他的一些模块,负责例如对文本数据\n",
"\n",
"&emsp; 进行补零对齐,即 **核对器`collator`模块**,进行分词标注,即 **分词器`tokenizer`模块**\n",
"\n",
"&emsp; 本节将对`fastNLP`中的核对器`collator`等展开介绍,分词器`tokenizer`将在下一节中详细介绍\n",
"\n",
"在`fastNLP 0.8`中,**核对器`collator`模块负责文本序列的补零对齐**,通过"
]
},
{
@@ -138,39 +230,17 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b0c3c58d",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"dataloader.batch_sampler"
]
},
{
"cell_type": "markdown",
"id": "7ed431cc",
"id": "f9bbd9a7",
"metadata": {},
"source": [
"### 3.3 实例:NG20 的加载预处理\n",
"\n",
"在`fastNLP 0.8`中,**`Trainer`模块和`Evaluator`模块分别表示“训练器”和“评测器”**\n",
"\n",
"&emsp; 对应于之前的`fastNLP`版本中的`Trainer`模块和`Tester`模块,其定义方法如下所示\n",
"\n",
"在`fastNLP 0.8`中,需要注意,在同个`python`脚本中先使用`Trainer`训练,然后使用`Evaluator`评测\n",
"\n",
"&emsp; 非常关键的问题在于**如何正确设置二者的`driver`**。这就引入了另一个问题:什么是 `driver`?"
"### 2.2 sampler 的概念与使用"
]
},
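The `dataloader.batch_sampler` attribute inspected below is the hook this section is about. As a generic PyTorch illustration of the sampler concept (not a fastNLP-specific API): a sampler orders example indices, and a batch sampler groups them into batches.

```python
from torch.utils.data import BatchSampler, RandomSampler

data = list(range(10))
sampler = RandomSampler(data)  # yields indices in shuffled order
batch_sampler = BatchSampler(sampler, batch_size=4, drop_last=False)

for indices in batch_sampler:
    print(indices)  # e.g. [7, 2, 9, 0]
```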
{
"cell_type": "code",
"execution_count": null,
"id": "a89ef613",
"id": "b0c3c58d",
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -178,60 +248,27 @@
},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from fastNLP import DataSet\n",
"from fastNLP import Vocabulary\n",
"\n",
"dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))"
"dataloader.batch_sampler"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1624b0fa",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"cell_type": "markdown",
"id": "51bf0878",
"metadata": {},
"source": [
"from functools import partial\n",
"\n",
"encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
" return_attention_mask=True)\n",
"# 会新增 input_ids 、 attention_mask 和 token_type_ids 这三个 field\n",
"dataset.apply_field_more(encode, field_name='text')"
"&emsp; "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0991a8ee",
"id": "3fd2486f",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"target_vocab = Vocabulary(padding=None, unknown=None)\n",
"\n",
"target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
"target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
" new_field_name='labels')\n",
"# 需要将 input_ids 的 pad 值设置为 tokenizer 的 pad 值\n",
"dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
"dataset.set_ignore('label', 'text') # 因为 label 是原始的不需要的 str ,所以我们可以忽略它,让它不要在 batch 的输出中出现"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b369137f",
"metadata": {},
"outputs": [],
"source": []
}
],


+59 -0  tutorials/fastnlp_tutorial_4.ipynb

@@ -0,0 +1,59 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "fdd7ff16",
"metadata": {},
"source": [
"# T4. model 的搭建与 driver 的概念\n",
"\n",
"&emsp; 1 &ensp; fastNLP 中预训练模型的使用\n",
" \n",
"&emsp; &emsp; 1.1 &ensp; \n",
"\n",
"&emsp; &emsp; 1.2 &ensp; \n",
"\n",
"&emsp; 2 &ensp; fastNLP 中使用 Pytorch 搭建模型\n",
"\n",
"&emsp; &emsp; 2.1 &ensp; \n",
"\n",
"&emsp; &emsp; 2.2 &ensp; \n",
"\n",
"&emsp; 3 &ensp; fastNLP 中的 driver\n",
"\n",
"&emsp; &emsp; 3.1 &ensp; \n",
"\n",
"&emsp; &emsp; 3.2 &ensp; "
]
},
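The notebook body is still empty at this commit. As a preview of section 2, “building models with Pytorch in fastNLP”, a sketch of the kind of `torch.nn.Module` it will presumably build; the architecture and names here are illustrative assumptions, not the tutorial's code:

```python
import torch
import torch.nn as nn

class TextClassifier(nn.Module):
    """Illustrative only: a tiny bag-of-embeddings text classifier."""
    def __init__(self, vocab_size, embed_dim=64, num_classes=3):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids):
        mask = (input_ids != 0).unsqueeze(-1).float()  # mask out padding
        emb = self.embed(input_ids) * mask
        pooled = emb.sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        return self.fc(pooled)

model = TextClassifier(vocab_size=30522)          # bert-base-uncased vocab size
logits = model(torch.randint(1, 30522, (2, 16)))  # (2, 16) ids -> (2, 3) logits
```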
{
"cell_type": "code",
"execution_count": null,
"id": "08752c5a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
