From 0998b0a3ca2b285cd2b4813c87174f316e308394 Mon Sep 17 00:00:00 2001
From: lxr-tech <1838593642@qq.com>
Date: Wed, 18 May 2022 15:41:24 +0800
Subject: [PATCH] update tutorial-23 lxr 220518

---
 tutorials/fastnlp_tutorial_2.ipynb | 159 ++++++++++++++++++++++++---
 tutorials/fastnlp_tutorial_3.ipynb | 219 ++++++++++++++++++++++--------------
 tutorials/fastnlp_tutorial_4.ipynb |  59 ++++++++++
 3 files changed, 329 insertions(+), 108 deletions(-)
 create mode 100644 tutorials/fastnlp_tutorial_4.ipynb

diff --git a/tutorials/fastnlp_tutorial_2.ipynb b/tutorials/fastnlp_tutorial_2.ipynb
index 74a0cb49..3aa27c86 100644
--- a/tutorials/fastnlp_tutorial_2.ipynb
+++ b/tutorials/fastnlp_tutorial_2.ipynb
@@ -15,8 +15,8 @@
     "    2.1   the concept of PreTrainedTokenizer\n",
     "\n",
     "    2.2   basic usage of BertTokenizer\n",
-    " \n",
-    "    2.3   supplement: using GloVe word embeddings"
+    ""
   ]
  },
  {
@@ -83,18 +83,18 @@
     "+------------------------------------------+----------+\n",
     "| text | label |\n",
     "+------------------------------------------+----------+\n",
-    "| ['this', 'quiet', ',', 'introspective... | positive |\n",
-    "| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
-    "| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
     "| ['a', 'series', 'of', 'escapades', 'd... | negative |\n",
+    "| ['this', 'quiet', ',', 'introspective... | positive |\n",
+    "| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
+    "| ['the', 'importance', 'of', 'being', ... | neutral |\n",
     "+------------------------------------------+----------+\n",
     "+------------------------------------------+----------+\n",
     "| text | label |\n",
     "+------------------------------------------+----------+\n",
-    "| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
-    "| ['the', 'importance', 'of', 'being', ... | neutral |\n",
+    "| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
+    "| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
     "+------------------------------------------+----------+\n",
-    "{'<pad>': 0, '<unk>': 1, 'positive': 2, 'neutral': 3, 'negative': 4}\n"
+    "{'<pad>': 0, '<unk>': 1, 'negative': 2, 'positive': 3, 'neutral': 4}\n"
    ]
   }
  ],
@@ -261,10 +261,10 @@
     "+------------------------------+----------+-----+\n",
     "| text | label | len |\n",
     "+------------------------------+----------+-----+\n",
-    "| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
-    "| ['a', 'comedy-drama', 'of... | positive | 19 |\n",
-    "| ['a', 'positively', 'thri... | neutral | 26 |\n",
     "| ['a', 'series', 'of', 'es... | negative | 37 |\n",
+    "| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
+    "| ['even', 'fans', 'of', 'i... | negative | 21 |\n",
+    "| ['the', 'importance', 'of... | neutral | 20 |\n",
     "+------------------------------+----------+-----+\n"
    ]
   }
  ],
@@ -282,7 +282,11 @@
     "\n",
     "### 2.1 The introduction of PreTrainedTokenizer\n",
     "\n",
-    "Why abandon traditional GloVe word embeddings?\n",
+    "*What word embeddings are, and why they are no longer used*\n",
+    "\n",
+    "*What byte-pair encoding is: the introduction of BPE*\n",
+    "\n",
+    "*Taking the BERT model as an example: the introduction of WordPiece*\n",
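+    "\n",
+    "As a minimal illustration of WordPiece (a sketch: it assumes the bundled `BertTokenizer` mirrors the standard `transformers` API, and the exact sub-word pieces depend on the vocabulary), a rare word is split into in-vocabulary sub-word units:\n",
+    "\n",
+    "```python\n",
+    "from fastNLP.transformers.torch import BertTokenizer\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+    "# out-of-vocabulary words fall back to sub-word pieces marked with '##'\n",
+    "print(tokenizer.tokenize('unaffable'))  # e.g. ['una', '##ffa', '##ble']\n",
+    "```\n",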
     "\n",
     "In `fastNLP 0.8`, **the `PreTrainedTokenizer` module is used to annotate the words in a dataset with their token ids**\n",
     "\n",
@@ -686,10 +690,10 @@
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n",
     "| text | label | len | input_ids | token_type_ids | attention_mask |\n",
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n",
-    "| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
-    "| ['a', 'comedy... | positive | 19 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
-    "| ['a', 'positi... | neutral | 26 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
     "| ['a', 'series... | negative | 37 | [101, 1037, 2... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['even', 'fan... | negative | 21 | [101, 2130, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['the', 'impo... | neutral | 20 | [101, 1996, 5... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n"
    ]
   }
  ],
@@ -713,9 +717,130 @@
    }
   },
   "source": [
-    "### 2.3 Supplement: using GloVe word embeddings\n",
+    "After the `tokenizer` has run, the text in the original dataset has been replaced by lists of token ids; at this point, call the `databundle` module's\n",
+    "\n",
+    "  **`set_pad` function** to **unify the `databundle`'s padding id `pad_val` with the `tokenizer`'s padding id `pad_token_id`**\n",
+    "\n",
+    "  this function also adds the `databundle`'s `'input_ids'` field to the `collator` of each corresponding dataset (see `tutorial 3.`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n",
      "{}\n",
      "{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n",
      "{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n"
     ]
    }
   ],
   "source": [
    "print(data_bundle.get_dataset('train').collator.input_fields)\n",
    "print(data_bundle.get_dataset('test').collator.input_fields)\n",
    "data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "print(data_bundle.get_dataset('train').collator.input_fields)\n",
    "print(data_bundle.get_dataset('test').collator.input_fields)"
   ]
  },
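  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of what this alignment buys us, assuming a standalone `Collator` (introduced in `tutorial 3.`) may be called directly on a list of instances:\n",
    "\n",
    "```python\n",
    "from fastNLP import Collator\n",
    "\n",
    "collator = Collator()\n",
    "# align the collator's pad value with the tokenizer, exactly as above\n",
    "collator.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "batch = collator([{'input_ids': [101, 2023, 102]},\n",
    "                  {'input_ids': [101, 1037, 2186, 1997, 102]}])\n",
    "print(batch['input_ids'])  # the shorter sequence is padded with 0 up to length 5\n",
    "```"
   ]
  },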
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, use the `from_dataset`, `index_dataset` and `iter_datasets` methods to encode the `'label'` field of the processed datasets\n",
    "\n",
    "  then, **via the `set_ignore` function**, **mark certain fields of the `databundle`**, such as `'text'`, **so that they no longer appear when batches are formed**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
      "| text | label | len | input_ids | token_type_ids | attention_mask | target |\n",
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
      "| ['a', 'seri... | negative | 37 | [101, 1037,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
      "| ['this', 'q... | positive | 11 | [101, 2023,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 1 |\n",
      "| ['even', 'f... | negative | 21 | [101, 2130,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
      "| ['the', 'im... | neutral | 20 | [101, 1996,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 2 |\n",
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n"
     ]
    }
   ],
   "source": [
    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
    "\n",
    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
    "                           new_field_name='target')\n",
    "\n",
    "data_bundle.set_ignore('text', 'len', 'label') \n",
    "print(data_bundle.datasets['train'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The above covers reading input text data with `dataset`, `vocabulary`, `databundle` and `tokenizer`,\n",
    "\n",
    "  along with the full preprocessing pipeline of tokenization, tagging and serialization; the recap code below should give you a more detailed picture\n",
    "\n",
    "```python\n",
    "# First, load the pretrained BertTokenizer, using the 'bert-base-uncased' variant here\n",
    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
    "\n",
    "# Next, load the data: first as a dataset, then as a dataset-dict, wrapped into a databundle\n",
    "datasets = DataSet.from_pandas(pd.read_csv('./data/test4dataset.tsv'))\n",
    "train_ds, test_ds = datasets.split(ratio=0.7)\n",
    "data_bundle = DataBundle(datasets={'train': train_ds, 'test': test_ds})\n",
    "\n",
    "# Then, tokenize and index the text with tokenizer.encode_plus, updating and extending the bundle's fields\n",
    "encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
    "                 return_attention_mask=True)\n",
    "data_bundle.apply_field_more(encode, field_name='text', progress_bar='tqdm')\n",
    "\n",
    "# With the 'text' field processed, handle the prediction targets in the 'label' field next\n",
    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
    "                           new_field_name='target')\n",
    "\n",
    "# Finally, tidy up with a few other data_bundle helpers\n",
    "data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "data_bundle.set_ignore('label', 'text') \n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "\n",
    "\n",
    "The upcoming `tutorial 3.` introduces the `dataloader` module of `fastNLP v0.8`, and will touch on\n",
    "\n",
-    "how to use traditional GloVe word embeddings"
+    "  the `collator` module mentioned in this chapter, `fastNLP`'s multi-framework support, and the complete data-loading process; stay tuned"
   ]
  },
 {
diff --git a/tutorials/fastnlp_tutorial_3.ipynb b/tutorials/fastnlp_tutorial_3.ipynb
index 5e09c4ea..8c3c935e 100644
--- a/tutorials/fastnlp_tutorial_3.ipynb
+++ b/tutorials/fastnlp_tutorial_3.ipynb
@@ -7,33 +7,43 @@
    "# T3. The internal structure and basic usage of dataloader\n",
    "\n",
-    "  1   \n",
+    "  1   the dataloader in fastNLP\n",
    " \n",
-    "    1.1   \n",
+    "    1.1   what the dataloader is responsible for\n",
    "\n",
-    "    1.2   \n",
+    "    1.2   basic usage of the dataloader\n",
    "\n",
-    "  2   \n",
+    "  2   extensions of the dataloader in fastNLP\n",
    "\n",
-    "    2.1   \n",
+    "    2.1   the concept and usage of the collator\n",
    "\n",
-    "    2.2   \n",
-    "\n",
-    "  3   \n",
-    " \n",
-    "    3.1   \n",
-    "\n",
-    "    3.2   "
+    "    2.2   the concept and usage of the sampler"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85857115",
   "metadata": {},
   "source": [
    "## 1. The dataloader in fastNLP\n",
    "\n",
    "### 1.1 What the dataloader is responsible for\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "d74d0523",
+   "id": "eb8fb51c",
   "metadata": {},
   "source": [
-    "## 3. The dataloader in fastNLP\n",
-    "\n",
-    "### 3.1 The concept and usage of the collator\n",
+    "### 1.2 Basic usage of the dataloader\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
@@ -55,15 +65,44 @@
    }
   },
   "outputs": [],
   "source": [
-    "from fastNLP import Collator\n",
+    "import pandas as pd\n",
+    "from functools import partial\n",
+    "from fastNLP.transformers.torch import BertTokenizer\n",
+    "\n",
+    "from fastNLP import DataSet\n",
+    "from fastNLP import Vocabulary\n",
+    "from fastNLP.io import DataBundle\n",
+    "\n",
+    "\n",
+    "class PipeDemo:\n",
+    "    def __init__(self, tokenizer='bert-base-uncased', num_proc=1):\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)\n",
+    "        self.num_proc = num_proc\n",
+    "\n",
+    "    def process_from_file(self, path='./data/test4dataset.tsv'):\n",
+    "        datasets = DataSet.from_pandas(pd.read_csv(path))\n",
+    "        train_ds, test_ds = datasets.split(ratio=0.7)\n",
+    "        train_ds, dev_ds = train_ds.split(ratio=0.8)  # split the training part again, so dev does not overlap test\n",
+    "        data_bundle = DataBundle(datasets={'train': train_ds, 'dev': dev_ds, 'test': test_ds})\n",
     "\n",
-    "collator = Collator()\n",
-    "# collator.set_pad(field_name='text', pad_val='<pad>')"
+    "        encode = partial(self.tokenizer.encode_plus, max_length=100, truncation=True,\n",
+    "                         return_attention_mask=True)\n",
+    "        data_bundle.apply_field_more(encode, field_name='text', num_proc=self.num_proc)\n",
+    "\n",
+    "        target_vocab = Vocabulary(padding=None, unknown=None)\n",
+    "\n",
+    "        target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
+    "        target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
+    "                                   new_field_name='target')\n",
+    "\n",
+    "        data_bundle.set_pad('input_ids', pad_val=self.tokenizer.pad_token_id)\n",
+    "        data_bundle.set_ignore('label', 'text') \n",
+    "        return data_bundle"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de53bff4",
   "metadata": {},
   "source": [
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57a29cb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = PipeDemo(tokenizer='bert-base-uncased', num_proc=4)\n",
    "\n",
    "data_bundle = pipe.process_from_file('./data/test4dataset.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "226bb081",
   "metadata": {},
   "source": [
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7827557d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastNLP import prepare_torch_dataloader\n",
    "\n",
    "dl_bundle = prepare_torch_dataloader(data_bundle, batch_size=16)  # any concrete batch size works here"
   ]
  },
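  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (a sketch under the assumption that each batch is a dict of padded tensors keyed by field name, per the `collator` settings above):\n",
    "\n",
    "```python\n",
    "for batch in dl_bundle['train']:\n",
    "    print(batch['input_ids'].shape)  # e.g. torch.Size([16, max_len_in_batch])\n",
    "    print(batch['target'].shape)     # e.g. torch.Size([16])\n",
    "    break  # inspect one batch only\n",
    "```"
   ]
  },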
  {
   "cell_type": "markdown",
   "id": "d898cf40",
   "metadata": {},
   "source": [
    "  \n",
    "\n",
    "```python\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    train_dataloader=dl_bundle['train'],\n",
    "    optimizers=optimizer,\n",
    "    ...\n",
    "    driver=\"torch\",\n",
    "    device='cuda',\n",
    "    ...\n",
    "    evaluate_dataloaders={'dev': dl_bundle['dev'], 'test': dl_bundle['test']}, \n",
    "    metrics={'acc': Accuracy()},\n",
    "    ...\n",
    ")\n",
    "```"
   ]
  },
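  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `model` and `optimizer` above are placeholders; one hedged way to fill them in (a sketch assuming fastNLP's `train_step`/`evaluate_step` convention, with a hypothetical minimal classifier; building models properly is the topic of `tutorial 4.`):\n",
    "\n",
    "```python\n",
    "import torch\n",
    "from torch import nn\n",
    "\n",
    "class TinyClassifier(nn.Module):  # hypothetical stand-in model\n",
    "    def __init__(self, vocab_size=30522, num_labels=3):\n",
    "        super().__init__()\n",
    "        self.embed = nn.Embedding(vocab_size, 64)\n",
    "        self.lstm = nn.LSTM(64, 64, batch_first=True)\n",
    "        self.fc = nn.Linear(64, num_labels)\n",
    "        self.loss_fn = nn.CrossEntropyLoss()\n",
    "\n",
    "    def train_step(self, input_ids, target):\n",
    "        logits = self.fc(self.lstm(self.embed(input_ids))[0][:, -1])\n",
    "        return {'loss': self.loss_fn(logits, target)}\n",
    "\n",
    "    def evaluate_step(self, input_ids, target):\n",
    "        logits = self.fc(self.lstm(self.embed(input_ids))[0][:, -1])\n",
    "        return {'pred': logits.argmax(dim=-1), 'target': target}\n",
    "\n",
    "model = TinyClassifier()\n",
    "optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)\n",
    "```"
   ]
  },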
  {
   "cell_type": "markdown",
   "id": "d74d0523",
   "metadata": {},
   "source": [
    "## 2. Extensions of the dataloader in fastNLP\n",
    "\n",
    "### 2.1 The concept and usage of the collator\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9bbd9a7",
   "metadata": {},
   "source": [
-    "### 3.2 The structure and usage of the dataloader"
+    "### 2.2 The concept and usage of the sampler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0c3c58d",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "dataloader.batch_sampler"
   ]
  },
  {
-   "cell_type": "markdown",
-   "id": "7ed431cc",
-   "metadata": {},
-   "source": [
-    "### 3.3 Example: loading and preprocessing NG20\n",
-    "\n",
-    "In `fastNLP 0.8`, **the `Trainer` and `Evaluator` modules represent the “trainer” and the “evaluator”**,\n",
-    "\n",
-    "  corresponding to the `Trainer` and `Tester` modules in earlier versions of `fastNLP`; they are defined as shown below\n",
-    "\n",
-    "Note that in `fastNLP 0.8`, when training with `Trainer` first and then evaluating with `Evaluator` in the same `python` script,\n",
-    "\n",
-    "  the crucial question is **how to set the `driver` of the two correctly**, which raises another question: what is a `driver`?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a89ef613",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "from fastNLP import DataSet\n",
-    "from fastNLP import Vocabulary\n",
-    "\n",
-    "dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1624b0fa",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from functools import partial\n",
-    "\n",
-    "encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
-    "                 return_attention_mask=True)\n",
-    "# adds the three fields input_ids, attention_mask and token_type_ids\n",
-    "dataset.apply_field_more(encode, field_name='text')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0991a8ee",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
-    "\n",
-    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
-    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
-    "                           new_field_name='labels')\n",
-    "# the pad value of input_ids must be set to the tokenizer's pad value\n",
-    "dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
-    "dataset.set_ignore('label', 'text')  # label is the raw string we no longer need, so keep it out of the batch output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b369137f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "cell_type": "markdown",
+   "id": "51bf0878",
+   "metadata": {},
+   "source": [
+    "  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fd2486f",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": []
   }
  ],
diff --git a/tutorials/fastnlp_tutorial_4.ipynb b/tutorials/fastnlp_tutorial_4.ipynb
new file mode 100644
index 00000000..532118b0
--- /dev/null
+++ b/tutorials/fastnlp_tutorial_4.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fdd7ff16",
+   "metadata": {},
+   "source": [
+    "# T4. Building the model and the concept of the driver\n",
+    "\n",
+    "  1   using pretrained models in fastNLP\n",
+    " \n",
+    "    1.1   \n",
+    "\n",
+    "    1.2   \n",
+    "\n",
+    "  2   building models with Pytorch in fastNLP\n",
+    "\n",
+    "    2.1   \n",
+    "\n",
+    "    2.2   \n",
+    "\n",
+    "  3   the driver in fastNLP\n",
+    "\n",
+    "    3.1   \n",
+    "\n",
+    "    3.2   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08752c5a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}