From 0998b0a3ca2b285cd2b4813c87174f316e308394 Mon Sep 17 00:00:00 2001
From: lxr-tech <1838593642@qq.com>
Date: Wed, 18 May 2022 15:41:24 +0800
Subject: [PATCH] update tutorial-23 lxr 220518

---
 tutorials/fastnlp_tutorial_2.ipynb | 159 ++++++++++++++++++++++++---
 tutorials/fastnlp_tutorial_3.ipynb | 219 ++++++++++++++++++++++--------------
 tutorials/fastnlp_tutorial_4.ipynb |  59 ++++++++++
 3 files changed, 329 insertions(+), 108 deletions(-)
 create mode 100644 tutorials/fastnlp_tutorial_4.ipynb

diff --git a/tutorials/fastnlp_tutorial_2.ipynb b/tutorials/fastnlp_tutorial_2.ipynb
index 74a0cb49..3aa27c86 100644
--- a/tutorials/fastnlp_tutorial_2.ipynb
+++ b/tutorials/fastnlp_tutorial_2.ipynb
@@ -15,8 +15,8 @@
     "    2.1   the concept of PreTrainedTokenizer\n",
     "\n",
     "    2.2   basic usage of BertTokenizer\n",
-    " \n",
-    "    2.3   supplement: using GloVe word embeddings"
+    ""
   ]
  },
  {
@@ -83,18 +83,18 @@
     "+------------------------------------------+----------+\n",
     "| text | label |\n",
     "+------------------------------------------+----------+\n",
-    "| ['this', 'quiet', ',', 'introspective... | positive |\n",
-    "| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
-    "| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
     "| ['a', 'series', 'of', 'escapades', 'd... | negative |\n",
+    "| ['this', 'quiet', ',', 'introspective... | positive |\n",
+    "| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
+    "| ['the', 'importance', 'of', 'being', ... | neutral |\n",
     "+------------------------------------------+----------+\n",
     "+------------------------------------------+----------+\n",
     "| text | label |\n",
     "+------------------------------------------+----------+\n",
-    "| ['even', 'fans', 'of', 'ismail', 'mer... | negative |\n",
-    "| ['the', 'importance', 'of', 'being', ... | neutral |\n",
+    "| ['a', 'comedy-drama', 'of', 'nearly',... | positive |\n",
+    "| ['a', 'positively', 'thrilling', 'com... | neutral |\n",
     "+------------------------------------------+----------+\n",
-    "{'<pad>': 0, '<unk>': 1, 'positive': 2, 'neutral': 3, 'negative': 4}\n"
+    "{'<pad>': 0, '<unk>': 1, 'negative': 2, 'positive': 3, 'neutral': 4}\n"
    ]
   }
  ],
@@ -261,10 +261,10 @@
     "+------------------------------+----------+-----+\n",
     "| text | label | len |\n",
     "+------------------------------+----------+-----+\n",
-    "| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
-    "| ['a', 'comedy-drama', 'of... | positive | 19 |\n",
-    "| ['a', 'positively', 'thri... | neutral | 26 |\n",
     "| ['a', 'series', 'of', 'es... | negative | 37 |\n",
+    "| ['this', 'quiet', ',', 'i... | positive | 11 |\n",
+    "| ['even', 'fans', 'of', 'i... | negative | 21 |\n",
+    "| ['the', 'importance', 'of... | neutral | 20 |\n",
     "+------------------------------+----------+-----+\n"
    ]
   }
  ],
@@ -282,7 +282,11 @@
     "\n",
     "### 2.1 The introduction of PreTrainedTokenizer\n",
     "\n",
-    "Why abandon traditional GloVe word embeddings?\n",
+    "*What word embeddings are, and why they are no longer used*\n",
+    "\n",
+    "*What byte-pair encoding is: the introduction of BPE*\n",
+    "\n",
+    "*Taking the BERT model as an example: the introduction of WordPiece*\n",
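+    "\n",
+    "As a minimal illustration of WordPiece (a sketch: it assumes the bundled `BertTokenizer` mirrors the standard `transformers` API, and the exact sub-word pieces depend on the vocabulary), a rare word is split into in-vocabulary sub-word units:\n",
+    "\n",
+    "```python\n",
+    "from fastNLP.transformers.torch import BertTokenizer\n",
+    "\n",
+    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
+    "# out-of-vocabulary words fall back to sub-word pieces marked with '##'\n",
+    "print(tokenizer.tokenize('unaffable'))  # e.g. ['una', '##ffa', '##ble']\n",
+    "```\n",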
     "\n",
     "In `fastNLP 0.8`, **the `PreTrainedTokenizer` module is used to annotate the words in a dataset with their token ids**\n",
     "\n",
@@ -686,10 +690,10 @@
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n",
     "| text | label | len | input_ids | token_type_ids | attention_mask |\n",
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n",
-    "| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
-    "| ['a', 'comedy... | positive | 19 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
-    "| ['a', 'positi... | neutral | 26 | [101, 1037, 1... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
     "| ['a', 'series... | negative | 37 | [101, 1037, 2... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['this', 'qui... | positive | 11 | [101, 2023, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['even', 'fan... | negative | 21 | [101, 2130, 4... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
+    "| ['the', 'impo... | neutral | 20 | [101, 1996, 5... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... |\n",
     "+------------------+----------+-----+------------------+--------------------+--------------------+\n"
    ]
   }
  ],
@@ -713,9 +717,130 @@
    }
   },
   "source": [
-    "### 2.3 Supplement: using GloVe word embeddings\n",
+    "After the `tokenizer` has run, the text in the original dataset has been replaced by lists of token ids; at this point, call the `databundle` module's\n",
+    "\n",
+    "  **`set_pad` function** to **unify the `databundle`'s padding id `pad_val` with the `tokenizer`'s padding id `pad_token_id`**\n",
+    "\n",
+    "  this function also adds the `databundle`'s `'input_ids'` field to the `collator` of each corresponding dataset (see `tutorial 3.`)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{}\n",
      "{}\n",
      "{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n",
      "{'input_ids': {'pad_val': 0, 'dtype': None, 'backend': 'auto', 'pad_fn': None}}\n"
     ]
    }
   ],
   "source": [
    "print(data_bundle.get_dataset('train').collator.input_fields)\n",
    "print(data_bundle.get_dataset('test').collator.input_fields)\n",
    "data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "print(data_bundle.get_dataset('train').collator.input_fields)\n",
    "print(data_bundle.get_dataset('test').collator.input_fields)"
   ]
  },
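  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal sketch of what this alignment buys us, assuming a standalone `Collator` (introduced in `tutorial 3.`) may be called directly on a list of instances:\n",
    "\n",
    "```python\n",
    "from fastNLP import Collator\n",
    "\n",
    "collator = Collator()\n",
    "# align the collator's pad value with the tokenizer, exactly as above\n",
    "collator.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "batch = collator([{'input_ids': [101, 2023, 102]},\n",
    "                  {'input_ids': [101, 1037, 2186, 1997, 102]}])\n",
    "print(batch['input_ids'])  # the shorter sequence is padded with 0 up to length 5\n",
    "```"
   ]
  },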
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, use the `from_dataset`, `index_dataset` and `iter_datasets` methods to encode the `'label'` field of the processed datasets\n",
    "\n",
    "  then, **via the `set_ignore` function**, **mark certain fields of the `databundle`**, such as `'text'`, **so that they no longer appear when batches are formed**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
      "| text | label | len | input_ids | token_type_ids | attention_mask | target |\n",
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n",
      "| ['a', 'seri... | negative | 37 | [101, 1037,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
      "| ['this', 'q... | positive | 11 | [101, 2023,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 1 |\n",
      "| ['even', 'f... | negative | 21 | [101, 2130,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 0 |\n",
      "| ['the', 'im... | neutral | 20 | [101, 1996,... | [0, 0, 0, 0, 0,... | [1, 1, 1, 1, 1,... | 2 |\n",
      "+----------------+----------+-----+----------------+--------------------+--------------------+--------+\n"
     ]
    }
   ],
   "source": [
    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
    "\n",
    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
    "                           new_field_name='target')\n",
    "\n",
    "data_bundle.set_ignore('text', 'len', 'label') \n",
    "print(data_bundle.datasets['train'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The above covers reading input text data with `dataset`, `vocabulary`, `databundle` and `tokenizer`,\n",
    "\n",
    "  along with the full preprocessing pipeline of tokenization, tagging and serialization; the recap code below should give you a more detailed picture\n",
    "\n",
    "```python\n",
    "# First, load the pretrained BertTokenizer, using the 'bert-base-uncased' variant here\n",
    "tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')\n",
    "\n",
    "# Next, load the data: first as a dataset, then as a dataset-dict, wrapped into a databundle\n",
    "datasets = DataSet.from_pandas(pd.read_csv('./data/test4dataset.tsv'))\n",
    "train_ds, test_ds = datasets.split(ratio=0.7)\n",
    "data_bundle = DataBundle(datasets={'train': train_ds, 'test': test_ds})\n",
    "\n",
    "# Then, tokenize and index the text with tokenizer.encode_plus, updating and extending the bundle's fields\n",
    "encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
    "                 return_attention_mask=True)\n",
    "data_bundle.apply_field_more(encode, field_name='text', progress_bar='tqdm')\n",
    "\n",
    "# With the 'text' field processed, handle the prediction targets in the 'label' field next\n",
    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
    "                           new_field_name='target')\n",
    "\n",
    "# Finally, tidy up with a few other data_bundle helpers\n",
    "data_bundle.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
    "data_bundle.set_ignore('label', 'text') \n",
    "```"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "pycharm": {
     "name": "#%% md\n"
    }
   },
   "source": [
    "\n",
    "\n",
    "The upcoming `tutorial 3.` introduces the `dataloader` module of `fastNLP v0.8`, and will touch on\n",
    "\n",
-    "how to use traditional GloVe word embeddings"
+    "  the `collator` module mentioned in this chapter, `fastNLP`'s multi-framework support, and the complete data-loading process; stay tuned"
   ]
  },
 {
diff --git a/tutorials/fastnlp_tutorial_3.ipynb b/tutorials/fastnlp_tutorial_3.ipynb
index 5e09c4ea..8c3c935e 100644
--- a/tutorials/fastnlp_tutorial_3.ipynb
+++ b/tutorials/fastnlp_tutorial_3.ipynb
@@ -7,33 +7,43 @@
    "# T3. The internal structure and basic usage of dataloader\n",
    "\n",
-    "  1   \n",
+    "  1   the dataloader in fastNLP\n",
    " \n",
-    "    1.1   \n",
+    "    1.1   what the dataloader is responsible for\n",
    "\n",
-    "    1.2   \n",
+    "    1.2   basic usage of the dataloader\n",
    "\n",
-    "  2   \n",
+    "  2   extensions of the dataloader in fastNLP\n",
    "\n",
-    "    2.1   \n",
+    "    2.1   the concept and usage of the collator\n",
    "\n",
-    "    2.2   \n",
-    "\n",
-    "  3   \n",
-    " \n",
-    "    3.1   \n",
-    "\n",
-    "    3.2   "
+    "    2.2   the concept and usage of the sampler"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "85857115",
   "metadata": {},
   "source": [
    "## 1. The dataloader in fastNLP\n",
    "\n",
    "### 1.1 What the dataloader is responsible for\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
  {
   "cell_type": "markdown",
-   "id": "d74d0523",
+   "id": "eb8fb51c",
   "metadata": {},
   "source": [
-    "## 3. The dataloader in fastNLP\n",
-    "\n",
-    "### 3.1 The concept and usage of the collator\n",
+    "### 1.2 Basic usage of the dataloader\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
@@ -55,15 +65,44 @@
    }
   },
   "outputs": [],
   "source": [
-    "from fastNLP import Collator\n",
+    "import pandas as pd\n",
+    "from functools import partial\n",
+    "from fastNLP.transformers.torch import BertTokenizer\n",
+    "\n",
+    "from fastNLP import DataSet\n",
+    "from fastNLP import Vocabulary\n",
+    "from fastNLP.io import DataBundle\n",
+    "\n",
+    "\n",
+    "class PipeDemo:\n",
+    "    def __init__(self, tokenizer='bert-base-uncased', num_proc=1):\n",
+    "        self.tokenizer = BertTokenizer.from_pretrained(tokenizer)\n",
+    "        self.num_proc = num_proc\n",
+    "\n",
+    "    def process_from_file(self, path='./data/test4dataset.tsv'):\n",
+    "        datasets = DataSet.from_pandas(pd.read_csv(path))\n",
+    "        train_ds, test_ds = datasets.split(ratio=0.7)\n",
+    "        train_ds, dev_ds = train_ds.split(ratio=0.8)  # split the training part again, so dev does not overlap test\n",
+    "        data_bundle = DataBundle(datasets={'train': train_ds, 'dev': dev_ds, 'test': test_ds})\n",
     "\n",
-    "collator = Collator()\n",
-    "# collator.set_pad(field_name='text', pad_val='<pad>')"
+    "        encode = partial(self.tokenizer.encode_plus, max_length=100, truncation=True,\n",
+    "                         return_attention_mask=True)\n",
+    "        data_bundle.apply_field_more(encode, field_name='text', num_proc=self.num_proc)\n",
+    "\n",
+    "        target_vocab = Vocabulary(padding=None, unknown=None)\n",
+    "\n",
+    "        target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
+    "        target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
+    "                                   new_field_name='target')\n",
+    "\n",
+    "        data_bundle.set_pad('input_ids', pad_val=self.tokenizer.pad_token_id)\n",
+    "        data_bundle.set_ignore('label', 'text') \n",
+    "        return data_bundle"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "de53bff4",
   "metadata": {},
   "source": [
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57a29cb9",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe = PipeDemo(tokenizer='bert-base-uncased', num_proc=4)\n",
    "\n",
    "data_bundle = pipe.process_from_file('./data/test4dataset.tsv')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "226bb081",
   "metadata": {},
   "source": [
    "  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7827557d",
   "metadata": {},
   "outputs": [],
   "source": [
    "from fastNLP import prepare_torch_dataloader\n",
    "\n",
    "dl_bundle = prepare_torch_dataloader(data_bundle, batch_size=16)  # any concrete batch size works here"
   ]
  },
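  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check (a sketch under the assumption that each batch is a dict of padded tensors keyed by field name, per the `collator` settings above):\n",
    "\n",
    "```python\n",
    "for batch in dl_bundle['train']:\n",
    "    print(batch['input_ids'].shape)  # e.g. torch.Size([16, max_len_in_batch])\n",
    "    print(batch['target'].shape)     # e.g. torch.Size([16])\n",
    "    break  # inspect one batch only\n",
    "```"
   ]
  },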
  {
   "cell_type": "markdown",
   "id": "d898cf40",
   "metadata": {},
   "source": [
    "  \n",
    "\n",
    "```python\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    train_dataloader=dl_bundle['train'],\n",
    "    optimizers=optimizer,\n",
    "    ...\n",
    "    driver=\"torch\",\n",
    "    device='cuda',\n",
    "    ...\n",
    "    evaluate_dataloaders={'dev': dl_bundle['dev'], 'test': dl_bundle['test']}, \n",
    "    metrics={'acc': Accuracy()},\n",
    "    ...\n",
    ")\n",
    "```"
   ]
  },
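  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `model` and `optimizer` above are placeholders; one hedged way to fill them in (a sketch assuming fastNLP's `train_step`/`evaluate_step` convention, with a hypothetical minimal classifier; building models properly is the topic of `tutorial 4.`):\n",
    "\n",
    "```python\n",
    "import torch\n",
    "from torch import nn\n",
    "\n",
    "class TinyClassifier(nn.Module):  # hypothetical stand-in model\n",
    "    def __init__(self, vocab_size=30522, num_labels=3):\n",
    "        super().__init__()\n",
    "        self.embed = nn.Embedding(vocab_size, 64)\n",
    "        self.lstm = nn.LSTM(64, 64, batch_first=True)\n",
    "        self.fc = nn.Linear(64, num_labels)\n",
    "        self.loss_fn = nn.CrossEntropyLoss()\n",
    "\n",
    "    def train_step(self, input_ids, target):\n",
    "        logits = self.fc(self.lstm(self.embed(input_ids))[0][:, -1])\n",
    "        return {'loss': self.loss_fn(logits, target)}\n",
    "\n",
    "    def evaluate_step(self, input_ids, target):\n",
    "        logits = self.fc(self.lstm(self.embed(input_ids))[0][:, -1])\n",
    "        return {'pred': logits.argmax(dim=-1), 'target': target}\n",
    "\n",
    "model = TinyClassifier()\n",
    "optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)\n",
    "```"
   ]
  },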
  {
   "cell_type": "markdown",
   "id": "d74d0523",
   "metadata": {},
   "source": [
    "## 2. Extensions of the dataloader in fastNLP\n",
    "\n",
    "### 2.1 The concept and usage of the collator\n",
    "\n",
    "In `fastNLP 0.8`, ahead of the data-loading module `DataLoader` there are several other modules that handle, for example,\n",
    "\n",
    "  padding and aligning the text data, i.e. the **`collator` module**, and tokenizing and tagging it, i.e. the **`tokenizer` module**\n",
    "\n",
    "  this section walks through the `collator` and related parts of `fastNLP`; the `tokenizer` is covered in detail in the next section\n",
    "\n",
    "In `fastNLP 0.8`, **the `collator` module is responsible for padding and aligning text sequences**, via its `set_pad` settings"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9bbd9a7",
   "metadata": {},
   "source": [
-    "### 3.2 The structure and usage of the dataloader"
+    "### 2.2 The concept and usage of the sampler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b0c3c58d",
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "dataloader.batch_sampler"
   ]
  },
  {
-   "cell_type": "markdown",
-   "id": "7ed431cc",
-   "metadata": {},
-   "source": [
-    "### 3.3 Example: loading and preprocessing NG20\n",
-    "\n",
-    "In `fastNLP 0.8`, **the `Trainer` and `Evaluator` modules represent the “trainer” and the “evaluator”**,\n",
-    "\n",
-    "  corresponding to the `Trainer` and `Tester` modules in earlier versions of `fastNLP`; they are defined as shown below\n",
-    "\n",
-    "Note that in `fastNLP 0.8`, when training with `Trainer` first and then evaluating with `Evaluator` in the same `python` script,\n",
-    "\n",
-    "  the crucial question is **how to set the `driver` of the two correctly**, which raises another question: what is a `driver`?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a89ef613",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "import pandas as pd\n",
-    "\n",
-    "from fastNLP import DataSet\n",
-    "from fastNLP import Vocabulary\n",
-    "\n",
-    "dataset = DataSet.from_pandas(pd.read_csv('./data/ng20_test.csv'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1624b0fa",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "from functools import partial\n",
-    "\n",
-    "encode = partial(tokenizer.encode_plus, max_length=100, truncation=True,\n",
-    "                 return_attention_mask=True)\n",
-    "# adds the three fields input_ids, attention_mask and token_type_ids\n",
-    "dataset.apply_field_more(encode, field_name='text')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0991a8ee",
-   "metadata": {
-    "pycharm": {
-     "name": "#%%\n"
-    }
-   },
-   "outputs": [],
-   "source": [
-    "target_vocab = Vocabulary(padding=None, unknown=None)\n",
-    "\n",
-    "target_vocab.from_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label')\n",
-    "target_vocab.index_dataset(*[ds for _, ds in data_bundle.iter_datasets()], field_name='label',\n",
-    "                           new_field_name='labels')\n",
-    "# the pad value of input_ids must be set to the tokenizer's pad value\n",
-    "dataset.set_pad('input_ids', pad_val=tokenizer.pad_token_id)\n",
-    "dataset.set_ignore('label', 'text')  # label is the raw string we no longer need, so keep it out of the batch output"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b369137f",
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "cell_type": "markdown",
+   "id": "51bf0878",
+   "metadata": {},
+   "source": [
+    "  "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fd2486f",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": []
   }
  ],
diff --git a/tutorials/fastnlp_tutorial_4.ipynb b/tutorials/fastnlp_tutorial_4.ipynb
new file mode 100644
index 00000000..532118b0
--- /dev/null
+++ b/tutorials/fastnlp_tutorial_4.ipynb
@@ -0,0 +1,59 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "fdd7ff16",
+   "metadata": {},
+   "source": [
+    "# T4. Building the model and the concept of the driver\n",
+    "\n",
+    "  1   using pretrained models in fastNLP\n",
+    " \n",
+    "    1.1   \n",
+    "\n",
+    "    1.2   \n",
+    "\n",
+    "  2   building models with Pytorch in fastNLP\n",
+    "\n",
+    "    2.1   \n",
+    "\n",
+    "    2.2   \n",
+    "\n",
+    "  3   the driver in fastNLP\n",
+    "\n",
+    "    3.1   \n",
+    "\n",
+    "    3.2   "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08752c5a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}