
Fixed typos in tutorial_3

tags/v0.5.5
ChenXin 5 years ago
commit 24fe256917
3 changed files with 529 additions and 6 deletions
  1. +2 -2  docs/source/tutorials/tutorial_3_embedding.rst
  2. +3 -4  docs/source/tutorials/tutorial_5_loss_optimizer.rst
  3. +524 -0  tutorials/tutorial_3_embedding.ipynb

+2 -2  docs/source/tutorials/tutorial_3_embedding.rst

@@ -254,14 +254,14 @@ Example usage of CNNCharEmbedding:

 .. code-block:: python

-    from fastNLP.embeddings import LSTMCharEmbeddding
+    from fastNLP.embeddings import LSTMCharEmbedding
     from fastNLP import Vocabulary

     vocab = Vocabulary()
     vocab.add_word_lst("this is a demo .".split())

     # the character embedding dimension is 50; the returned embedding has dimension 64
-    embed = LSTMCharEmbeddding(vocab, embed_size=64, char_emb_size=50)
+    embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)
     words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
     print(embed(words).size())


+3 -4  docs/source/tutorials/tutorial_5_loss_optimizer.rst

@@ -9,8 +9,8 @@

 Loading the data
 We can use the :class:`~fastNLP.io.SST2Pipe` class from fastNLP's :mod:`fastNLP.io` module to easily read and preprocess the SST2 dataset. The :class:`~fastNLP.io.SST2Pipe` object's
-:meth:`~fastNLP.io.SST2Pipe.process_from_file` method preprocesses the loaded SST2 data; its parameter paths is the directory of the files to process, and if paths is None the data set is downloaded automatically (None is the default value of paths).
-This function returns a :class:`~fastNLP.io.DataBundle` containing the SST2 training, test, and validation sets together with the source-side and target-side vocabularies. The training, test, and validation sets contain four :mod:`~fastNLP.core.field` entries:
+:meth:`~fastNLP.io.SST2Pipe.process_from_file` method preprocesses the loaded SST2 data; its parameter paths is the directory of the files to process, and if paths is None the dataset is downloaded automatically (None is the default value of paths).
+This function returns a :class:`~fastNLP.io.DataBundle` containing the SST2 training, test, and validation sets together with the source-side and target-side vocabularies. The training, test, and validation sets contain four :mod:`~fastNLP.core.field` entries:

 * raw_words: the original source sentence
 * target: the label value
@@ -69,8 +69,7 @@
 The dataset's :meth:`~fastNLP.DataSet.set_input` and :meth:`~fastNLP.DataSet.set_target` functions
 During preprocessing, the :meth:`~fastNLP.io.SST2Pipe.process_from_file` method of the :class:`~fastNLP.io.SST2Pipe` class also sets
 the `words` and `seq_len` :mod:`~fastNLP.core.field` of the training, test, and validation sets as input, and sets the `target` :mod:`~fastNLP.core.field`
-as target. We can inspect how each :mod:`~fastNLP.core.field` is configured via the :meth:`~fastNLP.core.Dataset.print_field_meta` method of the
-:class:`~fastNLP.core.Dataset` class, with the following code:
+as target. We can inspect how each :mod:`~fastNLP.core.field` is configured via the :meth:`~fastNLP.core.Dataset.print_field_meta` method of the :class:`~fastNLP.core.Dataset` class, with the following code:

 .. code-block:: python


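A minimal sketch of the workflow this passage describes (assuming the fastNLP 0.5.x API, where ``DataBundle.get_dataset`` retrieves a split):

.. code-block:: python

    from fastNLP.io import SST2Pipe

    # paths=None (the default) downloads the SST2 dataset automatically
    data_bundle = SST2Pipe().process_from_file()
    train_data = data_bundle.get_dataset('train')

    # show which fields are set as input and which as target
    train_data.print_field_meta()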

+524 -0  tutorials/tutorial_3_embedding.ipynb

@@ -0,0 +1,524 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"torch.Size([1, 5, 50])\n"
]
}
],
"source": [
"import torch\n",
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
"print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 5, 30])\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
"\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"from fastNLP.embeddings import ElmoEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 512])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"torch.Size([1, 5, 256])\n"
]
}
],
"source": [
"embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
"print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"from fastNLP.embeddings import BertEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 1536])\n"
]
}
],
"source": [
"# 使用后面两层的输出\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
"print(embed(words).size()) # 结果将是在最后一维做拼接"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 7, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
"print(embed(words).size()) # 结果将在序列维度上增加2\n",
"# 取出句子的cls表示\n",
"cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"torch.Size([1, 5, 768])\n"
]
}
],
"source": [
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 10 words out of 10.\n",
"torch.Size([1, 9, 768])\n"
]
}
],
"source": [
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import CNNCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"torch.Size([1, 5, 64])\n"
]
}
],
"source": [
"from fastNLP.embeddings import LSTMCharEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
"embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(embed(words).size())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found 5 out of 7 words in the pre-training embedding.\n",
"50\n",
"Start constructing character vocabulary.\n",
"In total, there are 8 distinct characters.\n",
"30\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"256\n",
"22 out of 22 characters were found in pretrained elmo embedding.\n",
"512\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"768\n",
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n",
"1536\n",
"80\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
"print(static_embed.embedding_dim) # 50\n",
"char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
"print(char_embed.embedding_dim) # 30\n",
"elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
"print(elmo_embed_1.embedding_dim) # 256\n",
"elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
"print(elmo_embed_2.embedding_dim) # 512\n",
"bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_1.embedding_dim) # 768\n",
"bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
"print(bert_embed_2.embedding_dim) # 1536\n",
"stack_embed = StackEmbedding([static_embed, char_embed])\n",
"print(stack_embed.embedding_dim) # 80"
]
},
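{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch, not part of the original run: StackEmbedding concatenates its\n",
"# sub-embeddings along the last dimension, so static (50) + char (30) should\n",
"# give a final dimension of 80\n",
"words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
"print(stack_embed(words).size())  # expected: torch.Size([1, 5, 80])"
]
},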
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
"Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
"Start to generate word pieces for word.\n",
"Found(Or segment into word pieces) 7 words out of 7.\n"
]
}
],
"source": [
"from fastNLP.embeddings import *\n",
"\n",
"vocab = Vocabulary()\n",
"vocab.add_word_lst(\"this is a demo .\".split())\n",
"\n",
"embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
"embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
]
},
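{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a sketch, not part of the original run: after embed.requires_grad = False,\n",
"# none of the embedding's parameters should require gradients, so the\n",
"# optimizer will leave them untouched\n",
"print(any(p.requires_grad for p in embed.parameters()))  # expected: False"
]
},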
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1 out of 4 words have frequency less than 2.\n",
"tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0 out of 5 words have frequency less than 2.\n",
"All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
"tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
" grad_fn=<EmbeddingBackward>)\n",
"tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
" grad_fn=<EmbeddingBackward>)\n"
]
}
],
"source": [
"from fastNLP.embeddings import StaticEmbedding\n",
"from fastNLP import Vocabulary\n",
"\n",
"vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
"# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
"embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5, min_freq=2, lower=True)\n",
"print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
"print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
"print(embed(torch.LongTensor([vocab.unknown_idx])))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python Now",
"language": "python",
"name": "now"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
