From 24fe2569173973caacfaba9d97f4fde11f739be6 Mon Sep 17 00:00:00 2001
From: ChenXin
Date: Thu, 27 Feb 2020 15:22:42 +0800
Subject: [PATCH] Fix the spelling errors in tutorial_3
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/source/tutorials/tutorial_3_embedding.rst     |   4 +-
 .../source/tutorials/tutorial_5_loss_optimizer.rst |   7 +-
 tutorials/tutorial_3_embedding.ipynb               | 524 +++++++++++++++++++++
 3 files changed, 529 insertions(+), 6 deletions(-)
 create mode 100644 tutorials/tutorial_3_embedding.ipynb

diff --git a/docs/source/tutorials/tutorial_3_embedding.rst b/docs/source/tutorials/tutorial_3_embedding.rst
index 95c0105e..b719041e 100644
--- a/docs/source/tutorials/tutorial_3_embedding.rst
+++ b/docs/source/tutorials/tutorial_3_embedding.rst
@@ -254,14 +254,14 @@ CNNCharEmbedding的使用例子如下:
 
 .. code-block:: python
 
-    from fastNLP.embeddings import LSTMCharEmbeddding
+    from fastNLP.embeddings import LSTMCharEmbedding
     from fastNLP import Vocabulary
 
     vocab = Vocabulary()
     vocab.add_word_lst("this is a demo .".split())
 
     # character的embedding维度大小为50,返回的embedding结果维度大小为64。
-    embed = LSTMCharEmbeddding(vocab, embed_size=64, char_emb_size=50)
+    embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)
     words = torch.LongTensor([[vocab.to_index(word) for word in "this is a demo .".split()]])
     print(embed(words).size())
 
diff --git a/docs/source/tutorials/tutorial_5_loss_optimizer.rst b/docs/source/tutorials/tutorial_5_loss_optimizer.rst
index 5874f66d..77d67e07 100644
--- a/docs/source/tutorials/tutorial_5_loss_optimizer.rst
+++ b/docs/source/tutorials/tutorial_5_loss_optimizer.rst
@@ -9,8 +9,8 @@
 数据读入
     我们可以使用 fastNLP :mod:`fastNLP.io` 模块中的 :class:`~fastNLP.io.SST2Pipe` 类,轻松地读取以及预处理SST2数据集。:class:`~fastNLP.io.SST2Pipe` 对象的
-    :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数 据集,函数默认paths值为None。
-    此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :
+    :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法能够对读入的SST2数据集进行数据的预处理,方法的参数为paths, 指要处理的文件所在目录,如果paths为None,则会自动下载数据集,函数默认paths值为None。
+    此函数返回一个 :class:`~fastNLP.io.DataBundle`,包含SST2数据集的训练集、测试集、验证集以及source端和target端的字典。其训练、测试、验证数据集含有四个 :mod:`~fastNLP.core.field` :
 
     * raw_words: 原source句子
     * target: 标签值
@@ -69,8 +69,7 @@
 数据集 :meth:`~fastNLP.DataSet.set_input` 和 :meth:`~fastNLP.DataSet.set_target` 函数
     :class:`~fastNLP.io.SST2Pipe` 类的 :meth:`~fastNLP.io.SST2Pipe.process_from_file` 方法在预处理过程中还将训练、测试、验证
     集的 `words` 、`seq_len` :mod:`~fastNLP.core.field` 设定为input,同时将 `target` :mod:`~fastNLP.core.field` 设定
-    为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个
-    :mod:`~fastNLP.core.field` 的设定情况,代码如下:
+    为target。我们可以通过 :class:`~fastNLP.core.Dataset` 类的 :meth:`~fastNLP.core.Dataset.print_field_meta` 方法查看各个 :mod:`~fastNLP.core.field` 的设定情况,代码如下:
 
     .. code-block:: python
 
diff --git a/tutorials/tutorial_3_embedding.ipynb b/tutorials/tutorial_3_embedding.ipynb
new file mode 100644
index 00000000..154a0756
--- /dev/null
+++ b/tutorials/tutorial_3_embedding.ipynb
@@ -0,0 +1,524 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 5 out of 7 words in the pre-training embedding.\n",
+      "torch.Size([1, 5, 50])\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "from fastNLP.embeddings import StaticEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
+    "\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # convert the text to indices\n",
+    "print(embed(words).size()) # StaticEmbedding is used much like pytorch's nn.Embedding"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 5, 30])\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import StaticEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
+    "\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "22 out of 22 characters were found in pretrained elmo embedding.\n",
+      "torch.Size([1, 5, 256])\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import ElmoEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "22 out of 22 characters were found in pretrained elmo embedding.\n",
+      "torch.Size([1, 5, 512])\n"
+     ]
+    }
+   ],
+   "source": [
+    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "22 out of 22 characters were found in pretrained elmo embedding.\n",
+      "torch.Size([1, 5, 256])\n"
+     ]
+    }
+   ],
+   "source": [
+    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
+    "print(embed(words).size()) # the three layers' outputs are weighted and summed element-wise"
+   ]
+  },
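+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A quick sketch (not executed here): with layers='mix' the weighted sum is\n",
+    "# taken over the layers, so the output dimension stays that of a single\n",
+    "# layer, and requires_grad=True leaves the ELMo weights, including the\n",
+    "# mixing scalars, trainable.\n",
+    "print(embed.embedding_dim) # 256, same as a single layer\n",
+    "print(any(p.requires_grad for p in embed.parameters())) # True"
+   ]
+  },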
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "torch.Size([1, 5, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import BertEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "torch.Size([1, 5, 1536])\n"
+     ]
+    }
+   ],
+   "source": [
+    "# use the outputs of the last two layers\n",
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
+    "print(embed(words).size()) # the layers are concatenated along the last dimension"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "torch.Size([1, 7, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
+    "print(embed(words).size()) # the sequence dimension grows by 2\n",
+    "# take the sentence's [CLS] representation\n",
+    "cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "torch.Size([1, 5, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
+    "print(embed(words).size())"
+   ]
+  },
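+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A quick sketch (not executed here): pool_method controls how a word's\n",
+    "# word pieces are pooled back to word level. Besides 'max', the fastNLP\n",
+    "# docstring also lists 'first' (the default), 'last' and 'avg'; the output\n",
+    "# size is unchanged either way.\n",
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='avg')\n",
+    "print(embed(words).size()) # torch.Size([1, 5, 768])"
+   ]
+  },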
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 10 words out of 10.\n",
+      "torch.Size([1, 9, 768])\n"
+     ]
+    }
+   ],
+   "source": [
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
+    "\n",
+    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Start constructing character vocabulary.\n",
+      "In total, there are 8 distinct characters.\n",
+      "torch.Size([1, 5, 64])\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import CNNCharEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "# the character embeddings have size 50; the returned word embeddings have size 64.\n",
+    "embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Start constructing character vocabulary.\n",
+      "In total, there are 8 distinct characters.\n",
+      "torch.Size([1, 5, 64])\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import LSTMCharEmbedding\n",
+    "from fastNLP import Vocabulary\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "# the character embeddings have size 50; the returned word embeddings have size 64.\n",
+    "embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(embed(words).size())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Found 5 out of 7 words in the pre-training embedding.\n",
+      "50\n",
+      "Start constructing character vocabulary.\n",
+      "In total, there are 8 distinct characters.\n",
+      "30\n",
+      "22 out of 22 characters were found in pretrained elmo embedding.\n",
+      "256\n",
+      "22 out of 22 characters were found in pretrained elmo embedding.\n",
+      "512\n",
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "768\n",
+      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
+      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
+      "Start to generate word pieces for word.\n",
+      "Found(Or segment into word pieces) 7 words out of 7.\n",
+      "1536\n",
+      "80\n"
+     ]
+    }
+   ],
+   "source": [
+    "from fastNLP.embeddings import *\n",
+    "\n",
+    "vocab = Vocabulary()\n",
+    "vocab.add_word_lst(\"this is a demo .\".split())\n",
+    "\n",
+    "static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
+    "print(static_embed.embedding_dim) # 50\n",
+    "char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
+    "print(char_embed.embedding_dim) # 30\n",
+    "elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
+    "print(elmo_embed_1.embedding_dim) # 256\n",
+    "elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
+    "print(elmo_embed_2.embedding_dim) # 512\n",
+    "bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
+    "print(bert_embed_1.embedding_dim) # 768\n",
+    "bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
+    "print(bert_embed_2.embedding_dim) # 1536\n",
+    "stack_embed = StackEmbedding([static_embed, char_embed])\n",
+    "print(stack_embed.embedding_dim) # 80"
+   ]
+  },
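+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A quick sketch (not executed here): StackEmbedding concatenates its\n",
+    "# component embeddings along the last dimension, so the 50-dim static\n",
+    "# embedding and the 30-dim character embedding give 80-dim word vectors.\n",
+    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
+    "print(stack_embed(words).size()) # torch.Size([1, 5, 80])"
+   ]
+  },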
"print(bert_embed_1.embedding_dim) # 768\n", + "bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n", + "print(bert_embed_2.embedding_dim) # 1536\n", + "stack_embed = StackEmbedding([static_embed, char_embed])\n", + "print(stack_embed.embedding_dim) # 80" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n", + "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n", + "Start to generate word pieces for word.\n", + "Found(Or segment into word pieces) 7 words out of 7.\n" + ] + } + ], + "source": [ + "from fastNLP.embeddings import *\n", + "\n", + "vocab = Vocabulary()\n", + "vocab.add_word_lst(\"this is a demo .\".split())\n", + "\n", + "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n", + "embed.requires_grad = False # 修改BertEmbedding的权重为不更新" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n", + " grad_fn=)\n", + "tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n", + " grad_fn=)\n" + ] + } + ], + "source": [ + "from fastNLP.embeddings import StaticEmbedding\n", + "from fastNLP import Vocabulary\n", + "\n", + "vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n", + "# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n", + "embed = StaticEmbedding(vocab, model_name_or_dir=None, embedding_dim=5)\n", + "print(embed(torch.LongTensor([vocab.to_index('The')])))\n", + "print(embed(torch.LongTensor([vocab.to_index('the')])))" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All word in the vocab have been lowered. 
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}