{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 5 out of 7 words in the pre-training embedding.\n",
      "torch.Size([1, 5, 50])\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
    "\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])  # convert the text to indices\n",
    "print(embed(words).size())  # StaticEmbedding is used much like PyTorch's nn.Embedding"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "torch.Size([1, 5, 30])\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "# no pre-trained model is given; embedding_dim sets the size of the last output dimension (30)\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
    "\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22 out of 22 characters were found in pretrained elmo embedding.\n",
      "torch.Size([1, 5, 256])\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import ElmoEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22 out of 22 characters were found in pretrained elmo embedding.\n",
      "torch.Size([1, 5, 512])\n"
     ]
    }
   ],
   "source": [
    "# reuses `vocab` and `words` from the previous cell\n",
    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
    "print(embed(words).size())  # selecting two layers doubles the last dimension (256 -> 512)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "22 out of 22 characters were found in pretrained elmo embedding.\n",
      "torch.Size([1, 5, 256])\n"
     ]
    }
   ],
   "source": [
    "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
    "print(embed(words).size())  # the three layers' outputs are added element-wise according to their weights"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "torch.Size([1, 5, 768])\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import BertEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "torch.Size([1, 5, 1536])\n"
     ]
    }
   ],
   "source": [
    "# use the outputs of the last two layers\n",
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
    "print(embed(words).size())  # the results are concatenated along the last dimension"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "torch.Size([1, 7, 768])\n"
     ]
    }
   ],
   "source": [
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
    "reps = embed(words)  # run the forward pass once and reuse the result below\n",
    "print(reps.size())  # the sequence dimension grows by 2\n",
    "# take the sentence's [CLS] representation\n",
    "cls_reps = reps[:, 0]  # shape: [batch_size, 768]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "torch.Size([1, 5, 768])\n"
     ]
    }
   ],
   "source": [
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 10 words out of 10.\n",
      "torch.Size([1, 9, 768])\n"
     ]
    }
   ],
   "source": [
    "# the input contains an explicit [SEP] token between two sentences\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
    "\n",
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start constructing character vocabulary.\n",
      "In total, there are 8 distinct characters.\n",
      "torch.Size([1, 5, 64])\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import CNNCharEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "# the character embedding size is 50; the returned embedding has dimension 64\n",
    "embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Start constructing character vocabulary.\n",
      "In total, there are 8 distinct characters.\n",
      "torch.Size([1, 5, 64])\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import LSTMCharEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "# the character embedding size is 50; the returned embedding has dimension 64\n",
    "embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
    "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
    "print(embed(words).size())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found 5 out of 7 words in the pre-training embedding.\n",
      "50\n",
      "Start constructing character vocabulary.\n",
      "In total, there are 8 distinct characters.\n",
      "30\n",
      "22 out of 22 characters were found in pretrained elmo embedding.\n",
      "256\n",
      "22 out of 22 characters were found in pretrained elmo embedding.\n",
      "512\n",
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "768\n",
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n",
      "1536\n",
      "80\n"
     ]
    }
   ],
   "source": [
    "# explicit imports: this cell no longer relies on names leaked from earlier cells\n",
    "from fastNLP import Vocabulary\n",
    "from fastNLP.embeddings import (\n",
    "    BertEmbedding,\n",
    "    CNNCharEmbedding,\n",
    "    ElmoEmbedding,\n",
    "    StackEmbedding,\n",
    "    StaticEmbedding,\n",
    ")\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
    "print(static_embed.embedding_dim)  # 50\n",
    "char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
    "print(char_embed.embedding_dim)  # 30\n",
    "elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
    "print(elmo_embed_1.embedding_dim)  # 256\n",
    "elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
    "print(elmo_embed_2.embedding_dim)  # 512\n",
    "bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
    "print(bert_embed_1.embedding_dim)  # 768\n",
    "bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
    "print(bert_embed_2.embedding_dim)  # 1536\n",
    "stack_embed = StackEmbedding([static_embed, char_embed])\n",
    "print(stack_embed.embedding_dim)  # 80 (= 50 + 30)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
      "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
      "Start to generate word pieces for word.\n",
      "Found(Or segment into word pieces) 7 words out of 7.\n"
     ]
    }
   ],
   "source": [
    "from fastNLP import Vocabulary\n",
    "from fastNLP.embeddings import BertEmbedding\n",
    "\n",
    "vocab = Vocabulary()\n",
    "vocab.add_word_lst(\"this is a demo .\".split())\n",
    "\n",
    "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True)  # marked as trainable at construction\n",
    "embed.requires_grad = False  # switch BertEmbedding's weights to not be updated"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.0926, -0.4812, -0.7744,  0.4836, -0.5475]],\n",
      "       grad_fn=<EmbeddingBackward>)\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
    "# demonstrated with a random StaticEmbedding; the effect is the same with pre-trained word vectors\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)\n",
    "# without lower=True, 'The' and 'the' print different vectors\n",
    "print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
    "print(embed(torch.LongTensor([vocab.to_index('the')])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
      "tensor([[ 0.4530, -0.1558, -0.1941,  0.3203,  0.0355]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.4530, -0.1558, -0.1941,  0.3203,  0.0355]],\n",
      "       grad_fn=<EmbeddingBackward>)\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
    "# demonstrated with a random StaticEmbedding; the effect is the same with pre-trained vectors\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)\n",
    "# with lower=True, 'The' and 'the' print the same vector\n",
    "print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
    "print(embed(torch.LongTensor([vocab.to_index('the')])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 out of 4 words have frequency less than 2.\n",
      "tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.7638, -0.0552,  0.1625, -0.2210,  0.4993]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.7638, -0.0552,  0.1625, -0.2210,  0.4993]],\n",
      "       grad_fn=<EmbeddingBackward>)\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
    "# demonstrated with a random StaticEmbedding; the effect is the same with pre-trained vectors\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)\n",
    "print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
    "# 'a' occurs only once (below min_freq=2) and prints the same vector as the unknown index\n",
    "print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
    "print(embed(torch.LongTensor([vocab.unknown_idx])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 out of 5 words have frequency less than 2.\n",
      "All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
      "tensor([[ 0.1943,  0.3739,  0.2769, -0.4746, -0.3181]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.5892, -0.6916,  0.7319, -0.3803,  0.4979]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[ 0.5892, -0.6916,  0.7319, -0.3803,  0.4979]],\n",
      "       grad_fn=<EmbeddingBackward>)\n",
      "tensor([[-0.1348, -0.2172, -0.0071,  0.5704, -0.2607]],\n",
      "       grad_fn=<EmbeddingBackward>)\n"
     ]
    }
   ],
   "source": [
    "from fastNLP.embeddings import StaticEmbedding\n",
    "from fastNLP import Vocabulary\n",
    "\n",
    "vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
    "# demonstrated with a random StaticEmbedding; the effect is the same with pre-trained vectors\n",
    "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)\n",
    "print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
    "# with lower=True, 'a' and 'A' print the same vector, which differs from the unknown index's vector\n",
    "print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
    "print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
    "print(embed(torch.LongTensor([vocab.unknown_idx])))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}