|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524 |
- {
- "cells": [
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Found 5 out of 7 words in the pre-training embedding.\n",
- "torch.Size([1, 5, 50])\n"
- ]
- }
- ],
- "source": [
- "import torch\n",
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
- "\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]]) # 将文本转为index\n",
- "print(embed(words).size()) # StaticEmbedding的使用和pytorch的nn.Embedding是类似的"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "torch.Size([1, 5, 30])\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=30)\n",
- "\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "22 out of 22 characters were found in pretrained elmo embedding.\n",
- "torch.Size([1, 5, 256])\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import ElmoEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False)\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "22 out of 22 characters were found in pretrained elmo embedding.\n",
- "torch.Size([1, 5, 512])\n"
- ]
- }
- ],
- "source": [
- "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=False, layers='1,2')\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "22 out of 22 characters were found in pretrained elmo embedding.\n",
- "torch.Size([1, 5, 256])\n"
- ]
- }
- ],
- "source": [
- "embed = ElmoEmbedding(vocab, model_dir_or_name='en-small', requires_grad=True, layers='mix')\n",
- "print(embed(words).size()) # 三层输出按照权重element-wise的加起来"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "torch.Size([1, 5, 768])\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import BertEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased')\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "torch.Size([1, 5, 1536])\n"
- ]
- }
- ],
- "source": [
- "# 使用后面两层的输出\n",
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='10,11')\n",
- "print(embed(words).size()) # 结果将是在最后一维做拼接"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "torch.Size([1, 7, 768])\n"
- ]
- }
- ],
- "source": [
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', include_cls_sep=True)\n",
- "print(embed(words).size()) # 结果将在序列维度上增加2\n",
- "# 取出句子的cls表示\n",
- "cls_reps = embed(words)[:, 0] # shape: [batch_size, 768]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "torch.Size([1, 5, 768])\n"
- ]
- }
- ],
- "source": [
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 10 words out of 10.\n",
- "torch.Size([1, 9, 768])\n"
- ]
- }
- ],
- "source": [
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo . [SEP] another sentence .\".split())\n",
- "\n",
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', layers='-1', pool_method='max')\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo . [SEP] another sentence .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Start constructing character vocabulary.\n",
- "In total, there are 8 distinct characters.\n",
- "torch.Size([1, 5, 64])\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import CNNCharEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
- "embed = CNNCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Start constructing character vocabulary.\n",
- "In total, there are 8 distinct characters.\n",
- "torch.Size([1, 5, 64])\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import LSTMCharEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "# character的embedding维度大小为50,返回的embedding结果维度大小为64。\n",
- "embed = LSTMCharEmbedding(vocab, embed_size=64, char_emb_size=50)\n",
- "words = torch.LongTensor([[vocab.to_index(word) for word in \"this is a demo .\".split()]])\n",
- "print(embed(words).size())"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Found 5 out of 7 words in the pre-training embedding.\n",
- "50\n",
- "Start constructing character vocabulary.\n",
- "In total, there are 8 distinct characters.\n",
- "30\n",
- "22 out of 22 characters were found in pretrained elmo embedding.\n",
- "256\n",
- "22 out of 22 characters were found in pretrained elmo embedding.\n",
- "512\n",
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "768\n",
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n",
- "1536\n",
- "80\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import *\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "static_embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')\n",
- "print(static_embed.embedding_dim) # 50\n",
- "char_embed = CNNCharEmbedding(vocab, embed_size=30)\n",
- "print(char_embed.embedding_dim) # 30\n",
- "elmo_embed_1 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='2')\n",
- "print(elmo_embed_1.embedding_dim) # 256\n",
- "elmo_embed_2 = ElmoEmbedding(vocab, model_dir_or_name='en-small', layers='1,2')\n",
- "print(elmo_embed_2.embedding_dim) # 512\n",
- "bert_embed_1 = BertEmbedding(vocab, layers='-1', model_dir_or_name='en-base-cased')\n",
- "print(bert_embed_1.embedding_dim) # 768\n",
- "bert_embed_2 = BertEmbedding(vocab, layers='2,-1', model_dir_or_name='en-base-cased')\n",
- "print(bert_embed_2.embedding_dim) # 1536\n",
- "stack_embed = StackEmbedding([static_embed, char_embed])\n",
- "print(stack_embed.embedding_dim) # 80"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "loading vocabulary file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/vocab.txt\n",
- "Load pre-trained BERT parameters from file /remote-home/ynzheng/.fastNLP/embedding/bert-base-cased/pytorch_model.bin.\n",
- "Start to generate word pieces for word.\n",
- "Found(Or segment into word pieces) 7 words out of 7.\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import *\n",
- "\n",
- "vocab = Vocabulary()\n",
- "vocab.add_word_lst(\"this is a demo .\".split())\n",
- "\n",
- "embed = BertEmbedding(vocab, model_dir_or_name='en-base-cased', requires_grad=True) # 初始化时设定为需要更新\n",
- "embed.requires_grad = False # 修改BertEmbedding的权重为不更新"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "tensor([[ 0.3633, -0.2091, -0.0353, -0.3771, -0.5193]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.0926, -0.4812, -0.7744, 0.4836, -0.5475]],\n",
- " grad_fn=<EmbeddingBackward>)\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
- "# 下面用随机的StaticEmbedding演示,但与使用预训练词向量时效果是一致的\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5)\n",
- "print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
- "print(embed(torch.LongTensor([vocab.to_index('the')])))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "All word in the vocab have been lowered. There are 6 words, 4 unique lowered words.\n",
- "tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.4530, -0.1558, -0.1941, 0.3203, 0.0355]],\n",
- " grad_fn=<EmbeddingBackward>)\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary().add_word_lst(\"The the a A\".split())\n",
- "# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)\n",
- "print(embed(torch.LongTensor([vocab.to_index('The')])))\n",
- "print(embed(torch.LongTensor([vocab.to_index('the')])))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "1 out of 4 words have frequency less than 2.\n",
- "tensor([[ 0.4724, -0.7277, -0.6350, -0.5258, -0.6063]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.7638, -0.0552, 0.1625, -0.2210, 0.4993]],\n",
- " grad_fn=<EmbeddingBackward>)\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary().add_word_lst(\"the the the a\".split())\n",
- "# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2)\n",
- "print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
- "print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
- "print(embed(torch.LongTensor([vocab.unknown_idx])))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "0 out of 5 words have frequency less than 2.\n",
- "All word in the vocab have been lowered. There are 5 words, 4 unique lowered words.\n",
- "tensor([[ 0.1943, 0.3739, 0.2769, -0.4746, -0.3181]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[ 0.5892, -0.6916, 0.7319, -0.3803, 0.4979]],\n",
- " grad_fn=<EmbeddingBackward>)\n",
- "tensor([[-0.1348, -0.2172, -0.0071, 0.5704, -0.2607]],\n",
- " grad_fn=<EmbeddingBackward>)\n"
- ]
- }
- ],
- "source": [
- "from fastNLP.embeddings import StaticEmbedding\n",
- "from fastNLP import Vocabulary\n",
- "\n",
- "vocab = Vocabulary().add_word_lst(\"the the the a A\".split())\n",
- "# 下面用随机的StaticEmbedding演示,但与使用预训练时效果是一致的\n",
- "embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, min_freq=2, lower=True)\n",
- "print(embed(torch.LongTensor([vocab.to_index('the')])))\n",
- "print(embed(torch.LongTensor([vocab.to_index('a')])))\n",
- "print(embed(torch.LongTensor([vocab.to_index('A')])))\n",
- "print(embed(torch.LongTensor([vocab.unknown_idx])))"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python Now",
- "language": "python",
- "name": "now"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.0"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 2
- }
|