# fastNLP中的 Vocabulary
## 构建 Vocabulary

In [1]:
from fastNLP import Vocabulary

# Build a vocabulary by hand, one list of characters and one whole word.
vocab = Vocabulary()
vocab.add_word_lst(['复', '旦', '大', '学']) # add new characters
vocab.add_word('上海') # `上海` is stored as a single entry, not split into characters
# NOTE: the two lookups below are bare expressions that are not the last
# statement of the cell, so their values are not displayed when the cell runs.
# NOTE(review): pad and unk already occupy indices 0 and 1 (see next comment),
# so verify whether the first added word gets 2 rather than 3.
vocab.to_index('复') # original note: this should be 3
vocab.to_index('我') # gives 1; by default the Vocabulary's pad index is 0 and the unk (word not found) index is 1

# When building the Vocabulary for targets, pad and unk are normally not needed; it can be initialised as follows
# NOTE: this rebinds `vocab`; the following cells operate on this pad-free, unk-free instance.
vocab = Vocabulary(unknown=None, padding=None)

In [2]:
# Register the two class labels in the `vocab` that the previous cell created
# with unknown=None and padding=None.
vocab.add_word_lst(['positive', 'negative'])

Vocabulary(['positive', 'negative']...)

In [3]:
# Look up the index of the first label added; the cell output shows the result.
vocab.to_index('positive')

0

### 没有设置 unk 的情况

In [4]:
# This vocabulary was created without an unknown token, so looking up a word
# that was never added raises ValueError. The error is the point of the demo,
# so catch it and print the message: the cell still shows what went wrong, but
# a Restart & Run All no longer stops here.
try:
    vocab.to_index('neutral')
except ValueError as err:
    print('ValueError:', err)

ValueError: word `neutral` not in vocabulary

### 设置 unk 的情况

In [25]:
from fastNLP import Vocabulary

# Keep an explicit unknown token but no padding token. The unknown token must
# be a visible marker such as '<unk>'; the empty string that was here before
# made the mapped-back word in the output indistinguishable from "nothing".
vocab = Vocabulary(unknown='<unk>', padding=None)
vocab.add_word_lst(['positive', 'negative'])
# 'neutral' was never added, so it is mapped to the unknown entry; converting
# that index back to a word yields the unknown token itself.
vocab.to_index('neutral'), vocab.to_word(vocab.to_index('neutral'))

(0, '<unk>')

In [8]:
# Echo the vocabulary object so the notebook displays its repr.
vocab

Vocabulary(['positive', 'negative']...)

In [7]:
from fastNLP import DataSet, Vocabulary

# Two character-tokenised sentences together with their sentiment labels.
dataset = DataSet({
    'chars': [
        ['今', '天', '天', '气', '很', '好', '。'],
        ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。'],
    ],
    'target': ['neutral', 'negative'],
})

# Input side uses a default Vocabulary; the label side is created without
# padding and unknown entries.
vocab = Vocabulary()
target_vocab = Vocabulary(padding=None, unknown=None)

# For each field: build the vocabulary from that field of the dataset, then
# use it to index the same field (the printed table shows integer ids).
for field_name, field_vocab in (('chars', vocab), ('target', target_vocab)):
    field_vocab.from_dataset(dataset, field_name=field_name)
    field_vocab.index_dataset(dataset, field_name=field_name)

print(dataset)

+---------------------------------------------------+--------+
| chars | target |
+---------------------------------------------------+--------+
| [4, 2, 2, 5, 6, 7, 3] | 0 |
| [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 3] | 1 |
+---------------------------------------------------+--------+


In [8]:
from fastNLP import DataSet, Vocabulary

# Training split: character-tokenised sentences and their labels.
tr_data = DataSet(dict(
    chars=[
        ['今', '天', '心', '情', '很', '好', '。'],
        ['被', '这', '部', '电', '影', '浪', '费', '了', '两', '个', '小', '时', '。'],
    ],
    target=['positive', 'negative'],
))

# Validation split, same layout as the training split.
dev_data = DataSet(dict(
    chars=[
        ['住', '宿', '条', '件', '还', '不', '错'],
        ['糟', '糕', '的', '天', '气', ',', '无', '法', '出', '行', '。'],
    ],
    target=['positive', 'negative'],
))

vocab = Vocabulary()
# When building the vocabulary, pass validation or test sets through the
# no_create_entry_dataset argument rather than as the main dataset.
vocab.from_dataset(
    tr_data,
    field_name='chars',
    no_create_entry_dataset=[dev_data],
)


Vocabulary(['今', '天', '心', '情', '很']...)

In [9]:
import torch
from fastNLP import Vocabulary
from fastNLP.embeddings import StaticEmbedding

vocab = Vocabulary()
vocab.add_word('train')
# Appears only in the training data and is certainly absent from the
# pre-trained vocabulary.
vocab.add_word('only_in_train')
# Appears only in the dev or test data.
vocab.add_word('test', no_create_entry=True)
# Cannot be found in the pre-trained vocabulary.
vocab.add_word('only_in_test', no_create_entry=True)

embed = StaticEmbedding(vocab, model_dir_or_name='en-glove-6b-50d')

# Print the embedding of each word in turn, then the embedding at the
# vocabulary's unknown index for comparison.
for word in ('train', 'only_in_train', 'test', 'only_in_test'):
    print(embed(torch.LongTensor([vocab.to_index(word)])))
print(embed(torch.LongTensor([vocab.unknown_idx])))

 4%|▎ | 2.31M/63.5M [00:00<00:02, 22.9MB/s]

http://212.129.155.247/embedding/glove.6B.50d.zip not found in cache, downloading to /tmp/tmpvziobj_e


100%|██████████| 63.5M/63.5M [00:01<00:00, 41.3MB/s]


Finish download from http://212.129.155.247/embedding/glove.6B.50d.zip
Copy file to /remote-home/ynzheng/.fastNLP/embedding/glove.6B.50d
Found 2 out of 6 words in the pre-training embedding.
tensor([[ 0.9497, 0.3433, 0.8450, -0.8852, -0.7208, -0.2931, -0.7468, 0.6512,
 0.4730, -0.7401, 0.1877, -0.3828, -0.5590, 0.4295, -0.2698, -0.4238,
 -0.3124, 1.3423, -0.7857, -0.6302, 0.9182, 0.2113, -0.5744, 1.4549,
 0.7546, -1.6165, -0.0085, 0.0029, 0.5130, -0.4745, 2.5306, 0.8594,
 -0.3067, 0.0578, 0.6623, 0.2080, 0.6424, -0.5246, -0.0534, 1.1404,
 -0.1370, -0.1836, 0.4546, -0.5096, -0.0255, -0.0286, 0.1805, -0.4483,
 0.4053, -0.3682]], grad_fn=<EmbeddingBackward>)
tensor([[ 0.1320, -0.2392, 0.1732, -0.2390, -0.0463, 0.0494, 0.0488, -0.0886,
 0.0224, -0.1300, 0.0369, 0.1800, 0.0750, -0.0183, 0.2264, 0.1628,
 0.1261, -0.1259, 0.1663, -0.1230, -0.1904, -0.0532, 0.1397, -0.0259,
 -0.1799, 0.0226, 0.1858, 0.1981, 0.1338, 0.2394, 0.0248, 0.0203,
 -0.1722, -0.1683, -0.1892, 0.0874, 0.0562, -0.0394, 0.0306, -0.1761,
 0.1