
test_static_embedding.py

import pytest
import os

from fastNLP.embeddings.torch import StaticEmbedding
from fastNLP import Vocabulary
from fastNLP.envs.imports import _NEED_IMPORT_TORCH
if _NEED_IMPORT_TORCH:
    import torch

import numpy as np

tests_folder = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
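# Note: tests_folder resolves two directory levels above this file; the tests below
# assume the small GloVe fixture lives at
# <tests_folder>/helpers/data/embedding/small_static_embedding/glove.6B.50d_test.txt.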

@pytest.mark.torch
class TestLoad:
    def test_norm1(self):
        # only vectors that are actually found in the pretrained file get normalized
        vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                only_norm_found_vector=True)
        # with the default padding/unk entries at indices 0/1, index 2 is 'the' (found in
        # the file, so unit norm) and index 4 is 'notinfile' (not found, left unnormalized)
        assert round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4) == 1
        assert torch.norm(embed(torch.LongTensor([[4]]))).item() != 1

    def test_norm2(self):
        # with normalize=True every vector is normalized, whether or not it is in the file
        vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                normalize=True)
        assert round(torch.norm(embed(torch.LongTensor([[2]]))).item(), 4) == 1
        assert round(torch.norm(embed(torch.LongTensor([[4]]))).item(), 4) == 1

    def test_dropword(self):
        # check that the forward pass runs with word_dropout enabled
        vocab = Vocabulary().add_word_lst([chr(i) for i in range(1, 200)])
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=10, dropout=0.1, word_dropout=0.4)
        for i in range(10):
            length = torch.randint(1, 50, (1,)).item()
            batch = torch.randint(1, 4, (1,)).item()
            words = torch.randint(1, 200, (batch, length)).long()
            embed(words)

    def test_only_use_pretrain_word(self):
        def check_word_unk(words, vocab, embed):
            # each word should share the unknown token's vector (index 1)
            for word in words:
                assert embed(torch.LongTensor([vocab.to_index(word)])).tolist()[0] == embed(torch.LongTensor([1])).tolist()[0]

        def check_vector_equal(words, vocab, embed, embed_dict, lower=False):
            # each word's vector should match the one read directly from the file
            for word in words:
                index = vocab.to_index(word)
                v1 = embed(torch.LongTensor([index])).tolist()[0]
                if lower:
                    word = word.lower()
                v2 = embed_dict[word]
                for v1i, v2i in zip(v1, v2):
                    assert np.allclose(v1i, v2i)

        embed_dict = read_static_embed(tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                      'glove.6B.50d_test.txt')

        # only words present in the pretrained file should get their own vectors
        vocab = Vocabulary().add_word_lst(['the', 'a', 'notinfile'])
        vocab.add_word('of', no_create_entry=True)
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                only_use_pretrain_word=True)
        # notinfile should be mapped to unk
        check_vector_equal(['the', 'a', 'of'], vocab, embed, embed_dict)
        check_word_unk(['notinfile'], vocab, embed)

        # behavior with mixed-case words
        vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile'])
        vocab.add_word('Of', no_create_entry=True)
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                only_use_pretrain_word=True)
        check_word_unk(['The', 'Of', 'notinfile'], vocab, embed)  # these words should not be found
        check_vector_equal(['a'], vocab, embed, embed_dict)

        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                only_use_pretrain_word=True, lower=True)
        check_vector_equal(['The', 'Of', 'a'], vocab, embed, embed_dict, lower=True)
        check_word_unk(['notinfile'], vocab, embed)

        # min_freq
        vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile1', 'A', 'notinfile2', 'notinfile2'])
        vocab.add_word('Of', no_create_entry=True)
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt',
                                only_use_pretrain_word=True, lower=True, min_freq=2, only_train_min_freq=True)
        check_vector_equal(['Of', 'a'], vocab, embed, embed_dict, lower=True)
        check_word_unk(['notinfile1', 'The', 'notinfile2'], vocab, embed)

    def test_sequential_index(self):
        # when there are no no_create_entry words, words_to_words should be the identity mapping
        vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile1', 'A', 'notinfile2', 'notinfile2'])
        embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                        'glove.6B.50d_test.txt')
        for index, i in enumerate(embed.words_to_words):
            assert index == i

        embed_dict = read_static_embed(tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                      'glove.6B.50d_test.txt')
        for word, index in vocab:
            if word in embed_dict:
                index = vocab.to_index(word)
                v1 = embed(torch.LongTensor([index])).tolist()[0]
                v2 = embed_dict[word]
                for v1i, v2i in zip(v1, v2):
                    assert np.allclose(v1i, v2i)

    def test_save_load_static_embed(self):
        static_test_folder = 'static_save_test'
        try:
            # with no_create_entry words
            os.makedirs(static_test_folder, exist_ok=True)
            vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile1', 'A'])
            vocab.add_word_lst(['notinfile2', 'notinfile2'], no_create_entry=True)
            embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                            'glove.6B.50d_test.txt')
            embed.save(static_test_folder)
            load_embed = StaticEmbedding.load(static_test_folder)
            words = torch.randint(len(vocab), size=(2, 20))
            assert (embed(words) - load_embed(words)).sum() == 0

            # without no_create_entry words
            vocab = Vocabulary().add_word_lst(['The', 'a', 'notinfile1', 'A'])
            embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                            'glove.6B.50d_test.txt')
            embed.save(static_test_folder)
            load_embed = StaticEmbedding.load(static_test_folder)
            words = torch.randint(len(vocab), size=(2, 20))
            assert (embed(words) - load_embed(words)).sum() == 0

            # lower, min_freq
            vocab = Vocabulary().add_word_lst(['The', 'the', 'the', 'A', 'a', 'B'])
            embed = StaticEmbedding(vocab, model_dir_or_name=tests_folder + '/helpers/data/embedding/small_static_embedding/'
                                                                            'glove.6B.50d_test.txt', min_freq=2, lower=True)
            embed.save(static_test_folder)
            load_embed = StaticEmbedding.load(static_test_folder)
            words = torch.randint(len(vocab), size=(2, 20))
            assert (embed(words) - load_embed(words)).sum() == 0

            # randomly initialized embedding
            vocab = Vocabulary().add_word_lst(['The', 'the', 'the', 'A', 'a', 'B'])
            vocab = vocab.add_word_lst(['b'], no_create_entry=True)
            embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=4, min_freq=2, lower=True,
                                    normalize=True)
            embed.weight.data += 0.2  # make the weights no longer normalized
            embed.save(static_test_folder)
            load_embed = StaticEmbedding.load(static_test_folder)
            words = torch.randint(len(vocab), size=(2, 20))
            assert (embed(words) - load_embed(words)).sum() == 0
        finally:
            if os.path.isdir(static_test_folder):
                import shutil
                shutil.rmtree(static_test_folder)

def read_static_embed(fp):
    """
    :param str fp: path to the embedding file
    :return: dict, keys are words, values are their vectors
    """
    embed = {}
    with open(fp, 'r') as f:
        for line in f:
            line = line.strip()
            if line:
                parts = line.split()
                vector = list(map(float, parts[1:]))
                word = parts[0]
                embed[word] = vector
    return embed
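# read_static_embed assumes a plain GloVe-style text format, one entry per line:
# the word first, then its vector components separated by whitespace, e.g.
# (illustrative values only, not the fixture's actual contents):
#   the 0.418 0.24968 -0.41242 ...
#   a   0.217 -0.00061 0.41705 ...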

@pytest.mark.torch
class TestRandomSameEntry:
    def test_same_vector(self):
        # with lower=True, different casings of the same word should share one vector
        vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"])
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True)
        words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'a', 'A']]])
        words = embed(words)
        embed_0 = words[0, 0]
        for i in range(1, 3):
            assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
        embed_0 = words[0, 3]
        for i in range(3, 5):
            assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))

    def test_dropout_close(self):
        # in eval mode, dropout and word_dropout must not change the shared vectors
        vocab = Vocabulary().add_word_lst(["The", "the", "THE", 'a', "A"])
        embed = StaticEmbedding(vocab, model_dir_or_name=None, embedding_dim=5, lower=True,
                                dropout=0.5, word_dropout=0.9)
        words = torch.LongTensor([[vocab.to_index(word) for word in ["The", "the", "THE", 'a', 'A']]])
        embed.eval()
        words = embed(words)
        embed_0 = words[0, 0]
        for i in range(1, 3):
            assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
        embed_0 = words[0, 3]
        for i in range(3, 5):
            assert torch.sum(embed_0 == words[0, i]).eq(len(embed_0))
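
# These tests are gated by the pytest "torch" marker; a typical invocation might be
#   pytest -m torch tests/embeddings/torch/test_static_embedding.py
# (the file path is an assumption; adjust it to where this module lives in the repo).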