import codecs
import random
import re

import gensim
import numpy as np
from gensim import corpora
from torch.utils.data import Dataset


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", string)
    string = re.sub(r"'s", " 's", string)
    string = re.sub(r"'ve", " 've", string)
    string = re.sub(r"n't", " n't", string)
    string = re.sub(r"'re", " 're", string)
    string = re.sub(r"'d", " 'd", string)
    string = re.sub(r"'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()


def pad_sentences(sentence, padding_word="<PAD/>", sequence_length=64):
    """Pad (or truncate) a sentence to a fixed number of tokens.

    An explicit padding token is used (rather than bare spaces, which
    would disappear again on split()) so that every sample really ends
    up with `sequence_length` tokens and can be batched.
    """
    words = sentence.split()[:sequence_length]
    return " ".join(words + [padding_word] * (sequence_length - len(words)))


# Data loader for the MR (movie review) polarity dataset.
class MRDataset(Dataset):
    def __init__(self):
        # Load positive and negative sentences from the raw files.
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding="ISO-8859-1") as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding="ISO-8859-1") as f:
            negative_examples = list(f.readlines())

        # strip() drops the trailing "\n", clean_str() normalizes the text,
        # and pad_sentences() brings every sample to a fixed length.
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # Word dictionary mapping token to id, e.g. {"human": 0, "a": 1, ...}.
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id

        # Set labels: positive is 1, negative is 0.
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # Transform words to ids: each sample becomes (id array, label).
        self.MRDataset_wordid = [
            (
                np.array([self.word2id_dict[word] for word in sent[0].split()],
                         dtype=np.int64),
                sent[1],
            )
            for sent in self.MRDataset_frame
        ]

    def word_embeddings(self, path="./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"):
        """Build an embedding matrix from the pretrained Google News vectors.

        Words absent from the pretrained model keep their random
        uniform(-0.25, 0.25) initialization.
        """
        print("Please wait ... (it could take a while to load the file : {})".format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word, word_id in self.word2id_dict.items():
            if word in model:
                embedding_weights[word_id, :] = model[word]
        return embedding_weights

    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        return self.MRDataset_wordid[idx]

    def getsent(self, idx):
        return self.MRDataset_wordid[idx][0]

    def getlabel(self, idx):
        return self.MRDataset_wordid[idx][1]

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        return {val: key for key, val in self.word2id_dict.items()}


class train_set(Dataset):
    """Wraps a list of (word-id array, label) samples as a training Dataset."""

    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    """Wraps a list of (word-id array, label) samples as a test Dataset."""

    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
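

# --- Usage sketch (not part of the original module) -------------------------
# A minimal example of wiring the pieces together: build the dataset, split
# the already-shuffled samples into train/test subsets, and wrap them in
# PyTorch DataLoaders. The 90/10 split ratio and batch size of 50 are
# illustrative assumptions, not values taken from this file.
if __name__ == "__main__":
    from torch.utils.data import DataLoader

    dataset = MRDataset()
    samples = dataset.MRDataset_wordid  # already shuffled in __init__
    split = int(0.9 * len(samples))     # assumed 90/10 train/test split

    train_loader = DataLoader(train_set(samples[:split]), batch_size=50, shuffle=True)
    test_loader = DataLoader(test_set(samples[split:]), batch_size=50, shuffle=False)

    # Optional: load pretrained vectors for a torch.nn.Embedding layer;
    # requires `import torch` and the GoogleNews binary at the default path.
    # weights = torch.from_numpy(dataset.word_embeddings()).float()
    # embedding = torch.nn.Embedding.from_pretrained(weights)

    for sentences, labels in train_loader:
        # Each batch is (LongTensor [batch, 64], LongTensor [batch]).
        print(sentences.shape, labels.shape)
        break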