From 544ca8631b4ea13203005abeee4c27efc830e7a3 Mon Sep 17 00:00:00 2001
From: choocewhatulike <1901722105@qq.com>
Date: Mon, 12 Mar 2018 00:54:28 +0800
Subject: [PATCH] tokenize data

---
 .../code/__pycache__/model.cpython-36.pyc |  Bin 0 -> 2223 bytes
 model_inplement/code/model.py             |   63 +++++++++++++++++----
 model_inplement/code/preprocess.py        |   42 ++++++++++++++
 model_inplement/code/train.py             |    0
 4 files changed, 93 insertions(+), 12 deletions(-)
 create mode 100644 model_inplement/code/__pycache__/model.cpython-36.pyc
 create mode 100644 model_inplement/code/preprocess.py
 create mode 100644 model_inplement/code/train.py

diff --git a/model_inplement/code/__pycache__/model.cpython-36.pyc b/model_inplement/code/__pycache__/model.cpython-36.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e18ae7f3f1addfca6601d873620127d292e35aa
Binary files /dev/null and b/model_inplement/code/__pycache__/model.cpython-36.pyc differ
diff --git a/model_inplement/code/model.py b/model_inplement/code/model.py
index f73cabe3..32ebdbf9 100644
--- a/model_inplement/code/model.py
+++ b/model_inplement/code/model.py
@@ -22,19 +22,16 @@ class HAN(nn.Module):
         self.output_layer = nn.Linear(2* sent_hidden_size, output_size)
         self.softmax = nn.Softmax()
 
-    def forward(self, x, level='w'):
+    def forward(self, doc):
         # input is a sequence of vector
         # if level == w, a seq of words (a sent); level == s, a seq of sents (a doc)
-        if level == 's':
-            v = self.sent_layer(x)
-            output = self.softmax(self.output_layer(v))
-            return output
-        elif level == 'w':
-            s = self.word_layer(x)
-            return s
-        else:
-            print('unknown level in parameter!')
-
+        s_list = []
+        for sent in doc:
+            s_list.append(self.word_layer(sent))
+        s_vec = torch.cat(s_list, dim=1).t()
+        doc_vec = self.sent_layer(s_vec).t()  # (1, 2*sent_hidden_size), so Linear sees features on the last axis
+        output = self.softmax(self.output_layer(doc_vec))
+        return output
 
 class AttentionNet(nn.Module):
     def __init__(self, input_size, gru_hidden_size, gru_num_layers, context_vec_size):
@@ -60,11 +57,53 @@ class AttentionNet(nn.Module):
         self.context_vec.data.uniform_(-0.1, 0.1)
 
     def forward(self, inputs):
-        # inputs's dim seq_len*word_dim
+        # inputs' dim: (seq_len, word_dim)
         inputs = torch.unsqueeze(inputs, 1)
         h_t, hidden = self.gru(inputs)
         h_t = torch.squeeze(h_t, 1)
         u = self.tanh(self.fc(h_t))
         alpha = self.softmax(torch.mm(u, self.context_vec))
         output = torch.mm(h_t.t(), alpha)
+        # output's dim: (2 * gru_hidden_size, 1)
         return output
+
+
+'''
+Train process
+'''
+import math
+import os
+import copy
+import pickle
+
+import matplotlib.pyplot as plt
+import matplotlib.ticker as ticker
+import numpy as np
+import json
+import nltk
+
+net = HAN(input_size=100, output_size=5,
+          word_hidden_size=50, word_num_layers=1, word_context_size=100,
+          sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)
+
+optimizer = torch.optim.SGD(net.parameters(), lr=0.01)
+criterion = nn.NLLLoss()
+epoch = 1
+batch_size = 10
+
+def dataloader(filename):
+    samples = pickle.load(open(filename, 'rb'))
+    return samples
+
+def gen_doc(text):
+    pass
+
+class SampleDoc:
+    def __init__(self, doc, label):
+        self.doc = doc
+        self.label = label
+
+    def __iter__(self):
+        for sent in self.doc:
+            for word in sent:
+                yield word
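[Editor's note: a quick way to sanity-check the patched HAN.forward is a shape-level smoke test. The sketch below is illustrative and not part of this commit; it assumes the unchanged parts of model.py build bidirectional GRUs (as the 2*hidden_size comments imply) and that a document is passed as a list of per-sentence tensors shaped (n_words, input_size). Importing model also executes its trailing "Train process" scratch block.]

    import torch
    from model import HAN

    net = HAN(input_size=100, output_size=5,
              word_hidden_size=50, word_num_layers=1, word_context_size=100,
              sent_hidden_size=50, sent_num_layers=1, sent_context_size=100)

    # Toy document: 3 sentences of 7, 4, and 9 words, each word a 100-dim vector.
    doc = [torch.randn(n, 100) for n in (7, 4, 9)]
    probs = net(doc)
    print(probs.shape)  # expected: (1, 5), one softmax distribution over 5 classes
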
diff --git a/model_inplement/code/preprocess.py b/model_inplement/code/preprocess.py
new file mode 100644
index 00000000..37f6eb25
--- /dev/null
+++ b/model_inplement/code/preprocess.py
@@ -0,0 +1,42 @@
+import os
+import pickle
+import json
+import nltk
+from nltk.tokenize import stanford
+
+# f = open('dataset/review.json', encoding='utf-8')
+# samples = []
+# j = 0
+# for i, line in enumerate(f.readlines()):
+#     review = json.loads(line)
+#     samples.append((review['stars'], review['text']))
+#     if (i+1) % 5000 == 0:
+#         print(i)
+#         pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
+#         j += 1
+#         samples = []
+# pickle.dump(samples, open('review/samples%d.pkl'%j, 'wb'))
samples = pickle.load(open('review/samples0.pkl', 'rb'))
+# print(samples[0])
+
+os.environ['JAVAHOME'] = 'D:\\java\\bin\\java.exe'
+path_to_jar = 'E:\\College\\fudanNLP\\stanford-corenlp-full-2018-02-27\\stanford-corenlp-3.9.1.jar'
+tokenizer = stanford.CoreNLPTokenizer()
+
+dirname = 'review'
+dirname1 = 'reviews'
+
+for fn in os.listdir(dirname):
+    print(fn)
+    processed = []
+    for stars, text in pickle.load(open(os.path.join(dirname, fn), 'rb')):
+        tokens = []
+        sents = nltk.tokenize.sent_tokenize(text)
+        for s in sents:
+            tokens.append(tokenizer.tokenize(s))
+        processed.append((stars, tokens))
+        # print(tokens)
+        if len(processed) % 100 == 0:
+            print(len(processed))
+    pickle.dump(processed, open(os.path.join(dirname1, fn), 'wb'))
+
diff --git a/model_inplement/code/train.py b/model_inplement/code/train.py
new file mode 100644
index 00000000..e69de29b
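
[Editor's note: to show how the two new files are meant to meet, here is a hypothetical bridge from the (stars, sentences) samples that preprocess.py writes into reviews/ to a single training step. Nothing below is in the commit, and word_vec is a stand-in for a real embedding lookup. One caution grounded in PyTorch itself: nn.NLLLoss expects log-probabilities, while HAN ends in nn.Softmax, so the output is passed through torch.log here; switching the model to nn.LogSoftmax would be the cleaner fix.]

    import pickle
    import torch
    from model import net, optimizer, criterion  # module-level objects from the patched model.py

    def word_vec(token):
        # Placeholder: a real run would look up trained 100-dim word vectors.
        return torch.randn(100)

    # First tokenized sample: a star rating and a list of token lists.
    stars, sents = pickle.load(open('reviews/samples0.pkl', 'rb'))[0]
    doc = [torch.stack([word_vec(w) for w in sent]) for sent in sents]

    optimizer.zero_grad()
    probs = net(doc)                                  # (1, 5) softmax output
    loss = criterion(torch.log(probs),                # NLLLoss wants log-probabilities
                     torch.tensor([int(stars) - 1]))  # stars 1..5 -> classes 0..4
    loss.backward()
    optimizer.step()
    print(loss.item())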