@@ -0,0 +1,110 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache

# Custom
GoogleNews-vectors-negative300.bin/
GoogleNews-vectors-negative300.bin.gz
models/
*.swp
@@ -0,0 +1,77 @@
## Introduction
This is a PyTorch implementation of the paper [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882).
* MR dataset, non-static model (word2vec trained by Mikolov et al. (2013) on 100 billion words of Google News)
* It can be run on both CPU and GPU
* The best accuracy is 82.61%, which is better than the 81.5% reported in the paper
(by Jingyuan Liu @ Fudan University; email: fdjingyuan@outlook.com. Discussion is welcome!)
## Requirements
* python 3.6
* pytorch > 0.1
* numpy
* gensim
## Run
STEP 1
Install the required packages, e.g. gensim (the other needed packages are installed the same way):
```
pip install gensim
```
STEP 2
Download the MR dataset and the word2vec resources:
* MR dataset: download from (https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz)
* word2vec: download from (https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit)
Since the word2vec file is larger than 1.5 GB, it is not included in the repository. After downloading it, remember to modify the path in the function `def word_embeddings(path='./GoogleNews-vectors-negative300.bin/')`.
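A minimal load check (a sketch; the exact path below is an assumption and should match wherever you unpacked the download):
```
import gensim

# Assumed path -- adjust to your download location.
path = './GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'
model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
print(model['movie'].shape)  # expected: (300,)
```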
STEP 3
Train the model:
```
python train.py
```
You will see output printed to the screen, like:
```
Epoch [1/20], Iter [100/192] Loss: 0.7008
Test Accuracy: 71.869159 %
Epoch [2/20], Iter [100/192] Loss: 0.5957
Test Accuracy: 75.700935 %
Epoch [3/20], Iter [100/192] Loss: 0.4934
Test Accuracy: 78.130841 %
......
Epoch [20/20], Iter [100/192] Loss: 0.0364
Test Accuracy: 81.495327 %
Best Accuracy: 82.616822 %
Best Model: models/cnn.pkl
```
## Hyperparameters
According to the paper and my experiments, I set:

|Epochs|Kernel size|Dropout|Learning rate|Batch size|
|---|---|---|---|---|
|20|(h, 300, 100)|0.5|0.0001|50|

where h = [3, 4, 5] are the kernel heights, 300 is the embedding dimension, and 100 is the number of feature maps per kernel size.
If the accuracy does not improve, the learning rate is multiplied by 0.8, as sketched below.
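A minimal sketch of the decay rule as applied in train.py (variable names follow that script):
```
if best_acc is None or acc > best_acc:
    best_acc = acc                        # new best: save this model
else:
    learning_rate *= 0.8                  # no improvement: decay
    for group in optimizer.param_groups:  # push the new rate into Adam
        group['lr'] = learning_rate
```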
## Result
I have only tried one dataset: MR. (The other six datasets in the paper are SST-1, SST-2, Subj, TREC, CR, and MPQA.)
There are four models in the paper: CNN-rand, CNN-static, CNN-non-static, and CNN-multichannel.
I have tried CNN-non-static: a model with pre-trained vectors from word2vec, in which all words, including the unknown ones that are randomly initialized, are fine-tuned for each task.
(It has nearly the best performance and is the most difficult to implement among the four models.)
|Dataset|Classes|Best Result|Kim's Paper Result|
|---|---|---|---|
|MR|2|82.617% (CNN-non-static)|81.5% (CNN-non-static)|
## References
* [Convolutional Neural Networks for Sentence Classification](https://arxiv.org/abs/1408.5882)
* https://github.com/Shawn1993/cnn-text-classification-pytorch
* https://github.com/junwang4/CNN-sentence-classification-pytorch-2017/blob/master/utils.py
@@ -0,0 +1,149 @@
import re
import random
import codecs

import numpy as np
import gensim
from gensim import corpora
from torch.utils.data import Dataset


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip()
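
# Example of the assumed behavior (not in the original file):
#   clean_str("I've loved it, (really)!")
#   -> "I 've loved it , \( really \) !"
# Following Kim's original preprocessing script, the replacement strings
# insert a literal backslash before ( ) ? in the tokenized output.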


def pad_sentences(sentence, padding_word=" <PAD/>"):
    sequence_length = 64
    sent = sentence.split()
    padded_sentence = sentence + padding_word * (sequence_length - len(sent))
    return padded_sentence
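
# Note (an assumption read off the code above): sentences are padded up to a
# fixed length of 64 tokens, but longer sentences are left as-is -- the
# negative multiplier simply adds no padding, so they are not truncated.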


# data loader
class MRDataset(Dataset):
    def __init__(self):
        # load positive and negative sentences from files
        with codecs.open("./rt-polaritydata/rt-polarity.pos", encoding='ISO-8859-1') as f:
            positive_examples = list(f.readlines())
        with codecs.open("./rt-polaritydata/rt-polarity.neg", encoding='ISO-8859-1') as f:
            negative_examples = list(f.readlines())
        # s.strip(): remove "\n"; then clean_str and pad to 64 tokens
        positive_examples = [pad_sentences(clean_str(s.strip())) for s in positive_examples]
        negative_examples = [pad_sentences(clean_str(s.strip())) for s in negative_examples]
        self.examples = positive_examples + negative_examples
        self.sentences_texts = [sample.split() for sample in self.examples]

        # word dictionary
        dictionary = corpora.Dictionary(self.sentences_texts)
        self.word2id_dict = dictionary.token2id  # a plain dict like {"human": 0, "a": 1, ...}

        # set labels: positive is 1, negative is 0
        positive_labels = [1 for _ in positive_examples]
        negative_labels = [0 for _ in negative_examples]
        self.labels = positive_labels + negative_labels
        examples_labels = list(zip(self.examples, self.labels))
        random.shuffle(examples_labels)
        self.MRDataset_frame = examples_labels

        # transform words to ids
        self.MRDataset_wordid = \
            [(
                np.array([self.word2id_dict[word] for word in sent[0].split()], dtype=np.int64),
                sent[1]
            ) for sent in self.MRDataset_frame]
    def word_embeddings(self, path='./GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin'):
        # load the pretrained Google News vectors
        print('Please wait ... (it could take a while to load the file : {})'.format(path))
        model = gensim.models.KeyedVectors.load_word2vec_format(path, binary=True)
        word_dict = self.word2id_dict
        # words missing from the pretrained vocabulary keep their
        # U(-0.25, 0.25) random initialization, following Kim (2014)
        embedding_weights = np.random.uniform(-0.25, 0.25, (len(self.word2id_dict), 300))
        for word in word_dict:
            word_id = word_dict[word]
            if word in model.vocab:  # on gensim >= 4.0, use model.key_to_index instead
                embedding_weights[word_id, :] = model[word]
        return embedding_weights
    def __len__(self):
        return len(self.MRDataset_frame)

    def __getitem__(self, idx):
        sample = self.MRDataset_wordid[idx]
        return sample

    def getsent(self, idx):
        sample = self.MRDataset_wordid[idx][0]
        return sample

    def getlabel(self, idx):
        label = self.MRDataset_wordid[idx][1]
        return label

    def word2id(self):
        return self.word2id_dict

    def id2word(self):
        id2word_dict = dict([val, key] for key, val in self.word2id_dict.items())
        return id2word_dict
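
# Example usage (an assumed sketch, mirroring train.py):
#   data = MRDataset()
#   ids, label = data[0]              # word-id array (length 64 after padding) and a 0/1 label
#   weights = data.word_embeddings()  # (vocab_size, 300) numpy array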


class train_set(Dataset):
    def __init__(self, samples):
        self.train_frame = samples

    def __len__(self):
        return len(self.train_frame)

    def __getitem__(self, idx):
        return self.train_frame[idx]


class test_set(Dataset):
    def __init__(self, samples):
        self.test_frame = samples

    def __len__(self):
        return len(self.test_frame)

    def __getitem__(self, idx):
        return self.test_frame[idx]
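
# Note (assumed rationale): because every sentence is padded to exactly 64
# tokens, DataLoader's default collate function can stack the id arrays into
# a (batch, 64) LongTensor without a custom collate_fn.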
@@ -0,0 +1,61 @@
import torch
import torch.nn as nn
import torch.nn.functional as F

"""
# some information
mode = "static"
use_pretrained_embedding = "gensim.word2vec"
print('MODE = {}'.format(mode))
print('EMBEDDING = {}\n'.format(use_pretrained_embedding))

import dataset
embedding_weights = dataset.MRDataset().word_embeddings()
embed_num = len(embedding_weights)
embed_dim = 300
class_num = 2
len_sentence = 64
print('embedding size = {}'.format(embed_num))
print('embedding dimension = {}'.format(embed_dim))
print('sentence len n = {}'.format(len_sentence))
print('num of classes = {}'.format(class_num))
"""

class CNN_text(nn.Module):
    def __init__(self, kernel_h=[3, 4, 5], kernel_num=100, embed_num=1000, embed_dim=300,
                 dropout=0.5, L2_constrain=3, batchsize=50, pretrained_embeddings=None):
        # (L2_constrain and batchsize are unused here; kept from the original signature)
        super(CNN_text, self).__init__()
        self.embedding = nn.Embedding(embed_num, embed_dim)
        self.dropout = nn.Dropout(dropout)
        if pretrained_embeddings is not None:
            self.embedding.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

        # the network structure
        # Conv2d: input (N, 1, 64, 300), output (N, 100, 64-K+1, 1) for each kernel height K
        self.conv1 = nn.ModuleList([nn.Conv2d(1, kernel_num, (K, embed_dim)) for K in kernel_h])
        self.fc1 = nn.Linear(len(kernel_h) * kernel_num, 2)

    def max_pooling(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)  # (N, C, L), e.g. (50, 100, 62)
        x = F.max_pool1d(x, x.size(2)).squeeze(2)
        # x.size(2) = 62; squeeze: (50, 100, 1) -> (50, 100)
        return x
    def forward(self, x):
        x = self.embedding(x)  # (N, H, W) = (50, 64, 300)
        x = x.unsqueeze(1)     # (N, C, H, W) = (50, 1, 64, 300)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.conv1]  # [(50, 100, 62), (50, 100, 61), (50, 100, 60)]
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x]   # [(50, 100), (50, 100), (50, 100)]
        x = torch.cat(x, 1)    # (50, 300)
        x = self.dropout(x)
        x = self.fc1(x)        # (50, 2)
        return x
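
# A minimal shape check (an assumed usage sketch, not part of the original script;
# wrap x in Variable on PyTorch < 0.4):
#   cnn = CNN_text(embed_num=1000)
#   x = torch.LongTensor(50, 64).random_(0, 1000)  # a batch of 50 sentences, 64 word ids each
#   assert cnn(x).size() == (50, 2)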
@@ -0,0 +1,100 @@
import os

import torch
import torch.nn as nn
import torch.utils.data
from torch.autograd import Variable

import dataset as dst
from model import CNN_text
# Hyper Parameters
batch_size = 50
learning_rate = 0.0001
num_epochs = 20
cuda = True

# split the dataset: 90% train / 10% test
dataset = dst.MRDataset()
length = len(dataset)
train_dataset = dataset[:int(0.9 * length)]
test_dataset = dataset[int(0.9 * length):]
train_dataset = dst.train_set(train_dataset)
test_dataset = dst.test_set(test_dataset)

# Data Loader
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)

cnn = CNN_text(embed_num=len(dataset.word2id()), pretrained_embeddings=dataset.word_embeddings())
if cuda:
    cnn.cuda()

# Loss and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(cnn.parameters(), lr=learning_rate)

best_acc = None
for epoch in range(num_epochs):
    # Train the Model
    cnn.train()
    for i, (sents, labels) in enumerate(train_loader):
        sents = Variable(sents)
        labels = Variable(labels)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        optimizer.zero_grad()
        outputs = cnn(sents)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        if (i + 1) % 100 == 0:
            # on PyTorch >= 0.4, use loss.item() instead of loss.data[0]
            print('Epoch [%d/%d], Iter [%d/%d] Loss: %.4f'
                  % (epoch + 1, num_epochs, i + 1, len(train_dataset) // batch_size, loss.data[0]))
    # Test the Model
    cnn.eval()
    correct = 0
    total = 0
    for sents, labels in test_loader:
        sents = Variable(sents)
        if cuda:
            sents = sents.cuda()
            labels = labels.cuda()
        outputs = cnn(sents)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum()
    acc = 100. * correct / total
    print('Test Accuracy: %f %%' % acc)

    if best_acc is None or acc > best_acc:
        best_acc = acc
        if not os.path.exists("models"):
            os.makedirs("models")
        torch.save(cnn.state_dict(), 'models/cnn.pkl')
    else:
        # decay the learning rate and push it into the optimizer;
        # reassigning the Python variable alone would not change Adam's rate
        learning_rate = learning_rate * 0.8
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

print("Best Accuracy: %f %%" % best_acc)
print("Best Model: models/cnn.pkl")