@@ -1,3 +1,5 @@
# Training the Jiagu NLP tool
1. Put your data in the `data` directory; see `data/train.txt` for the expected format.
2. Run `python3 perceptron.py` to train.
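`perceptron.py` reads the corpus as one whitespace-separated `token tag` pair per line, with a blank line ending each sentence. A minimal illustrative `data/train.txt` might look like this (the tokens and tags below are placeholders, not the real tagset):

```
我 B
爱 I
你 I

你 B
好 I
```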
@@ -1,20 +1,13 @@
# -*- coding:utf-8 -*-
"""
Averaged perceptron classifier. Implementation geared for simplicity rather
than efficiency.

Based on Matthew Honnibal's POS tagger:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
"""
import gzip
import pickle
import random
from collections import defaultdict
class AveragedPerceptron(object):
    '''An averaged perceptron, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
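        # e.g. after training, self.weights might look like
        #   {'i suffix ing': {'VBG': 2.1, 'NN': -0.4},
        #    'bias': {'NN': 0.3, 'VB': -0.1}}
        # (illustrative values only, not from a real model)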
@@ -72,27 +65,164 @@ class AveragedPerceptron(object):
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        # Pickle requires the file to be opened in binary mode.
        return pickle.dump(dict(self.weights), open(path, 'wb'))
class Perceptron:

    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc is not None:
            self.load(loc)
    def predict(self, words):
        '''Greedily tag a tokenized sentence, feeding each prediction back
        in as the previous-tag features for the next token.'''
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels
    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        '''Train from ``sentences``, a list of (words, tags) tuples, for
        ``nr_iter`` iterations, optionally shuffling between iterations,
        and save the model to ``save_loc``.'''
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:
                self.save(save_loc)  # checkpoint the raw weights each iteration
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)  # final save holds the averaged weights
    def save(self, loc='model/ap.model', zip=True):
        '''Pickle the weights and classes, gzip-compressed by default.'''
        if not zip:
            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
        else:
            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))

    def load(self, loc='model/ap.model', zip=True):
        '''Load weights and classes pickled by ``save``.'''
        if not zip:
            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
        else:
            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))
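    # Illustrative round trip (paths are the defaults above):
    #   p = Perceptron(); p.train(data, save_loc='model/ap.model')
    #   q = Perceptron('model/ap.model')   # reloads weights and classes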
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
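    # Illustrative output: for words=['我', '爱', '你'] with i=0, prev='-START-'
    # and prev2='-START2-', the returned dict contains keys such as
    #   'bias', 'i word 我', 'i suffix 我', 'i pref1 我',
    #   'i-1 tag -START-', 'i+1 word 爱', 'i+2 word 你'
    # each with count 1 (a subset of the full feature dict).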
    def _make_tagdict(self, sentences):
        '''Collect the set of output classes from the training data.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    '''Read a whitespace-separated ``token tag`` corpus (blank line between
    sentences) and train a model, saving it to ``model``.'''
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            if sentence[0]:
                training_data.append(sentence)
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue  # skip malformed lines
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    if sentence[0]:  # the corpus may not end with a blank line
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
def eval(filepath='data/test.txt', model='model/ap.model'):
    '''Tag the held-out corpus and report per-token accuracy.'''
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            words = sentence[0]
            tags = sentence[1]
            outputs = tagger.predict(words)
            assert len(tags) == len(outputs)
            total += len(tags)
            for o, t in zip(outputs, tags):
                if o == t:
                    right += 1
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print("Precision : %f" % (right / total))
def predict(model='model/ap.model'):
    '''Interactively tag characters typed at the prompt.'''
    tagger = Perceptron(model)
    while True:
        text = input('>')
        words = list(text)  # character-level input
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)
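# Illustrative session (labels depend on the trained tagset):
#   >我爱你
#   我 B
#   爱 I
#   你 I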
# Generic trainer over pre-extracted (features, class) examples; distinct
# from the corpus-level train() above, hence the separate name.
def train_examples(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples``, a list of
    (features, class) pairs, for ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            guess = model.predict(features)
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model


if __name__ == '__main__':
    train()
    eval()
    # predict()
@@ -1,210 +0,0 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
import os
import random
from collections import defaultdict
import pickle
import logging

from AveragedPerceptron import AveragedPerceptron

PICKLE = "data/trontagger-0.1.0.pickle"


class PerceptronTagger():
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)
    def tag(self, corpus):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = lambda t: t.split('\n')
        w_split = lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
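    # Illustrative call (tags depend on the trained model):
    #   tagger.tag('how are you ?')
    #   => [('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')]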
    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + [self._normalize(w) for w in words] \
                          + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None
    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise IOError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None
    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Any four-digit number (e.g. a year) is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()
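    # Illustrative mappings:
    #   'Co-op' -> '!HYPHEN'   '1984' -> '!YEAR'
    #   '3rd'   -> '!DIGITS'   'The'  -> 'the'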
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
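    # Example: a word seen 500 times and tagged 'DT' every time passes both
    # thresholds (500 >= 20 and 1.0 >= 0.97), so it is frozen into the tagdict;
    # an ambiguous word tagged one way only 60% of the time is left to the model.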
def _pc(n, d):
    '''Return n out of d as a percentage.'''
    return (float(n) / d) * 100
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    tagger = PerceptronTagger(False)
    try:
        tagger.load(PICKLE)
        print(tagger.tag('how are you ?'))
        logging.info('Start testing...')
        right = 0.0
        total = 0.0
        sentence = ([], [])
        for line in open('data/test.txt'):
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
            if params[0] == '.':
                text = ''
                words = sentence[0]
                tags = sentence[1]
                for i, word in enumerate(words):
                    text += word
                    if i < len(words) - 1:  # no trailing space after the last word
                        text += ' '
                outputs = tagger.tag(text)
                assert len(tags) == len(outputs)
                total += len(tags)
                for o, t in zip(outputs, tags):
                    if o[1].strip() == t:
                        right += 1
                sentence = ([], [])
        logging.info("Precision : %f", right / total)
    except IOError:
        logging.info('Reading corpus...')
        training_data = []
        sentence = ([], [])
        for line in open('data/train.txt'):
            params = line.split('\t')
            if len(params) != 2:
                continue  # skip blank or malformed lines
            sentence[0].append(params[0])
            sentence[1].append(params[1].strip())  # drop the trailing newline
            if params[0] == '.':
                training_data.append(sentence)
                sentence = ([], [])
        logging.info('training corpus size : %d', len(training_data))
        logging.info('Start training...')
        tagger.train(training_data, save_loc=PICKLE)