@@ -1,3 +1,5 @@
# Training the Jiagu NLP tool
1. Put your data in the `data` directory; see `data/train.txt` for the expected format.
2. Run `python3 perceptron.py` to train.
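`perceptron.py` reads the corpus as one whitespace-separated `token tag` pair per line, with a blank line ending each sentence. A minimal illustrative `data/train.txt` might look like this (the tokens and tags below are placeholders, not the real tagset):

```
我 B
爱 I
你 I

你 B
好 I
```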
@@ -1,20 +1,13 @@
# -*- coding:utf-8 -*-
"""
Averaged perceptron classifier. Implementation geared for simplicity rather
than efficiency.

Based on Matthew Honnibal's POS tagger:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
"""
import gzip
import pickle
import random
from collections import defaultdict
class AveragedPerceptron(object):
    '''An averaged perceptron, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
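        # e.g. after training, self.weights might look like
        #   {'i suffix ing': {'VBG': 2.1, 'NN': -0.4},
        #    'bias': {'NN': 0.3, 'VB': -0.1}}
        # (illustrative values only, not from a real model)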
@@ -72,27 +65,164 @@ class AveragedPerceptron(object):
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        # Pickle requires the file to be opened in binary mode.
        return pickle.dump(dict(self.weights), open(path, 'wb'))
class Perceptron:

    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc is not None:
            self.load(loc)
    def predict(self, words):
        '''Greedily tag a tokenized sentence, feeding each prediction back
        in as the previous-tag features for the next token.'''
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels
    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        '''Train from ``sentences``, a list of (words, tags) tuples, for
        ``nr_iter`` iterations, optionally shuffling between iterations,
        and save the model to ``save_loc``.'''
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:
                self.save(save_loc)  # checkpoint the raw weights each iteration
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)  # final save holds the averaged weights
    def save(self, loc='model/ap.model', zip=True):
        '''Pickle the weights and classes, gzip-compressed by default.'''
        if not zip:
            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
        else:
            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))

    def load(self, loc='model/ap.model', zip=True):
        '''Load weights and classes pickled by ``save``.'''
        if not zip:
            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
        else:
            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))
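    # Illustrative round trip (paths are the defaults above):
    #   p = Perceptron(); p.train(data, save_loc='model/ap.model')
    #   q = Perceptron('model/ap.model')   # reloads weights and classes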
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
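    # Illustrative output: for words=['我', '爱', '你'] with i=0, prev='-START-'
    # and prev2='-START2-', the returned dict contains keys such as
    #   'bias', 'i word 我', 'i suffix 我', 'i pref1 我',
    #   'i-1 tag -START-', 'i+1 word 爱', 'i+2 word 你'
    # each with count 1 (a subset of the full feature dict).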
    def _make_tagdict(self, sentences):
        '''Collect the set of output classes from the training data.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    '''Read a whitespace-separated ``token tag`` corpus (blank line between
    sentences) and train a model, saving it to ``model``.'''
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            if sentence[0]:
                training_data.append(sentence)
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue  # skip malformed lines
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    if sentence[0]:  # the corpus may not end with a blank line
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
def eval(filepath='data/test.txt', model='model/ap.model'):
    '''Tag the held-out corpus and report per-token accuracy.'''
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            words = sentence[0]
            tags = sentence[1]
            outputs = tagger.predict(words)
            assert len(tags) == len(outputs)
            total += len(tags)
            for o, t in zip(outputs, tags):
                if o == t:
                    right += 1
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print("Precision : %f" % (right / total))
def predict(model='model/ap.model'):
    '''Interactively tag characters typed at the prompt.'''
    tagger = Perceptron(model)
    while True:
        text = input('>')
        words = list(text)  # character-level input
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)
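# Illustrative session (labels depend on the trained tagset):
#   >我爱你
#   我 B
#   爱 I
#   你 I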
# Generic trainer over pre-extracted (features, class) examples; distinct
# from the corpus-level train() above, hence the separate name.
def train_examples(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples``, a list of
    (features, class) pairs, for ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            guess = model.predict(features)
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model


if __name__ == '__main__':
    train()
    eval()
    # predict()
@@ -1,210 +0,0 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
import os
import random
from collections import defaultdict
import pickle
import logging

from AveragedPerceptron import AveragedPerceptron

PICKLE = "data/trontagger-0.1.0.pickle"


class PerceptronTagger():
    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.

    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/

    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)
    def tag(self, corpus):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = lambda t: t.split('\n')
        w_split = lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens
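    # Illustrative call (tags depend on the trained model):
    #   tagger.tag('how are you ?')
    #   => [('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('?', '.')]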
    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.

        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + [self._normalize(w) for w in words] \
                          + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None
    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise IOError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None
    def _normalize(self, word):
        '''Normalization used in pre-processing.

        - All words are lower cased
        - Any four-digit number (e.g. a year) is represented as !YEAR
        - Other digit-initial tokens are represented as !DIGITS

        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()
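    # Illustrative mappings:
    #   'Co-op' -> '!HYPHEN'   '1984' -> '!YEAR'
    #   '3rd'   -> '!DIGITS'   'The'  -> 'the'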
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag
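    # Example: a word seen 500 times and tagged 'DT' every time passes both
    # thresholds (500 >= 20 and 1.0 >= 0.97), so it is frozen into the tagdict;
    # an ambiguous word tagged one way only 60% of the time is left to the model.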
def _pc(n, d):
    '''Return n out of d as a percentage.'''
    return (float(n) / d) * 100
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    tagger = PerceptronTagger(False)
    try:
        tagger.load(PICKLE)
        print(tagger.tag('how are you ?'))
        logging.info('Start testing...')
        right = 0.0
        total = 0.0
        sentence = ([], [])
        for line in open('data/test.txt'):
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
            if params[0] == '.':
                text = ''
                words = sentence[0]
                tags = sentence[1]
                for i, word in enumerate(words):
                    text += word
                    if i < len(words) - 1:  # no trailing space after the last word
                        text += ' '
                outputs = tagger.tag(text)
                assert len(tags) == len(outputs)
                total += len(tags)
                for o, t in zip(outputs, tags):
                    if o[1].strip() == t:
                        right += 1
                sentence = ([], [])
        logging.info("Precision : %f", right / total)
    except IOError:
        logging.info('Reading corpus...')
        training_data = []
        sentence = ([], [])
        for line in open('data/train.txt'):
            params = line.split('\t')
            if len(params) != 2:
                continue  # skip blank or malformed lines
            sentence[0].append(params[0])
            sentence[1].append(params[1].strip())  # drop the trailing newline
            if params[0] == '.':
                training_data.append(sentence)
                sentence = ([], [])
        logging.info('training corpus size : %d', len(training_data))
        logging.info('Start training...')
        tagger.train(training_data, save_loc=PICKLE)