# -*- coding:utf-8 -*-

import os
import gzip
import pickle
import random
from collections import defaultdict


class AveragedPerceptron(object):

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict of dicts.
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These are keyed by
        # (feature, class) tuples.
        self._totals = defaultdict(int)
        # The last time each weight was changed, for the averaging. Also
        # keyed by (feature, class) tuples.
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen.
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability.
        return max(self.classes, key=lambda label: (scores[label], label))
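
    # For illustration (hypothetical weights): with
    #   self.weights == {'bias': {'NN': 0.2, 'VV': -0.1}, 'i word cat': {'NN': 1.5}}
    # and features == {'bias': 1, 'i word cat': 1}, the scores come out as
    # NN = 0.2 + 1.5 = 1.7 and VV = -0.1, so predict() returns 'NN'.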

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            # Before moving a weight, bank the value it has held since it was
            # last touched; this keeps the averaging O(1) per update.
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None
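
    # For example, if truth is 'NN' but guess is 'VV', every feature present
    # in the instance has its 'NN' weight raised by 1.0 and its 'VV' weight
    # lowered by 1.0; when the guess is correct, no weights change.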

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
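
    # Worked example of the averaging bookkeeping (hypothetical numbers):
    # if a weight was set to 2.0 while self.i == 3 and never touched again,
    # then at self.i == 10 average_weights() adds (10 - 3) * 2.0 = 14.0 to
    # that parameter's running total before dividing by 10, so the stretch
    # during which the weight sat unchanged still counts toward the average.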


class Perceptron:

    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()

        if loc is not None:
            self.load(loc)

    def predict(self, words):
        '''Tag a sentence greedily, left to right, feeding each predicted
        tag back in as context for the next word.'''
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels
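
    # Illustrative usage (assumes a trained model on disk; the output tags
    # depend entirely on the training data):
    #
    #     tagger = Perceptron('model/ap.model')
    #     tagger.predict(['I', 'like', 'cats'])  # -> one tag per token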

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        '''Train for nr_iter passes over sentences, updating on every error.'''
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)

                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf:
                random.shuffle(sentences)

            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:
                # Checkpoint the raw weights after each pass.
                self.save(save_loc)

        self.model.average_weights()
        if save_loc is not None:
            # Overwrite the checkpoint with the averaged weights.
            self.save(save_loc)

    def save(self, loc='model/ap.model', zipped=True):
        '''Pickle the model weights and classes, gzip-compressed by default.'''
        # Create the target directory if needed, so the default
        # 'model/ap.model' path works on a fresh checkout.
        d = os.path.dirname(loc)
        if d:
            os.makedirs(d, exist_ok=True)
        opener = gzip.open if zipped else open
        with opener(loc, 'wb') as fout:
            pickle.dump((self.model.weights, self.model.classes), fout)

    def load(self, loc='model/ap.model', zipped=True):
        '''Restore weights and classes written by save().'''
        opener = gzip.open if zipped else open
        with opener(loc, 'rb') as fin:
            self.model.weights, self.model.classes = pickle.load(fin)
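
    # The saved file is just a pickled (weights, classes) tuple, so a model
    # written with the default zipped=True can also be inspected by hand:
    #
    #     import gzip, pickle
    #     weights, classes = pickle.load(gzip.open('model/ap.model', 'rb'))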

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior.
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
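
    # For illustration: the first word of ['a', 'b', 'c'] (i=0, so the padded
    # context is ['-START-', '-START2-', 'a', 'b', 'c', '-END-', '-END2-'])
    # yields keys such as 'bias', 'i word a', 'i-1 tag -START-',
    # 'i+1 word b' and 'i+2 word c', each with count 1.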

    def _make_tagdict(self, sentences):
        '''Collect the set of output tag classes from the training data.'''
        for words, tags in sentences:
            for tag in tags:
                self.model.classes.add(tag)


def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                if sentence[0]:
                    training_data.append(sentence)
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    if sentence[0]:
        # Keep the last sentence if the file does not end with a blank line.
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
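
# The reader above expects one whitespace-separated "token tag" pair per
# line, with a blank line between sentences, e.g. (the tags are made up;
# any string tags work):
#
#     I       PN
#     like    VV
#     cats    NN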


def eval(filepath='data/test.txt', model='model/ap.model'):
    tagger = Perceptron(model)

    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    with open(filepath, 'r', encoding='utf8') as fin:
        for line in fin:
            line = line.strip()
            if line == '':
                words, tags = sentence
                if words:
                    outputs = tagger.predict(words)
                    assert len(tags) == len(outputs)
                    total += len(tags)
                    for o, t in zip(outputs, tags):
                        if o == t:
                            right += 1
                sentence = ([], [])
            else:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
    if sentence[0]:
        # Score the last sentence if the file does not end with a blank line.
        outputs = tagger.predict(sentence[0])
        total += len(sentence[1])
        right += sum(o == t for o, t in zip(outputs, sentence[1]))
    print("Accuracy : %f" % (right / total))


def predict(model='model/ap.model'):
    tagger = Perceptron(model)

    while True:
        text = input('>')
        # Split the input into single characters: this interactive mode
        # treats each character as one token.
        words = list(text)
        labels = tagger.predict(words)

        for word, label in zip(words, labels):
            print(word, label)


if __name__ == '__main__':
    train()
    eval()
    # predict()