
update

master · Yener committed 5 years ago · parent commit 7ee0f29bb1
5 changed files with 27 additions and 299 deletions

  1. demo.py                      +5   -2
  2. jiagu/analyze.py             +5   -53
  3. jiagu/model/kg.model         BIN
  4. jiagu/segment/nroute.py      +17  -17
  5. jiagu/segment/perceptron.py  +0   -227

demo.py  (+5 -2)

@@ -3,11 +3,14 @@ import jiagu
# jiagu.init()  # can be initialized manually, or lazily on first use


-text = '在苏州冻成狗'
+text = '思知机器人挺好用的'

words = jiagu.seg(text)  # word segmentation
print(words)

words = jiagu.cut(text)  # word segmentation
print(words)

pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)

@@ -47,7 +50,7 @@ print(summarize)


# knowledge-graph relation extraction
-text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
+text = '姚明1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
knowledge = jiagu.knowledge(text)
print(knowledge)



jiagu/analyze.py  (+5 -53)

@@ -35,17 +35,16 @@ class Analyze(object):
        self.init_cws()
        self.init_pos()
        self.init_ner()
+        self.seg_nroute.init()

    def load_userdict(self, userdict):
        self.seg_nroute.load_userdict(userdict)

    def init_cws(self):
        if self.seg_model is None:
            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))
-        self.seg_nroute.init()

    def load_model(self, model_path):
-        self.seg_model = perceptron.Perceptron(model_path)
+        pass

    def init_pos(self):
        if self.pos_model is None:
@@ -63,58 +62,11 @@ class Analyze(object):
        if self.kg_model is None:
            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))

-    @staticmethod
-    def __lab2word(sentence, labels):
-        sen_len = len(sentence)
-        tmp_word = ""
-        words = []
-        for i in range(sen_len):
-            label = labels[i]
-            w = sentence[i]
-            if label == "B":
-                tmp_word += w
-            elif label == "M":
-                tmp_word += w
-            elif label == "E":
-                tmp_word += w
-                words.append(tmp_word)
-                tmp_word = ""
-            else:
-                if tmp_word != '':
-                    words.append(tmp_word)
-                    tmp_word = ""
-                words.append(w)
-        if tmp_word:
-            words.append(tmp_word)
-        return words
-
-    def cws_text(self, sentence):
-        if sentence == '':
-            return ['']
-        sentence = list(sentence)
-        labels = self.seg_model.predict(sentence)
-        return self.__lab2word(sentence, labels)

    def seg(self, sentence):
        return self.seg_nroute.seg(sentence, mode="default")

-    def cws(self, sentence, model='default'):
-        """Chinese word segmentation
-
-        :param sentence: str or list
-            a text or a list of texts, depending on the input mode
-        :param model: str
-            the segmentation mode; 'default' is the default mode and includes new-word discovery
-        :return:
-        """
-        if model == 'default':
-            self.init_cws()
-            words = self.cws_text(sentence)
-            return words
-        else:
-            pass
-        return []
+    def cws(self, sentence, mode='probe'):
+        return self.seg_nroute.seg(sentence, mode)

    def pos(self, words):  # takes a list of words
        self.init_pos()
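
A minimal sketch of how the two segmentation entry points behave after this
change, assuming Analyze exposes an init() that runs the loaders shown in the
first hunk (init_cws/init_pos/init_ner plus seg_nroute.init()); the mode
strings come straight from the diff, and "probe" is assumed to be the
new-word-discovery mode exercised in jiagu/segment/nroute.py below:

from jiagu.analyze import Analyze

analyzer = Analyze()
analyzer.init()  # assumed initializer; loads the CWS/POS/NER models and the nroute segmenter

print(analyzer.seg('思知机器人挺好用的'))  # delegates to seg_nroute.seg(..., mode="default")
print(analyzer.cws('思知机器人挺好用的'))  # delegates to seg_nroute.seg(..., mode="probe")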


jiagu/model/kg.model  (BIN)


jiagu/segment/nroute.py  (+17 -17)

@@ -243,29 +243,29 @@ class Segment:
                continue
            if re_han.match(block):
                words1 = list(cut_block(block))
-                print(words1)
+                # print(words1)

                words2 = self.model_cut(block)
-                print(words2)
+                # print(words2)

-                # new_word = []  # skip conflicting n-grams and those longer than 4; remember to delete them after adding
-                # length = len(words1)
-                # for n in range(3):
-                #     can_limit = length - n + 1
-                #     for i in range(0, can_limit):
-                #         ngram = ''.join(words1[i:i + n])
-                #         word_len = len(ngram)
-                #         if word_len > 4 or word_len == 1:
-                #             continue
-                #         if ngram in words2 and ngram not in words1:
-                #             # print(ngram)
-                #             new_word.append([ngram, 1])
+                new_word = []  # skip conflicting n-grams and those longer than 4; remember to delete them after adding
+                length = len(words1)
+                for n in range(3):
+                    can_limit = length - n + 1
+                    for i in range(0, can_limit):
+                        ngram = ''.join(words1[i:i + n])
+                        word_len = len(ngram)
+                        if word_len > 4 or word_len == 1:
+                            continue
+                        if ngram in words2 and ngram not in words1:
+                            # print(ngram)
+                            new_word.append([ngram, 1])
-                new_word = []
-                for word in words2:
-                    if word not in words1 and len(word) > 1 and len(word) < 4:  # and not re_eng.match(word):
-                        new_word.append([word, 1])
+                # new_word = []
+                # for word in words2:
+                #     if word not in words1 and len(word) > 1 and len(word) < 4:  # and not re_eng.match(word):
+                #         new_word.append([word, 1])
                self.load_userdict(new_word)
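
The block this hunk re-enables is the n-gram probing step: n-grams assembled
from the dictionary segmentation (words1) are promoted to user-dictionary
entries when the model segmentation (words2) contains them as whole words and
they pass the length filter. A self-contained sketch of that logic, with a
hypothetical input where the model glues '思' and '知' into one word:

def probe_new_words(words1, words2):
    # words1: dictionary-based cut; words2: character-model cut
    new_word = []
    length = len(words1)
    for n in range(3):
        for i in range(0, length - n + 1):
            ngram = ''.join(words1[i:i + n])
            if len(ngram) == 1 or len(ngram) > 4:  # same length filter as above
                continue
            if ngram in words2 and ngram not in words1:
                new_word.append([ngram, 1])
    return new_word

print(probe_new_words(['思', '知', '机器人'], ['思知', '机器人']))  # [['思知', 1]]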


jiagu/segment/perceptron.py  (+0 -227)

@@ -1,227 +0,0 @@
# -*- coding:utf-8 -*-
import os
import gzip
import pickle
import random
from collections import defaultdict


class AveragedPerceptron(object):
    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
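
# A sketch, not from the original file: the _totals/_tstamps pair above does
# lazy averaging. A weight's contribution to the running total is only settled
# when it changes (in update) or at the end (in average_weights), by crediting
# the old value for the number of instances it survived. For a single
# (feature, class) weight:

_total, _tstamp, _w = 0.0, 0, 0.0

def _settle(new_w, step):
    # credit the old weight for every instance since it last changed
    global _total, _tstamp, _w
    _total += (step - _tstamp) * _w
    _tstamp, _w = step, new_w

_settle(1.0, 3)   # weight becomes 1.0 at instance 3 (adds 3 * 0.0)
_settle(2.0, 7)   # 1.0 held for instances 3..6, so adds 4 * 1.0
_i = 10           # averaging after 10 instances
assert (_total + (_i - _tstamp) * _w) / _i == 1.0   # (4.0 + 3 * 2.0) / 10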

class Perceptron:
    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc != None:
            self.load(loc)

    def predict(self, words):
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)

                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf == True:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            self.save(save_loc)
        self.model.average_weights()
        self.save(save_loc)

    def save(self, loc='model/ap.model', zip=True):
        if zip == False:
            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
        else:
            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))

    def load(self, loc='model/ap.model', zip=True):
        if zip == False:
            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
        else:
            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
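
    # A hedged illustration, not from the original file: for words =
    # ['在', '苏', '州'] with i = 1, prev = 'S', prev2 = '-START-', the padded
    # context is ['-START-', '-START2-', '在', '苏', '州', '-END-', '-END2-']
    # and _get_features returns counts of 1 for keys such as:
    #   'bias', 'i word 苏', 'i-1 word 在', 'i+1 word 州', 'i+2 word -END-',
    #   'i-1 tag S', 'i-2 tag -START-', 'i tag+i-2 tag S -START-',
    #   'i-1 tag+i word S 苏', 'i-2 word -START2-'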

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)


def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for index, line in enumerate(fin):
        line = line.strip()
        if line == '':
            training_data.append(sentence)
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2: continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
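
# A hedged illustration, not from the original file, of the corpus layout
# train() and eval() expect: one token and one tag per whitespace-separated
# line, with a blank line closing each sentence. The B/M/E/S tags are assumed
# from the CWS usage:
#
#     在 S
#     苏 B
#     州 E
#     冻 S
#     成 S
#     狗 S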

def eval(filepath='data/test.txt', model='model/ap.model'):
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for index, line in enumerate(fin):
        line = line.strip()
        if line == '':
            words = sentence[0]
            tags = sentence[1]
            outputs = tagger.predict(words)
            assert len(tags) == len(outputs)
            total += len(tags)
            for o, t in zip(outputs, tags):
                if o == t: right += 1
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2: continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print("Precision : %f" % (right / total))


def predict(model='model/ap.model'):
    tagger = Perceptron(model)

    while True:
        text = input('>')
        words = list(text)
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)


if __name__ == '__main__':
    train()
    eval()
    # predict()

