diff --git a/README.md b/README.md
index 363e252..34327a3 100644
--- a/README.md
+++ b/README.md
@@ -53,7 +53,7 @@ print(words)
 pos = jiagu.pos(words) # 词性标注
 print(pos)
 
-ner = jiagu.ner(text) # 命名实体识别
+ner = jiagu.ner(words) # 命名实体识别
 print(ner)
 ```
 
@@ -61,7 +61,7 @@ print(ner)
 ```python3
 import jiagu
 
-text = '汉服和服装、知识图谱机器人'
+text = '汉服和服装、维基图谱'
 
 words = jiagu.cut(text) # 深度学习分词
 print(words)
@@ -70,7 +70,7 @@ words = jiagu.seg(text) # 字典分词
 print(words)
 
 # jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['汉服和服装'])
 
 words = jiagu.seg(text) # 自定义分词,字典分词模式有效
 print(words)
diff --git a/demo.py b/demo.py
index a2e0c70..32d6b1a 100644
--- a/demo.py
+++ b/demo.py
@@ -3,25 +3,25 @@ import jiagu
 
 # jiagu.init() # 可手动初始化,也可以动态初始化
 
-text = '厦门明天会不会下雨'
+text = '在苏州冻成狗'
 
-words = jiagu.cut(text) # 分词
+words = jiagu.seg(text) # 分词
 print(words)
 
 pos = jiagu.pos(words) # 词性标注
 print(pos)
 
-ner = jiagu.ner(text) # 命名实体识别
+ner = jiagu.ner(words) # 命名实体识别
 print(ner)
 
 
 # 字典模式分词
-text = '知识图谱机器人'
+text = '思知机器人挺好用的'
 words = jiagu.seg(text)
 print(words)
 
 # jiagu.load_userdict('dict/user.dict') # 加载自定义字典,支持字典路径、字典列表形式。
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['思知机器人'])
 
 words = jiagu.seg(text)
 print(words)
diff --git a/jiagu/__init__.py b/jiagu/__init__.py
index 5307580..13597f7 100644
--- a/jiagu/__init__.py
+++ b/jiagu/__init__.py
@@ -1,13 +1,5 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __init__.py
- * Author : Yener
- * Version : 0.01
- * Description :
-"""
 from jiagu import analyze
 
 any = analyze.Analyze()
diff --git a/jiagu/__main__.py b/jiagu/__main__.py
index 68e374c..fe02def 100644
--- a/jiagu/__main__.py
+++ b/jiagu/__main__.py
@@ -1,11 +1,2 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __main__.py
- * Author : Yener
- * Version : 0.01
- * Description :
-"""
-
diff --git a/jiagu/analyze.py b/jiagu/analyze.py
index 395dd38..58508de 100644
--- a/jiagu/analyze.py
+++ b/jiagu/analyze.py
@@ -1,17 +1,9 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : analyze.py - 解析模块
- * Author : Yener
- * Version : 0.01
- * Description :
-"""
 import os
 from jiagu import mmseg
 from jiagu import findword
-from jiagu import bilstm_crf
+from jiagu import perceptron
 from jiagu.textrank import Keywords
 from jiagu.textrank import Summarize
 from jiagu.segment.nroute import Segment
@@ -50,18 +42,18 @@ class Analyze(object):
 
     def init_cws(self):
         if self.seg_model is None:
-            self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
+            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))
 
     def load_model(self, model_path):
-        self.seg_model = bilstm_crf.Predict(model_path)
+        self.seg_model = perceptron.Perceptron(model_path)
 
     def init_pos(self):
         if self.pos_model is None:
-            self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
+            self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))
 
     def init_ner(self):
         if self.ner_model is None:
-            self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
+            self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))
 
     def init_mmseg(self):
         if self.seg_mmseg is None:
@@ -69,7 +61,7 @@ class Analyze(object):
 
     def init_kg(self):
         if self.kg_model is None:
-            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
+            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))
 
     @staticmethod
     def __lab2word(sentence, labels):
@@ -97,22 +89,13 @@ class Analyze(object):
     def cws_text(self, sentence):
         if sentence == '':
             return ['']
-        labels = self.seg_model.predict([sentence])[0]
+        labels = self.seg_model.predict(list(sentence))
         return self.__lab2word(sentence, labels)
 
-    def cws_list(self, sentences):
-        text_list = sentences
-        all_labels = self.seg_model.predict(text_list)
-        sent_words = []
-        for ti, text in enumerate(text_list):
-            seg_labels = all_labels[ti]
-            sent_words.append(self.__lab2word(text, seg_labels))
-        return sent_words
-
     def seg(self, sentence):
         return self.seg_nroute.seg(sentence, mode="default")
 
-    def cws(self, sentence, input='text', model='default'):
+    def cws(self, sentence, model='default'):
         """中文分词
 
         :param sentence: str or list
@@ -125,54 +108,31 @@ class Analyze(object):
         """
         if model == 'default':
             self.init_cws()
-
-            if input == 'batch':
-                words_list = self.cws_list(sentence)
-                return words_list
-            else:
-                words = self.cws_text(sentence)
-                return words
+            words = self.cws_text(sentence)
+            return words
         elif model == 'mmseg':
             self.init_mmseg()
-
             words = self.seg_mmseg.cws(sentence)
             return words
         else:
             pass
         return []
 
-    def pos(self, sentence, input='words'):  # 传入的是词语
+    def pos(self, words):  # 传入的是词语
         self.init_pos()
+        labels = self.pos_model.predict(words)
+        return labels
 
-        if input == 'batch':
-            all_labels = self.pos_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.pos_model.predict([sentence])[0]
-            return labels
-
-    def ner(self, sentence, input='text'):  # 传入的是文本
+    def ner(self, words):  # 传入的是词语
        self.init_ner()
+        labels = self.ner_model.predict(words)
+        return labels
 
-        if input == 'batch':
-            all_labels = self.ner_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.ner_model.predict([sentence])[0]
-            return labels
-
-    def knowledge(self, sentence, input='text'):
+    def knowledge(self, text):  # 传入的是文本
         self.init_kg()
-
-        if input == 'batch':
-            all_labels = self.kg_model.predict(sentence)
-            result = []
-            for sent, labels in zip(sentence, all_labels):
-                result.append(self.lab2spo(sent, labels))
-            return result
-        else:
-            labels = self.kg_model.predict([sentence])[0]
-            return self.lab2spo(sentence, labels)
+        words = self.seg(text)
+        labels = self.kg_model.predict(words)
+        return self.lab2spo(words, labels)
 
     def keywords(self, text, topkey=5):
         if self.keywords_model == None:
@@ -195,11 +155,11 @@ class Analyze(object):
     def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
         return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
 
-    def lab2spo(self, text, epp_labels):
+    def lab2spo(self, words, epp_labels):
         subject_list = []  # 存放实体的列表
         object_list = []
         index = 0
-        for word, ep in zip(list(text), epp_labels):
+        for word, ep in zip(words, epp_labels):
             if ep[0] == 'B' and ep[2:] == '实体':
                 subject_list.append([word, ep[2:], index])
             elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
diff --git a/jiagu/bilstm_crf.py b/jiagu/bilstm_crf.py
deleted file mode 100644
index 56e6cf0..0000000
--- a/jiagu/bilstm_crf.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : bilstm_crf.py - 预测
- * Author : Yener
- * Version : 0.01
- * Description :
-"""
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-import pickle
-import numpy as np
-import tensorflow as tf
-from tensorflow.contrib.crf import viterbi_decode
-
-
-class Predict(object):
-    def __init__(self, model_file):
-        with open(model_file, 'rb') as f:
-            model, char_to_id, id_to_tag = pickle.load(f)
-
-        self.char_to_id = char_to_id
-        self.id_to_tag = {int(k): v for k, v in id_to_tag.items()}
-        self.num_class = len(self.id_to_tag)
-
-        graph_def = tf.GraphDef()
-        graph_def.ParseFromString(model)
-
-        with tf.Graph().as_default() as graph:
-            tf.import_graph_def(graph_def, name="prefix")
-
-        self.input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
-        self.lengths = graph.get_tensor_by_name("prefix/lengths:0")
-        self.dropout = graph.get_tensor_by_name("prefix/dropout:0")
-        self.logits = graph.get_tensor_by_name("prefix/project/logits:0")
-        self.trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
-
-        self.sess = tf.Session(graph=graph)
-        self.sess.as_default()
-
-    def decode(self, logits, trans, sequence_lengths, tag_num):
-        small = -1000.0
-        viterbi_sequences = []
-        start = np.asarray([[small] * tag_num + [0]])
-        for logit, length in zip(logits, sequence_lengths):
-            score = logit[:length]
-            pad = small * np.ones([length, 1])
-            score = np.concatenate([score, pad], axis=1)
-            score = np.concatenate([start, score], axis=0)
-            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
-            viterbi_sequences.append(viterbi_seq[1:])
-        return viterbi_sequences
-
-    def predict(self, sents):
-        inputs = []
-        lengths = [len(text) for text in sents]
-        max_len = max(lengths)
-
-        for sent in sents:
-            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("") for w in sent]
-            padding = [0] * (max_len - len(sent_ids))
-            sent_ids += padding
-            inputs.append(sent_ids)
-        inputs = np.array(inputs, dtype=np.int32)
-
-        feed_dict = {
-            self.input_x: inputs,
-            self.lengths: lengths,
-            self.dropout: 1.0
-        }
-
-        logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
-        path = self.decode(logits, trans, lengths, self.num_class)
-        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
-        return labels
diff --git a/jiagu/findword.py b/jiagu/findword.py
index c954e1b..bad7b75 100644
--- a/jiagu/findword.py
+++ b/jiagu/findword.py
@@ -1,15 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : findword.py - 新词发现
- * Author : Yener
- * Version : 0.01
- * Description : 新词发现算法实现
- special thanks to
- http://www.matrix67.com/blog/archives/5044
- https://github.com/zoulala/New_words_find
-"""
 import re
 from math import log
 from collections import Counter
diff --git a/jiagu/mmseg.py b/jiagu/mmseg.py
index a2e6234..c154ef2 100644
--- a/jiagu/mmseg.py
+++ b/jiagu/mmseg.py
@@ -1,13 +1,5 @@
 #!/usr/bin/env python
 # encoding: utf-8
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : mmseg.py
- * Author : Leo <1162441289@qq.com>
- * Version : 0.01
- * Description : mmseg分词方法,目前算法比较耗时,仍在优化中
-"""
 import os
 import pickle
 from math import log
diff --git a/jiagu/model/cws.model b/jiagu/model/cws.model
index 2992ea1..c56c9d0 100644
Binary files a/jiagu/model/cws.model and b/jiagu/model/cws.model differ
diff --git a/jiagu/model/kg.model b/jiagu/model/kg.model
index 26cf41e..5a24946 100644
Binary files a/jiagu/model/kg.model and b/jiagu/model/kg.model differ
diff --git a/jiagu/model/ner.model b/jiagu/model/ner.model
index 43e33f4..4e79d67 100644
Binary files a/jiagu/model/ner.model and b/jiagu/model/ner.model differ
diff --git a/jiagu/model/pos.model b/jiagu/model/pos.model
index 4f6f46e..6870c5d 100644
Binary files a/jiagu/model/pos.model and b/jiagu/model/pos.model differ
diff --git a/jiagu/perceptron.py b/jiagu/perceptron.py
new file mode 100644
index 0000000..3ab7584
--- /dev/null
+++ b/jiagu/perceptron.py
@@ -0,0 +1,227 @@
+# -*- coding:utf-8 -*-
+import os
+import gzip
+import pickle
+import random
+from collections import defaultdict
+
+class AveragedPerceptron(object):
+    def __init__(self):
+        # Each feature gets its own weight vector, so weights is a dict-of-dicts
+        self.weights = {}
+        self.classes = set()
+        # The accumulated values, for the averaging. These will be keyed by
+        # feature/class tuples
+        self._totals = defaultdict(int)
+        # The last time the feature was changed, for the averaging. Also
+        # keyed by feature/class tuples
+        # (tstamps is short for timestamps)
+        self._tstamps = defaultdict(int)
+        # Number of instances seen
+        self.i = 0
+
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if feat not in self.weights or value == 0:
+                continue
+            weights = self.weights[feat]
+            for label, weight in weights.items():
+                scores[label] += value * weight
+        # Do a secondary alphabetic sort, for stability
+        return max(self.classes, key=lambda label: (scores[label], label))
+
+    def update(self, truth, guess, features):
+        '''Update the feature weights.'''
+        def upd_feat(c, f, w, v):
+            param = (f, c)
+            self._totals[param] += (self.i - self._tstamps[param]) * w
+            self._tstamps[param] = self.i
+            self.weights[f][c] = w + v
+
+        self.i += 1
+        if truth == guess:
+            return None
+        for f in features:
+            weights = self.weights.setdefault(f, {})
+            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
+            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
+        return None
+
+    def average_weights(self):
+        '''Average weights from all iterations.'''
+        for feat, weights in self.weights.items():
+            new_feat_weights = {}
+            for clas, weight in weights.items():
+                param = (feat, clas)
+                total = self._totals[param]
+                total += (self.i - self._tstamps[param]) * weight
+                averaged = round(total / float(self.i), 3)
+                if averaged:
+                    new_feat_weights[clas] = averaged
+            self.weights[feat] = new_feat_weights
+        return None
+
+class Perceptron:
+    def __init__(self, loc=None):
+        self.START = ['-START-', '-START2-']
+        self.END = ['-END-', '-END2-']
+        self.model = AveragedPerceptron()
+
+        if loc is not None:
+            self.load(loc)
+
+    def predict(self, words):
+        prev, prev2 = self.START
+        labels = []
+        context = self.START + words + self.END
+        for i, word in enumerate(words):
+            features = self._get_features(i, word, context, prev, prev2)
+            tag = self.model.predict(features)
+            labels.append(tag)
+            prev2 = prev
+            prev = tag
+        return labels
+
+    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
+        self._make_tagdict(sentences)
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for words, tags in sentences:
+                prev, prev2 = self.START
+                context = self.START + words + self.END
+                for i, word in enumerate(words):
+                    feats = self._get_features(i, word, context, prev, prev2)
+                    guess = self.model.predict(feats)
+                    self.model.update(tags[i], guess, feats)
+
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            if shuf:
+                random.shuffle(sentences)
+
+            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
+            if save_loc: self.save(save_loc)
+
+        self.model.average_weights()
+        if save_loc: self.save(save_loc)
+
+    def save(self, loc='model/ap.model', zip=True):
+        if not zip:
+            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
+        else:
+            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
+
+    def load(self, loc='model/ap.model', zip=True):
+        if not zip:
+            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
+        else:
+            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))
+
+    def _get_features(self, i, word, context, prev, prev2):
+        '''Map tokens into a feature representation, implemented as a
+        {hashable: float} dict. If the features change, a new model must be
+        trained.
+        '''
+        def add(name, *args):
+            features[' '.join((name,) + tuple(args))] += 1
+
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature, which acts sort of like a prior
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i - 1])
+        add('i-1 suffix', context[i - 1][-3:])
+        add('i-2 word', context[i - 2])
+        add('i+1 word', context[i + 1])
+        add('i+1 suffix', context[i + 1][-3:])
+        add('i+2 word', context[i + 2])
+        return features
+
+    def _make_tagdict(self, sentences):
+        '''Register every tag seen in the training data as an output class.'''
+        for words, tags in sentences:
+            for word, tag in zip(words, tags):
+                self.model.classes.add(tag)
+
+def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
+    tagger = Perceptron()
+    print('Reading corpus...')
+    training_data = []
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            training_data.append(sentence)
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+    print('training corpus size : %d' % len(training_data))
+    print('Start training...')
+    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
+
+def eval(filepath='data/test.txt', model='model/ap.model'):
+    tagger = Perceptron(model)
+
+    print('Start testing...')
+    right = 0.0
+    total = 0.0
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            words = sentence[0]
+            tags = sentence[1]
+            outputs = tagger.predict(words)
+            assert len(tags) == len(outputs)
+            total += len(tags)
+            for o, t in zip(outputs, tags):
+                if o == t: right += 1
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+    print("Precision : %f" % (right / total))
+
+def predict(model='model/ap.model'):
+    tagger = Perceptron(model)
+
+    while True:
+        text = input('>')
+        words = list(text)
+        labels = tagger.predict(words)
+
+        for word, label in zip(words, labels):
+            print(word, label)
+
+
+if __name__ == '__main__':
+    train()
+    eval()
+    # predict()
+
+
+
+
+
+
\ No newline at end of file
diff --git a/jiagu/textrank.py b/jiagu/textrank.py
index 1e469e1..80f8f66 100644
--- a/jiagu/textrank.py
+++ b/jiagu/textrank.py
@@ -1,13 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : textrank.py - 解析
- * Author : zengbin93
- * Version : 0.01
- * Description : TextRank算法实现
- special thanks to https://github.com/ArtistScript/FastTextRank
-"""
 import sys
 import numpy as np
 from jiagu import utils
diff --git a/jiagu/utils.py b/jiagu/utils.py
index 304fa9e..286715e 100644
--- a/jiagu/utils.py
+++ b/jiagu/utils.py
@@ -1,16 +1,7 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : utils.py - 解析
- * Author : zengbin93
- * Version : 0.01
- * Description : 常用工具函数
-"""
 import os
 import jiagu
 import math
-import numpy as np
 
 
 def default_stopwords_file():
@@ -138,22 +129,6 @@ def different(scores, old_scores, tol=0.0001):
     return flag
 
 
-def cosine_similarity(vec1, vec2):
-    """计算两个向量的余弦相似度
-
-    :param vec1: list or np.array
-    :param vec2: list or np.array
-    :return: float
-    """
-    tx = np.array(vec1)
-    ty = np.array(vec2)
-    cos1 = np.sum(tx * ty)
-    cos21 = np.sqrt(sum(tx ** 2))
-    cos22 = np.sqrt(sum(ty ** 2))
-    cosine_value = cos1 / float(cos21 * cos22)
-    return cosine_value
-
-
 def combine(word_list, window=2):
     if window < 2:
         window = 2
diff --git a/setup.py b/setup.py
index f8eda34..b475dac 100644
--- a/setup.py
+++ b/setup.py
@@ -3,16 +3,15 @@ from setuptools import setup
 
 setup(name='jiagu',
-      version='0.1.8',
+      version='0.1.9',
       description='Jiagu Natural Language Processing',
       author='Yener(Zheng Wenyu)',
       author_email='help@ownthink.com',
       url='https://github.com/ownthink/Jiagu',
       license='MIT',
-      install_requires=['tensorflow==1.6.0', 'numpy>=1.12.1'],
       packages=['jiagu'],
       package_dir={'jiagu': 'jiagu'},
       package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
-                              'normal/*', 'segment/*', 'segment/dict/*',
-                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
+                             'normal/*', 'segment/*', 'segment/dict/*',
+                             'sentiment/*', 'sentiment/model/*', 'topic/*']}
       )
diff --git a/train/perceptron.py b/train/perceptron.py
index 921cee9..346e1b5 100644
--- a/train/perceptron.py
+++ b/train/perceptron.py
@@ -170,6 +170,7 @@ def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
             sentence = ([], [])
         else:
             params = line.split()
+            if len(params) != 2: continue
             sentence[0].append(params[0])
             sentence[1].append(params[1])
     fin.close()
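
Taken together, the patch replaces the TensorFlow BiLSTM-CRF backend with a pure-Python averaged perceptron (hence the dropped `tensorflow==1.6.0` requirement in `setup.py`) and changes the tagging APIs from batch text input to single word lists: `pos` and `ner` now consume the tokens produced by segmentation instead of raw text. A minimal sketch of the updated call pattern, assuming the bundled models load as in `demo.py`:

```python3
import jiagu

text = '在苏州冻成狗'

words = jiagu.seg(text)  # segment first
print(words)

print(jiagu.pos(words))  # POS tagging now takes the word list
print(jiagu.ner(words))  # NER likewise takes words, not raw text
```

For retraining, `perceptron.train()` reads a plain-text corpus with one whitespace-separated `token tag` pair per line and a blank line between sentences; lines that do not split into exactly two fields are now skipped in both `jiagu/perceptron.py` and `train/perceptron.py`. The tag set is simply whatever appears in the corpus; the BMES-style segmentation labels below are illustrative only:

```
在 S
苏 B
州 E
冻 S
成 S
狗 S
```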