@@ -53,7 +53,7 @@ print(words)
pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)
ner = jiagu.ner(text)  # named entity recognition
ner = jiagu.ner(words)  # named entity recognition
print(ner)
```
@@ -61,7 +61,7 @@ print(ner)
```python3
import jiagu
text = '汉服和服装、知识图谱机器人'
text = '汉服和服装、维基图谱'
words = jiagu.cut(text)  # deep-learning segmentation
print(words)
@@ -70,7 +70,7 @@ words = jiagu.seg(text)  # dictionary-based segmentation
print(words)
# jiagu.load_userdict('dict/user.dict')  # Load a user dictionary; a file path or a word list is accepted.
jiagu.load_userdict(['知识图谱'])
jiagu.load_userdict(['汉服和服装'])
words = jiagu.seg(text)  # custom segmentation; takes effect in dictionary mode
print(words)
@@ -3,25 +3,25 @@ import jiagu
# jiagu.init()  # can be initialized manually, or lazily on first use
text = '厦门明天会不会下雨'
text = '在苏州冻成狗'
words = jiagu.cut(text)  # segmentation
words = jiagu.seg(text)  # segmentation
print(words)
pos = jiagu.pos(words)  # part-of-speech tagging
print(pos)
ner = jiagu.ner(text)  # named entity recognition
ner = jiagu.ner(words)  # named entity recognition
print(ner)
# dictionary-mode segmentation
text = '知识图谱机器人'
text = '思知机器人挺好用的'
words = jiagu.seg(text)
print(words)
# jiagu.load_userdict('dict/user.dict')  # Load a user dictionary; a file path or a word list is accepted.
jiagu.load_userdict(['知识图谱'])
jiagu.load_userdict(['思知机器人'])
words = jiagu.seg(text)
print(words)
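To make the new calling convention above concrete, here is a minimal sketch: segment first, then pass the token list to `pos` and `ner`. The printed labels depend on the bundled models; only the API shape is the point.

```python3
import jiagu

text = '在苏州冻成狗'
words = jiagu.seg(text)   # dictionary-mode segmentation -> list of tokens
pos = jiagu.pos(words)    # POS tags are predicted from the token list
ner = jiagu.ner(words)    # NER now also takes the token list, not the raw text

for word, p, n in zip(words, pos, ner):
    print(word, p, n)
```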
@@ -1,13 +1,5 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : __init__.py
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
from jiagu import analyze
any = analyze.Analyze()
@@ -1,11 +1,2 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : __main__.py
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
@@ -1,17 +1,9 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : analyze.py - analysis module
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
import os
from jiagu import mmseg
from jiagu import findword
from jiagu import bilstm_crf
from jiagu import perceptron
from jiagu.textrank import Keywords
from jiagu.textrank import Summarize
from jiagu.segment.nroute import Segment
@@ -50,18 +42,18 @@ class Analyze(object):
    def init_cws(self):
        if self.seg_model is None:
            self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))
    def load_model(self, model_path):
        self.seg_model = bilstm_crf.Predict(model_path)
        self.seg_model = perceptron.Perceptron(model_path)
    def init_pos(self):
        if self.pos_model is None:
            self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
            self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))
    def init_ner(self):
        if self.ner_model is None:
            self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
            self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))
    def init_mmseg(self):
        if self.seg_mmseg is None:
@@ -69,7 +61,7 @@ class Analyze(object):
    def init_kg(self):
        if self.kg_model is None:
            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))
    @staticmethod
    def __lab2word(sentence, labels):
@@ -97,22 +89,13 @@ class Analyze(object):
    def cws_text(self, sentence):
        if sentence == '':
            return ['']
        labels = self.seg_model.predict([sentence])[0]
        labels = self.seg_model.predict(list(sentence))
        return self.__lab2word(sentence, labels)
    def cws_list(self, sentences):
        text_list = sentences
        all_labels = self.seg_model.predict(text_list)
        sent_words = []
        for ti, text in enumerate(text_list):
            seg_labels = all_labels[ti]
            sent_words.append(self.__lab2word(text, seg_labels))
        return sent_words
    def seg(self, sentence):
        return self.seg_nroute.seg(sentence, mode="default")
    def cws(self, sentence, input='text', model='default'):
    def cws(self, sentence, model='default'):
        """Chinese word segmentation
        :param sentence: str or list
@@ -125,54 +108,31 @@ class Analyze(object):
        """
        if model == 'default':
            self.init_cws()
            if input == 'batch':
                words_list = self.cws_list(sentence)
                return words_list
            else:
                words = self.cws_text(sentence)
                return words
            words = self.cws_text(sentence)
            return words
        elif model == 'mmseg':
            self.init_mmseg()
            words = self.seg_mmseg.cws(sentence)
            return words
        else:
            pass
        return []
    def pos(self, sentence, input='words'):  # takes a list of words
    def pos(self, words):  # takes a list of words
        self.init_pos()
        labels = self.pos_model.predict(words)
        return labels
        if input == 'batch':
            all_labels = self.pos_model.predict(sentence)
            return all_labels
        else:
            labels = self.pos_model.predict([sentence])[0]
            return labels
    def ner(self, sentence, input='text'):  # takes raw text
    def ner(self, words):  # takes a list of words
        self.init_ner()
        labels = self.ner_model.predict(words)
        return labels
        if input == 'batch':
            all_labels = self.ner_model.predict(sentence)
            return all_labels
        else:
            labels = self.ner_model.predict([sentence])[0]
            return labels
    def knowledge(self, sentence, input='text'):
    def knowledge(self, text):  # takes raw text
        self.init_kg()
        if input == 'batch':
            all_labels = self.kg_model.predict(sentence)
            result = []
            for sent, labels in zip(sentence, all_labels):
                result.append(self.lab2spo(sent, labels))
            return result
        else:
            labels = self.kg_model.predict([sentence])[0]
            return self.lab2spo(sentence, labels)
        words = self.seg(text)
        labels = self.kg_model.predict(words)
        return self.lab2spo(words, labels)
    def keywords(self, text, topkey=5):
        if self.keywords_model == None:
@@ -195,11 +155,11 @@ class Analyze(object):
    def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
        return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
    def lab2spo(self, text, epp_labels):
    def lab2spo(self, words, epp_labels):
        subject_list = []  # list holding the extracted entities
        object_list = []
        index = 0
        for word, ep in zip(list(text), epp_labels):
        for word, ep in zip(words, epp_labels):
            if ep[0] == 'B' and ep[2:] == '实体':
                subject_list.append([word, ep[2:], index])
            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : bilstm_crf.py - prediction
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.contrib.crf import viterbi_decode
class Predict(object):
    def __init__(self, model_file):
        with open(model_file, 'rb') as f:
            model, char_to_id, id_to_tag = pickle.load(f)
        self.char_to_id = char_to_id
        self.id_to_tag = {int(k): v for k, v in id_to_tag.items()}
        self.num_class = len(self.id_to_tag)
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(model)
        with tf.Graph().as_default() as graph:
            tf.import_graph_def(graph_def, name="prefix")
        self.input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
        self.lengths = graph.get_tensor_by_name("prefix/lengths:0")
        self.dropout = graph.get_tensor_by_name("prefix/dropout:0")
        self.logits = graph.get_tensor_by_name("prefix/project/logits:0")
        self.trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
        self.sess = tf.Session(graph=graph)
        self.sess.as_default()
    def decode(self, logits, trans, sequence_lengths, tag_num):
        small = -1000.0
        viterbi_sequences = []
        start = np.asarray([[small] * tag_num + [0]])
        for logit, length in zip(logits, sequence_lengths):
            score = logit[:length]
            pad = small * np.ones([length, 1])
            score = np.concatenate([score, pad], axis=1)
            score = np.concatenate([start, score], axis=0)
            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
            viterbi_sequences.append(viterbi_seq[1:])
        return viterbi_sequences
    def predict(self, sents):
        inputs = []
        lengths = [len(text) for text in sents]
        max_len = max(lengths)
        for sent in sents:
            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
            padding = [0] * (max_len - len(sent_ids))
            sent_ids += padding
            inputs.append(sent_ids)
        inputs = np.array(inputs, dtype=np.int32)
        feed_dict = {
            self.input_x: inputs,
            self.lengths: lengths,
            self.dropout: 1.0
        }
        logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
        path = self.decode(logits, trans, lengths, self.num_class)
        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
        return labels
@@ -1,15 +1,4 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : findword.py - new-word discovery
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description : implementation of a new-word discovery algorithm
                special thanks to
                http://www.matrix67.com/blog/archives/5044
                https://github.com/zoulala/New_words_find
"""
import re
from math import log
from collections import Counter
@@ -1,13 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8
"""
* Copyright (C) 2018 OwnThink.
*
* Name : mmseg.py
* Author : Leo <1162441289@qq.com>
* Version : 0.01
* Description : mmseg segmentation method; currently slow and still being optimized
"""
import os
import pickle
from math import log
@@ -0,0 +1,227 @@
# -*- coding:utf-8 -*-
import os
import gzip
import pickle
import random
from collections import defaultdict
class AveragedPerceptron(object):
    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/class tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/class tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0
    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))
    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v
        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None
    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
class Perceptron:
    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc != None:
            self.load(loc)
    def predict(self, words):
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels
    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf == True:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            self.save(save_loc)
        self.model.average_weights()
        self.save(save_loc)
    def save(self, loc='model/ap.model', zip=True):
        if zip == False:
            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
        else:
            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
    def load(self, loc='model/ap.model', zip=True):
        if zip == False:
            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
        else:
            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))
    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1
        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features
    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for index, line in enumerate(fin):
        line = line.strip()
        if line == '':
            training_data.append(sentence)
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2: continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
def eval(filepath='data/test.txt', model='model/ap.model'):
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for index, line in enumerate(fin):
        line = line.strip()
        if line == '':
            words = sentence[0]
            tags = sentence[1]
            outputs = tagger.predict(words)
            assert len(tags) == len(outputs)
            total += len(tags)
            for o, t in zip(outputs, tags):
                if o == t: right += 1
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2: continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    print("Precision : %f" % (right / total))
def predict(model='model/ap.model'):
    tagger = Perceptron(model)
    while True:
        text = input('>')
        words = list(text)
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)
if __name__ == '__main__':
    train()
    eval()
    # predict()
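For a rough sense of how the tagger defined above is driven, here is a toy sketch using a made-up in-memory corpus instead of `data/train.txt`; the sentences, labels, and the `toy.model` path are illustrative only, not part of the project.

```python3
from jiagu.perceptron import Perceptron

# Toy character-level corpus: (characters, segmentation-style labels) pairs.
toy = [
    (list('苏州下雨'), ['B', 'E', 'B', 'E']),
    (list('在苏州'), ['S', 'B', 'E']),
]

tagger = Perceptron()
tagger.train(toy, save_loc='toy.model', nr_iter=3)  # writes a gzipped pickle
print(tagger.predict(list('苏州会下雨')))            # one label per character

reloaded = Perceptron('toy.model')                   # passing a path calls load()
print(reloaded.predict(list('在苏州')))
```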
@@ -1,13 +1,4 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : textrank.py - analysis
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : implementation of the TextRank algorithm
                special thanks to https://github.com/ArtistScript/FastTextRank
"""
import sys
import numpy as np
from jiagu import utils
@@ -1,16 +1,7 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : utils.py - analysis
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : common utility functions
"""
import os
import jiagu
import math
import numpy as np
def default_stopwords_file():
@@ -138,22 +129,6 @@ def different(scores, old_scores, tol=0.0001):
    return flag
def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity of two vectors.
    :param vec1: list or np.array
    :param vec2: list or np.array
    :return: float
    """
    tx = np.array(vec1)
    ty = np.array(vec2)
    cos1 = np.sum(tx * ty)
    cos21 = np.sqrt(sum(tx ** 2))
    cos22 = np.sqrt(sum(ty ** 2))
    cosine_value = cos1 / float(cos21 * cos22)
    return cosine_value
def combine(word_list, window=2):
    if window < 2:
        window = 2
@@ -3,16 +3,15 @@
from setuptools import setup
setup(name='jiagu',
      version='0.1.8',
      version='0.1.9',
      description='Jiagu Natural Language Processing',
      author='Yener(Zheng Wenyu)',
      author_email='help@ownthink.com',
      url='https://github.com/ownthink/Jiagu',
      license='MIT',
      install_requires=['tensorflow==1.6.0', 'numpy>=1.12.1'],
      packages=['jiagu'],
      package_dir={'jiagu': 'jiagu'},
      package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
                              'normal/*', 'segment/*', 'segment/dict/*',
                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
                              'normal/*', 'segment/*', 'segment/dict/*',
                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
      )
@@ -170,6 +170,7 @@ def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2: continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()