
update

master
Yener, 5 years ago
commit 3f79679e60
17 changed files with 261 additions and 221 deletions
  1. README.md (+3, -3)
  2. demo.py (+5, -5)
  3. jiagu/__init__.py (+0, -8)
  4. jiagu/__main__.py (+0, -9)
  5. jiagu/analyze.py (+22, -62)
  6. jiagu/bilstm_crf.py (+0, -77)
  7. jiagu/findword.py (+0, -11)
  8. jiagu/mmseg.py (+0, -8)
  9. jiagu/model/cws.model (BIN)
  10. jiagu/model/kg.model (BIN)
  11. jiagu/model/ner.model (BIN)
  12. jiagu/model/pos.model (BIN)
  13. jiagu/perceptron.py (+227, -0)
  14. jiagu/textrank.py (+0, -9)
  15. jiagu/utils.py (+0, -25)
  16. setup.py (+3, -4)
  17. train/perceptron.py (+1, -0)

README.md (+3, -3)

@@ -53,7 +53,7 @@ print(words)
pos = jiagu.pos(words) # part-of-speech tagging
print(pos)

- ner = jiagu.ner(text) # named entity recognition
+ ner = jiagu.ner(words) # named entity recognition
print(ner)
```

@@ -61,7 +61,7 @@ print(ner)
```python3
import jiagu

- text = '汉服和服装、知识图谱机器人'
+ text = '汉服和服装、维基图谱'

words = jiagu.cut(text) # deep-learning word segmentation
print(words)
@@ -70,7 +70,7 @@ words = jiagu.seg(text) # dictionary-based segmentation
print(words)

# jiagu.load_userdict('dict/user.dict') # load a user dictionary; accepts a dictionary file path or a Python list
- jiagu.load_userdict(['知识图谱'])
+ jiagu.load_userdict(['汉服和服装'])

words = jiagu.seg(text) # custom segmentation; only takes effect in dictionary mode
print(words)


demo.py (+5, -5)

@@ -3,25 +3,25 @@ import jiagu
# jiagu.init() # manual initialization is optional; models can also be loaded on demand


- text = '厦门明天会不会下雨'
+ text = '在苏州冻成狗'

- words = jiagu.cut(text) # word segmentation
+ words = jiagu.seg(text) # word segmentation
print(words)

pos = jiagu.pos(words) # part-of-speech tagging
print(pos)

- ner = jiagu.ner(text) # named entity recognition
+ ner = jiagu.ner(words) # named entity recognition
print(ner)


# dictionary-mode segmentation
- text = '知识图谱机器人'
+ text = '思知机器人挺好用的'
words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict') # load a user dictionary; accepts a dictionary file path or a Python list
- jiagu.load_userdict(['知识图谱'])
+ jiagu.load_userdict(['思知机器人'])

words = jiagu.seg(text)
print(words)
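
The updated demo routes everything through `jiagu.seg` and feeds the segmented words to both `pos` and `ner`. A condensed sketch of the same pipeline, assuming the top-level wrappers shown in demo.py (the sentences are only illustrative):

```python3
import jiagu

text = '在苏州冻成狗'

words = jiagu.seg(text)   # word segmentation
pos = jiagu.pos(words)    # one POS tag per word
ner = jiagu.ner(words)    # NER now takes the word list, not the raw text

for w, p, n in zip(words, pos, ner):
    print(w, p, n)

jiagu.load_userdict(['思知机器人'])   # the user dictionary applies to dictionary-mode segmentation
print(jiagu.seg('思知机器人挺好用的'))
```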


jiagu/__init__.py (+0, -8)

@@ -1,13 +1,5 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : __init__.py
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
from jiagu import analyze

any = analyze.Analyze()


jiagu/__main__.py (+0, -9)

@@ -1,11 +1,2 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : __main__.py
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""


jiagu/analyze.py (+22, -62)

@@ -1,17 +1,9 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : analyze.py - 解析模块
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
import os
from jiagu import mmseg
from jiagu import findword
- from jiagu import bilstm_crf
+ from jiagu import perceptron
from jiagu.textrank import Keywords
from jiagu.textrank import Summarize
from jiagu.segment.nroute import Segment
@@ -50,18 +42,18 @@ class Analyze(object):

def init_cws(self):
if self.seg_model is None:
- self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
+ self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))

def load_model(self, model_path):
- self.seg_model = bilstm_crf.Predict(model_path)
+ self.seg_model = perceptron.Perceptron(model_path)

def init_pos(self):
if self.pos_model is None:
- self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
+ self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))

def init_ner(self):
if self.ner_model is None:
- self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
+ self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))

def init_mmseg(self):
if self.seg_mmseg is None:
@@ -69,7 +61,7 @@ class Analyze(object):

def init_kg(self):
if self.kg_model is None:
- self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
+ self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))

@staticmethod
def __lab2word(sentence, labels):
@@ -97,22 +89,13 @@ class Analyze(object):
def cws_text(self, sentence):
if sentence == '':
return ['']
- labels = self.seg_model.predict([sentence])[0]
+ labels = self.seg_model.predict(list(sentence))
return self.__lab2word(sentence, labels)

- def cws_list(self, sentences):
- text_list = sentences
- all_labels = self.seg_model.predict(text_list)
- sent_words = []
- for ti, text in enumerate(text_list):
- seg_labels = all_labels[ti]
- sent_words.append(self.__lab2word(text, seg_labels))
- return sent_words

def seg(self, sentence):
return self.seg_nroute.seg(sentence, mode="default")
- def cws(self, sentence, input='text', model='default'):
+ def cws(self, sentence, model='default'):
"""中文分词

:param sentence: str or list
@@ -125,54 +108,31 @@ class Analyze(object):
"""
if model == 'default':
self.init_cws()

- if input == 'batch':
- words_list = self.cws_list(sentence)
- return words_list
- else:
- words = self.cws_text(sentence)
- return words
+ words = self.cws_text(sentence)
+ return words
elif model == 'mmseg':
self.init_mmseg()

words = self.seg_mmseg.cws(sentence)
return words
else:
pass
return []

- def pos(self, sentence, input='words'): # the argument is a list of words
+ def pos(self, words): # the argument is a list of words
self.init_pos()
+ labels = self.pos_model.predict(words)
+ return labels

- if input == 'batch':
- all_labels = self.pos_model.predict(sentence)
- return all_labels
- else:
- labels = self.pos_model.predict([sentence])[0]
- return labels

- def ner(self, sentence, input='text'): # the argument is raw text
+ def ner(self, words): # the argument is a list of words
self.init_ner()
+ labels = self.ner_model.predict(words)
+ return labels

- if input == 'batch':
- all_labels = self.ner_model.predict(sentence)
- return all_labels
- else:
- labels = self.ner_model.predict([sentence])[0]
- return labels

- def knowledge(self, sentence, input='text'):
+ def knowledge(self, text): # the argument is raw text
self.init_kg()

- if input == 'batch':
- all_labels = self.kg_model.predict(sentence)
- result = []
- for sent, labels in zip(sentence, all_labels):
- result.append(self.lab2spo(sent, labels))
- return result
- else:
- labels = self.kg_model.predict([sentence])[0]
- return self.lab2spo(sentence, labels)
+ words = self.seg(text)
+ labels = self.kg_model.predict(words)
+ return self.lab2spo(words, labels)
def keywords(self, text, topkey=5):
if self.keywords_model == None:
@@ -195,11 +155,11 @@ class Analyze(object):
def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
- def lab2spo(self, text, epp_labels):
+ def lab2spo(self, words, epp_labels):
subject_list = [] # list that holds the extracted entities
object_list = []
index = 0
- for word, ep in zip(list(text), epp_labels):
+ for word, ep in zip(words, epp_labels):
if ep[0] == 'B' and ep[2:] == '实体':
subject_list.append([word, ep[2:], index])
elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
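
Net effect of the analyze.py hunks: the batch `input=` switch and the BiLSTM-CRF backend are gone, and every tagger runs one sentence at a time through `perceptron.Perceptron`. A rough sketch of the resulting internal call flow, using the class and method names as they appear in this diff (most callers would go through the top-level `jiagu` wrappers instead):

```python3
from jiagu.analyze import Analyze

nlp = Analyze()

words = nlp.cws('汉服和服装', model='default')  # perceptron CWS; model='mmseg' routes through the mmseg segmenter
tags = nlp.pos(words)                           # labels aligned one-to-one with the words
entities = nlp.ner(words)                       # likewise expects the word list, not raw text
spo = nlp.knowledge('汉服和服装')               # segments internally, then calls lab2spo(words, labels)
```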


jiagu/bilstm_crf.py (+0, -77)

@@ -1,77 +0,0 @@
#!/usr/bin/env python3
# -*-coding:utf-8-*-
"""
* Copyright (C) 2018 OwnThink.
*
* Name : bilstm_crf.py - prediction
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description :
"""
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.contrib.crf import viterbi_decode


class Predict(object):
def __init__(self, model_file):
with open(model_file, 'rb') as f:
model, char_to_id, id_to_tag = pickle.load(f)

self.char_to_id = char_to_id
self.id_to_tag = {int(k): v for k, v in id_to_tag.items()}
self.num_class = len(self.id_to_tag)

graph_def = tf.GraphDef()
graph_def.ParseFromString(model)

with tf.Graph().as_default() as graph:
tf.import_graph_def(graph_def, name="prefix")

self.input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
self.lengths = graph.get_tensor_by_name("prefix/lengths:0")
self.dropout = graph.get_tensor_by_name("prefix/dropout:0")
self.logits = graph.get_tensor_by_name("prefix/project/logits:0")
self.trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")

self.sess = tf.Session(graph=graph)
self.sess.as_default()

def decode(self, logits, trans, sequence_lengths, tag_num):
small = -1000.0
viterbi_sequences = []
start = np.asarray([[small] * tag_num + [0]])
for logit, length in zip(logits, sequence_lengths):
score = logit[:length]
pad = small * np.ones([length, 1])
score = np.concatenate([score, pad], axis=1)
score = np.concatenate([start, score], axis=0)
viterbi_seq, viterbi_score = viterbi_decode(score, trans)
viterbi_sequences.append(viterbi_seq[1:])
return viterbi_sequences

def predict(self, sents):
inputs = []
lengths = [len(text) for text in sents]
max_len = max(lengths)

for sent in sents:
sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
padding = [0] * (max_len - len(sent_ids))
sent_ids += padding
inputs.append(sent_ids)
inputs = np.array(inputs, dtype=np.int32)

feed_dict = {
self.input_x: inputs,
self.lengths: lengths,
self.dropout: 1.0
}

logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
path = self.decode(logits, trans, lengths, self.num_class)
labels = [[self.id_to_tag.get(l) for l in p] for p in path]
return labels

jiagu/findword.py (+0, -11)

@@ -1,15 +1,4 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : findword.py - new word discovery
* Author : Yener <yener@ownthink.com>
* Version : 0.01
* Description : implementation of a new-word discovery algorithm
special thanks to
http://www.matrix67.com/blog/archives/5044
https://github.com/zoulala/New_words_find
"""
import re
from math import log
from collections import Counter


jiagu/mmseg.py (+0, -8)

@@ -1,13 +1,5 @@
#!/usr/bin/env python
# encoding: utf-8
"""
* Copyright (C) 2018 OwnThink.
*
* Name : mmseg.py
* Author : Leo <1162441289@qq.com>
* Version : 0.01
* Description : mmseg segmentation method; the algorithm is currently slow and still being optimized
"""
import os
import pickle
from math import log


jiagu/model/cws.model (BIN)


jiagu/model/kg.model (BIN)


jiagu/model/ner.model (BIN)


jiagu/model/pos.model (BIN)


jiagu/perceptron.py (+227, -0)

@@ -0,0 +1,227 @@
# -*- coding:utf-8 -*-
import os
import gzip
import pickle
import random
from collections import defaultdict

class AveragedPerceptron(object):
def __init__(self):
# Each feature gets its own weight vector, so weights is a dict-of-dicts
self.weights = {}
self.classes = set()
# The accumulated values, for the averaging. These will be keyed by
# feature/clas tuples
self._totals = defaultdict(int)
# The last time the feature was changed, for the averaging. Also
# keyed by feature/clas tuples
# (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0

def predict(self, features):
'''Dot-product the features and current weights and return the best label.'''
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features):
'''Update the feature weights.'''
def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v

self.i += 1
if truth == guess:
return None
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
return None

def average_weights(self):
'''Average weights from all iterations.'''
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
return None

class Perceptron:
def __init__(self, loc=None):
self.START = ['-START-', '-START2-']
self.END = ['-END-', '-END2-']
self.model = AveragedPerceptron()
if loc is not None:
self.load(loc)

def predict(self, words):
prev, prev2 = self.START
labels = []
context = self.START + words + self.END
for i, word in enumerate(words):
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
labels.append(tag)
prev2 = prev
prev = tag
return labels
def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
self._make_tagdict(sentences)
for iter_ in range(nr_iter):
c = 0
n = 0
for words, tags in sentences:
prev, prev2 = self.START
context = self.START + words + self.END
for i, word in enumerate(words):
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)

prev2 = prev
prev = guess
c += guess == tags[i]
n += 1
if shuf:
random.shuffle(sentences)
print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
self.save(save_loc)
self.model.average_weights()
self.save(save_loc)
def save(self, loc='model/ap.model', zip=True):
if not zip:
pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
else:
pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
def load(self, loc='model/ap.model', zip=True):
if not zip:
self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
else:
self.model.weights, self.model.classes = pickle.load(gzip.open(loc,'rb'))
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.
'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1

i += len(self.START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i - 1])
add('i-1 suffix', context[i - 1][-3:])
add('i-2 word', context[i - 2])
add('i+1 word', context[i + 1])
add('i+1 suffix', context[i + 1][-3:])
add('i+2 word', context[i + 2])
return features

def _make_tagdict(self, sentences):
'''Collect the set of output tags (classes) seen in the training data.'''
for words, tags in sentences:
for word, tag in zip(words, tags):
self.model.classes.add(tag)
def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
tagger = Perceptron()
print('Reading corpus...')
training_data = []
sentence = ([], [])
fin = open(filepath, 'r', encoding='utf8')
for index, line in enumerate(fin):
line = line.strip()
if line == '':
training_data.append(sentence)
sentence = ([], [])
else:
params = line.split()
if len(params) != 2: continue
sentence[0].append(params[0])
sentence[1].append(params[1])
fin.close()
print('training corpus size : %d' % len(training_data))
print('Start training...')
tagger.train(training_data, save_loc=model, nr_iter=nr_iter)

def eval(filepath='data/test.txt', model='model/ap.model'):
tagger = Perceptron(model)
print('Start testing...')
right = 0.0
total = 0.0
sentence = ([], [])
fin = open(filepath, 'r', encoding='utf8')
for index, line in enumerate(fin):
line = line.strip()
if line == '':
words = sentence[0]
tags = sentence[1]
outputs = tagger.predict(words)
assert len(tags) == len(outputs)
total += len(tags)
for o, t in zip(outputs, tags):
if o == t: right += 1
sentence = ([], [])
else:
params = line.split()
if len(params) != 2: continue
sentence[0].append(params[0])
sentence[1].append(params[1])
fin.close()
print("Precision : %f", right / total)
def predict(model='model/ap.model'):
tagger = Perceptron(model)

while True:
text = input('>')
words = list(text)
labels = tagger.predict(words)
for word, label in zip(words, labels):
print(word, label)

if __name__ == '__main__':
train()
eval()
# predict()
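
The new jiagu/perceptron.py is a plain averaged perceptron tagger: one weight dictionary per feature, lazily accumulated totals for averaging, and greedy left-to-right decoding over character-window features. A toy sketch of driving it directly; the labels and in-memory training data below are made up for illustration, whereas the module-level train() above reads whitespace-separated "token tag" pairs, one per line, with blank lines between sentences:

```python3
from jiagu.perceptron import Perceptron

# Two tiny character-level sentences with hypothetical B/E-style labels.
train_data = [
    (['下', '雨'], ['B', 'E']),
    (['厦', '门'], ['B', 'E']),
]

tagger = Perceptron()
tagger.train(train_data, save_loc='toy.model', nr_iter=3)  # writes a gzip-pickled model
print(tagger.predict(['下', '雨']))                         # e.g. ['B', 'E']
```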


jiagu/textrank.py (+0, -9)

@@ -1,13 +1,4 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : textrank.py - analysis
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : TextRank algorithm implementation
special thanks to https://github.com/ArtistScript/FastTextRank
"""
import sys
import numpy as np
from jiagu import utils


jiagu/utils.py (+0, -25)

@@ -1,16 +1,7 @@
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : utils.py - common utilities
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : common utility functions
"""
import os
import jiagu
import math
import numpy as np
def default_stopwords_file():
@@ -138,22 +129,6 @@ def different(scores, old_scores, tol=0.0001):
return flag
def cosine_similarity(vec1, vec2):
"""计算两个向量的余弦相似度
:param vec1: list or np.array
:param vec2: list or np.array
:return: float
"""
tx = np.array(vec1)
ty = np.array(vec2)
cos1 = np.sum(tx * ty)
cos21 = np.sqrt(sum(tx ** 2))
cos22 = np.sqrt(sum(ty ** 2))
cosine_value = cos1 / float(cos21 * cos22)
return cosine_value
def combine(word_list, window=2):
if window < 2:
window = 2


setup.py (+3, -4)

@@ -3,16 +3,15 @@
from setuptools import setup

setup(name='jiagu',
- version='0.1.8',
+ version='0.1.9',
description='Jiagu Natural Language Processing',
author='Yener(Zheng Wenyu)',
author_email='help@ownthink.com',
url='https://github.com/ownthink/Jiagu',
license='MIT',
- install_requires=['tensorflow==1.6.0', 'numpy>=1.12.1'],
packages=['jiagu'],
package_dir={'jiagu': 'jiagu'},
package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
- 'normal/*', 'segment/*', 'segment/dict/*',
- 'sentiment/*', 'sentiment/model/*', 'topic/*']}
+ 'normal/*', 'segment/*', 'segment/dict/*',
+ 'sentiment/*', 'sentiment/model/*', 'topic/*']}
)

train/perceptron.py (+1, -0)

@@ -170,6 +170,7 @@ def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
sentence = ([], [])
else:
params = line.split()
+ if len(params) != 2: continue
sentence[0].append(params[0])
sentence[1].append(params[1])
fin.close()

