@@ -53,7 +53,7 @@ print(words)
 pos = jiagu.pos(words)  # part-of-speech tagging
 print(pos)
 
-ner = jiagu.ner(text)  # named entity recognition
+ner = jiagu.ner(words)  # named entity recognition
 print(ner)
 ```
@@ -61,7 +61,7 @@ print(ner)
 ```python3
 import jiagu
 
-text = '汉服和服装、知识图谱机器人'
+text = '汉服和服装、维基图谱'
 words = jiagu.cut(text)  # deep-learning-based segmentation
 print(words)
@@ -70,7 +70,7 @@ words = jiagu.seg(text)  # dictionary-based segmentation
 print(words)
 
 # jiagu.load_userdict('dict/user.dict')  # Load a user dictionary; a file path or a word list is supported.
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['汉服和服装'])
 
 words = jiagu.seg(text)  # custom segmentation, effective in dictionary mode
 print(words)
@@ -3,25 +3,25 @@ import jiagu
 # jiagu.init()  # may be initialized manually or dynamically on first use
 
-text = '厦门明天会不会下雨'
+text = '在苏州冻成狗'
 
-words = jiagu.cut(text)  # word segmentation
+words = jiagu.seg(text)  # word segmentation
 print(words)
 
 pos = jiagu.pos(words)  # part-of-speech tagging
 print(pos)
 
-ner = jiagu.ner(text)  # named entity recognition
+ner = jiagu.ner(words)  # named entity recognition
 print(ner)
 
 # dictionary-mode segmentation
-text = '知识图谱机器人'
+text = '思知机器人挺好用的'
 words = jiagu.seg(text)
 print(words)
 
 # jiagu.load_userdict('dict/user.dict')  # Load a user dictionary; a file path or a word list is supported.
-jiagu.load_userdict(['知识图谱'])
+jiagu.load_userdict(['思知机器人'])
 
 words = jiagu.seg(text)
 print(words)
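This hunk reflects the API change in the release: `pos()` and `ner()` now take the token list produced by `cut()`/`seg()` instead of the raw sentence, so every downstream call is word-based. A minimal sketch of the revised call flow, reusing the example sentence from the hunk above:

```python3
import jiagu

text = '在苏州冻成狗'
words = jiagu.seg(text)    # segment first
print(jiagu.pos(words))    # one POS tag per token
print(jiagu.ner(words))    # one NER label per token
```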
@@ -1,13 +1,5 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __init__.py
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
 from jiagu import analyze
 
 any = analyze.Analyze()
@@ -1,11 +1,2 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : __main__.py
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
@@ -1,17 +1,9 @@
 #!/usr/bin/env python3
 # -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : analyze.py - 解析模块
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
 import os
 from jiagu import mmseg
 from jiagu import findword
-from jiagu import bilstm_crf
+from jiagu import perceptron
 from jiagu.textrank import Keywords
 from jiagu.textrank import Summarize
 from jiagu.segment.nroute import Segment
@@ -50,18 +42,18 @@ class Analyze(object):
     def init_cws(self):
         if self.seg_model is None:
-            self.seg_model = bilstm_crf.Predict(add_curr_dir('model/cws.model'))
+            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))
 
     def load_model(self, model_path):
-        self.seg_model = bilstm_crf.Predict(model_path)
+        self.seg_model = perceptron.Perceptron(model_path)
 
     def init_pos(self):
         if self.pos_model is None:
-            self.pos_model = bilstm_crf.Predict(add_curr_dir('model/pos.model'))
+            self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))
 
     def init_ner(self):
         if self.ner_model is None:
-            self.ner_model = bilstm_crf.Predict(add_curr_dir('model/ner.model'))
+            self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))
 
     def init_mmseg(self):
         if self.seg_mmseg is None:
@@ -69,7 +61,7 @@ class Analyze(object):
 
     def init_kg(self):
         if self.kg_model is None:
-            self.kg_model = bilstm_crf.Predict(add_curr_dir('model/kg.model'))
+            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))
 
     @staticmethod
     def __lab2word(sentence, labels):
@@ -97,22 +89,13 @@ class Analyze(object):
     def cws_text(self, sentence):
         if sentence == '':
             return ['']
-        labels = self.seg_model.predict([sentence])[0]
+        labels = self.seg_model.predict(list(sentence))
         return self.__lab2word(sentence, labels)
 
-    def cws_list(self, sentences):
-        text_list = sentences
-        all_labels = self.seg_model.predict(text_list)
-        sent_words = []
-        for ti, text in enumerate(text_list):
-            seg_labels = all_labels[ti]
-            sent_words.append(self.__lab2word(text, seg_labels))
-        return sent_words
-
     def seg(self, sentence):
         return self.seg_nroute.seg(sentence, mode="default")
 
-    def cws(self, sentence, input='text', model='default'):
+    def cws(self, sentence, model='default'):
         """Chinese word segmentation
 
         :param sentence: str or list
@@ -125,54 +108,31 @@ class Analyze(object):
         """
         if model == 'default':
             self.init_cws()
-
-            if input == 'batch':
-                words_list = self.cws_list(sentence)
-                return words_list
-            else:
-                words = self.cws_text(sentence)
-                return words
+            words = self.cws_text(sentence)
+            return words
         elif model == 'mmseg':
             self.init_mmseg()
             words = self.seg_mmseg.cws(sentence)
             return words
         else:
             pass
         return []
 
-    def pos(self, sentence, input='words'):  # the input is a list of words
+    def pos(self, words):  # the input is a list of words
         self.init_pos()
-
-        if input == 'batch':
-            all_labels = self.pos_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.pos_model.predict([sentence])[0]
-            return labels
+        labels = self.pos_model.predict(words)
+        return labels
 
-    def ner(self, sentence, input='text'):  # the input is raw text
+    def ner(self, words):  # the input is a list of words
         self.init_ner()
-
-        if input == 'batch':
-            all_labels = self.ner_model.predict(sentence)
-            return all_labels
-        else:
-            labels = self.ner_model.predict([sentence])[0]
-            return labels
+        labels = self.ner_model.predict(words)
+        return labels
 
-    def knowledge(self, sentence, input='text'):
+    def knowledge(self, text):  # the input is raw text
         self.init_kg()
-
-        if input == 'batch':
-            all_labels = self.kg_model.predict(sentence)
-            result = []
-            for sent, labels in zip(sentence, all_labels):
-                result.append(self.lab2spo(sent, labels))
-            return result
-        else:
-            labels = self.kg_model.predict([sentence])[0]
-            return self.lab2spo(sentence, labels)
+        words = self.seg(text)
+        labels = self.kg_model.predict(words)
+        return self.lab2spo(words, labels)
 
     def keywords(self, text, topkey=5):
         if self.keywords_model == None:
@@ -195,11 +155,11 @@ class Analyze(object):
     def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
         return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)
 
-    def lab2spo(self, text, epp_labels):
+    def lab2spo(self, words, epp_labels):
         subject_list = []  # list holding subject entities
         object_list = []
         index = 0
-        for word, ep in zip(list(text), epp_labels):
+        for word, ep in zip(words, epp_labels):
             if ep[0] == 'B' and ep[2:] == '实体':
                 subject_list.append([word, ep[2:], index])
             elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
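With these changes, `knowledge()` segments the text itself and `lab2spo()` aligns the entity labels (`B-实体`/`I-实体`/`E-实体`) with words instead of individual characters. A minimal usage sketch, assuming the package-level `jiagu.knowledge()` wrapper forwards to `Analyze.knowledge()` like the other wrappers (the sentence is illustrative only):

```python3
import jiagu

# internally: seg(text) -> kg_model.predict(words) -> lab2spo(words, labels)
spo = jiagu.knowledge('魏则西是一名大学生')
print(spo)
```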
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-# -*-coding:utf-8-*-
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : bilstm_crf.py - 预测
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description :
-"""
-import os
-
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
-
-import pickle
-import numpy as np
-import tensorflow as tf
-from tensorflow.contrib.crf import viterbi_decode
-
-
-class Predict(object):
-    def __init__(self, model_file):
-        with open(model_file, 'rb') as f:
-            model, char_to_id, id_to_tag = pickle.load(f)
-
-        self.char_to_id = char_to_id
-        self.id_to_tag = {int(k): v for k, v in id_to_tag.items()}
-        self.num_class = len(self.id_to_tag)
-
-        graph_def = tf.GraphDef()
-        graph_def.ParseFromString(model)
-        with tf.Graph().as_default() as graph:
-            tf.import_graph_def(graph_def, name="prefix")
-
-        self.input_x = graph.get_tensor_by_name("prefix/char_inputs:0")
-        self.lengths = graph.get_tensor_by_name("prefix/lengths:0")
-        self.dropout = graph.get_tensor_by_name("prefix/dropout:0")
-        self.logits = graph.get_tensor_by_name("prefix/project/logits:0")
-        self.trans = graph.get_tensor_by_name("prefix/crf_loss/transitions:0")
-
-        self.sess = tf.Session(graph=graph)
-        self.sess.as_default()
-
-    def decode(self, logits, trans, sequence_lengths, tag_num):
-        small = -1000.0
-        viterbi_sequences = []
-        start = np.asarray([[small] * tag_num + [0]])
-        for logit, length in zip(logits, sequence_lengths):
-            score = logit[:length]
-            pad = small * np.ones([length, 1])
-            score = np.concatenate([score, pad], axis=1)
-            score = np.concatenate([start, score], axis=0)
-            viterbi_seq, viterbi_score = viterbi_decode(score, trans)
-            viterbi_sequences.append(viterbi_seq[1:])
-        return viterbi_sequences
-
-    def predict(self, sents):
-        inputs = []
-        lengths = [len(text) for text in sents]
-        max_len = max(lengths)
-        for sent in sents:
-            sent_ids = [self.char_to_id.get(w) if w in self.char_to_id else self.char_to_id.get("<OOV>") for w in sent]
-            padding = [0] * (max_len - len(sent_ids))
-            sent_ids += padding
-            inputs.append(sent_ids)
-        inputs = np.array(inputs, dtype=np.int32)
-
-        feed_dict = {
-            self.input_x: inputs,
-            self.lengths: lengths,
-            self.dropout: 1.0
-        }
-        logits, trans = self.sess.run([self.logits, self.trans], feed_dict=feed_dict)
-        path = self.decode(logits, trans, lengths, self.num_class)
-        labels = [[self.id_to_tag.get(l) for l in p] for p in path]
-        return labels
@@ -1,15 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : findword.py - 新词发现
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description : 新词发现算法实现
-    special thanks to
-        http://www.matrix67.com/blog/archives/5044
-        https://github.com/zoulala/New_words_find
-"""
 import re
 from math import log
 from collections import Counter
@@ -1,13 +1,5 @@
 #!/usr/bin/env python
 # encoding: utf-8
-"""
- * Copyright (C) 2018 OwnThink.
- *
- * Name : mmseg.py
- * Author : Leo <1162441289@qq.com>
- * Version : 0.01
- * Description : mmseg分词方法,目前算法比较耗时,仍在优化中
-"""
 import os
 import pickle
 from math import log
@@ -0,0 +1,227 @@
+# -*- coding:utf-8 -*-
+import os
+import gzip
+import pickle
+import random
+from collections import defaultdict
+
+
+class AveragedPerceptron(object):
+    def __init__(self):
+        # Each feature gets its own weight vector, so weights is a dict-of-dicts
+        self.weights = {}
+        self.classes = set()
+        # The accumulated values, for the averaging. These will be keyed by
+        # feature/class tuples
+        self._totals = defaultdict(int)
+        # The last time the feature was changed, for the averaging. Also
+        # keyed by feature/class tuples
+        # (tstamps is short for timestamps)
+        self._tstamps = defaultdict(int)
+        # Number of instances seen
+        self.i = 0
+
+    def predict(self, features):
+        '''Dot-product the features and current weights and return the best label.'''
+        scores = defaultdict(float)
+        for feat, value in features.items():
+            if feat not in self.weights or value == 0:
+                continue
+            weights = self.weights[feat]
+            for label, weight in weights.items():
+                scores[label] += value * weight
+        # Do a secondary alphabetic sort, for stability
+        return max(self.classes, key=lambda label: (scores[label], label))
+
+    def update(self, truth, guess, features):
+        '''Update the feature weights.'''
+        def upd_feat(c, f, w, v):
+            param = (f, c)
+            self._totals[param] += (self.i - self._tstamps[param]) * w
+            self._tstamps[param] = self.i
+            self.weights[f][c] = w + v
+
+        self.i += 1
+        if truth == guess:
+            return None
+        for f in features:
+            weights = self.weights.setdefault(f, {})
+            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
+            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
+        return None
+
+    def average_weights(self):
+        '''Average weights from all iterations.'''
+        for feat, weights in self.weights.items():
+            new_feat_weights = {}
+            for clas, weight in weights.items():
+                param = (feat, clas)
+                total = self._totals[param]
+                total += (self.i - self._tstamps[param]) * weight
+                averaged = round(total / float(self.i), 3)
+                if averaged:
+                    new_feat_weights[clas] = averaged
+            self.weights[feat] = new_feat_weights
+        return None
+
+
+class Perceptron:
+    def __init__(self, loc=None):
+        self.START = ['-START-', '-START2-']
+        self.END = ['-END-', '-END2-']
+        self.model = AveragedPerceptron()
+        if loc != None:
+            self.load(loc)
+
+    def predict(self, words):
+        prev, prev2 = self.START
+        labels = []
+        context = self.START + words + self.END
+        for i, word in enumerate(words):
+            features = self._get_features(i, word, context, prev, prev2)
+            tag = self.model.predict(features)
+            labels.append(tag)
+            prev2 = prev
+            prev = tag
+        return labels
+
+    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
+        self._make_tagdict(sentences)
+        for iter_ in range(nr_iter):
+            c = 0
+            n = 0
+            for words, tags in sentences:
+                prev, prev2 = self.START
+                context = self.START + words + self.END
+                for i, word in enumerate(words):
+                    feats = self._get_features(i, word, context, prev, prev2)
+                    guess = self.model.predict(feats)
+                    self.model.update(tags[i], guess, feats)
+                    prev2 = prev
+                    prev = guess
+                    c += guess == tags[i]
+                    n += 1
+            if shuf == True:
+                random.shuffle(sentences)
+            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
+            self.save(save_loc)
+        self.model.average_weights()
+        self.save(save_loc)
+
+    def save(self, loc='model/ap.model', zip=True):
+        if zip == False:
+            pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
+        else:
+            pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
+
+    def load(self, loc='model/ap.model', zip=True):
+        if zip == False:
+            self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
+        else:
+            self.model.weights, self.model.classes = pickle.load(gzip.open(loc, 'rb'))
+
+    def _get_features(self, i, word, context, prev, prev2):
+        '''Map tokens into a feature representation, implemented as a
+        {hashable: float} dict. If the features change, a new model must be
+        trained.
+        '''
+        def add(name, *args):
+            features[' '.join((name,) + tuple(args))] += 1
+
+        i += len(self.START)
+        features = defaultdict(int)
+        # It's useful to have a constant feature, which acts sort of like a prior
+        add('bias')
+        add('i suffix', word[-3:])
+        add('i pref1', word[0])
+        add('i-1 tag', prev)
+        add('i-2 tag', prev2)
+        add('i tag+i-2 tag', prev, prev2)
+        add('i word', context[i])
+        add('i-1 tag+i word', prev, context[i])
+        add('i-1 word', context[i - 1])
+        add('i-1 suffix', context[i - 1][-3:])
+        add('i-2 word', context[i - 2])
+        add('i+1 word', context[i + 1])
+        add('i+1 suffix', context[i + 1][-3:])
+        add('i+2 word', context[i + 2])
+        return features
+
+    def _make_tagdict(self, sentences):
+        '''Make a tag dictionary for single-tag words.'''
+        for words, tags in sentences:
+            for word, tag in zip(words, tags):
+                self.model.classes.add(tag)
+
+
+def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
+    tagger = Perceptron()
+    print('Reading corpus...')
+    training_data = []
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            training_data.append(sentence)
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+
+    print('training corpus size : %d' % len(training_data))
+    print('Start training...')
+    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
+
+
+def eval(filepath='data/test.txt', model='model/ap.model'):
+    tagger = Perceptron(model)
+    print('Start testing...')
+    right = 0.0
+    total = 0.0
+    sentence = ([], [])
+    fin = open(filepath, 'r', encoding='utf8')
+    for index, line in enumerate(fin):
+        line = line.strip()
+        if line == '':
+            words = sentence[0]
+            tags = sentence[1]
+
+            outputs = tagger.predict(words)
+            assert len(tags) == len(outputs)
+            total += len(tags)
+            for o, t in zip(outputs, tags):
+                if o == t: right += 1
+            sentence = ([], [])
+        else:
+            params = line.split()
+            if len(params) != 2: continue
+            sentence[0].append(params[0])
+            sentence[1].append(params[1])
+    fin.close()
+
+    print("Precision : %f" % (right / total))
+
+
+def predict(model='model/ap.model'):
+    tagger = Perceptron(model)
+    while True:
+        text = input('>')
+        words = list(text)
+        labels = tagger.predict(words)
+        for word, label in zip(words, labels):
+            print(word, label)
+
+
+if __name__ == '__main__':
+    train()
+    eval()
+    # predict()
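The new `perceptron` module is self-contained, so the models that analyze.py loads can also be used directly. A minimal sketch of loading a shipped model as a standalone tagger (the model path and the input string are assumptions for illustration):

```python3
from jiagu import perceptron

# Perceptron(loc) loads a gzip-pickled (weights, classes) pair, as saved by Perceptron.save().
tagger = perceptron.Perceptron('jiagu/model/cws.model')
chars = list('今天天气不错')
labels = tagger.predict(chars)          # one tag per character for the segmentation model
print(list(zip(chars, labels)))
```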
@@ -1,13 +1,4 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : textrank.py - 解析
- * Author : zengbin93 <zeng_bin8888@163.com>
- * Version : 0.01
- * Description : TextRank算法实现
-    special thanks to https://github.com/ArtistScript/FastTextRank
-"""
 import sys
 import numpy as np
 from jiagu import utils
@@ -1,16 +1,7 @@
 # -*- encoding:utf-8 -*-
-"""
- * Copyright (C) 2017 OwnThink.
- *
- * Name : utils.py - 解析
- * Author : zengbin93 <zeng_bin8888@163.com>
- * Version : 0.01
- * Description : 常用工具函数
-"""
 import os
 import jiagu
 import math
-import numpy as np
 
 
 def default_stopwords_file():
@@ -138,22 +129,6 @@ def different(scores, old_scores, tol=0.0001):
     return flag
 
 
-def cosine_similarity(vec1, vec2):
-    """计算两个向量的余弦相似度
-
-    :param vec1: list or np.array
-    :param vec2: list or np.array
-    :return: float
-    """
-    tx = np.array(vec1)
-    ty = np.array(vec2)
-    cos1 = np.sum(tx * ty)
-    cos21 = np.sqrt(sum(tx ** 2))
-    cos22 = np.sqrt(sum(ty ** 2))
-    cosine_value = cos1 / float(cos21 * cos22)
-    return cosine_value
-
-
 def combine(word_list, window=2):
     if window < 2:
         window = 2
@@ -3,16 +3,15 @@
 from setuptools import setup
 
 setup(name='jiagu',
-      version='0.1.8',
+      version='0.1.9',
       description='Jiagu Natural Language Processing',
       author='Yener(Zheng Wenyu)',
       author_email='help@ownthink.com',
       url='https://github.com/ownthink/Jiagu',
       license='MIT',
-      install_requires=['tensorflow==1.6.0', 'numpy>=1.12.1'],
       packages=['jiagu'],
       package_dir={'jiagu': 'jiagu'},
       package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
                               'normal/*', 'segment/*', 'segment/dict/*',
                               'sentiment/*', 'sentiment/model/*', 'topic/*']}
       )
@@ -170,6 +170,7 @@ def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
             sentence = ([], [])
         else:
             params = line.split()
+            if len(params) != 2: continue
             sentence[0].append(params[0])
             sentence[1].append(params[1])
     fin.close()