@@ -1,5 +1,5 @@
# Jiagu Natural Language Processing Toolkit
>>> Jiagu is built on BiLSTM and related models trained on large-scale corpora. It provides common NLP functions such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, and new word discovery. It was designed with the strengths and weaknesses of the major existing toolkits in mind, and is offered back to the community.
>>> Jiagu is built on BiLSTM and related models trained on large-scale corpora. It provides common NLP functions such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new word discovery, and text clustering. It was designed with the strengths and weaknesses of the major existing toolkits in mind, and is offered back to the community.
## Table of Contents
* [Installation](#安装方式)
@@ -17,6 +17,8 @@
* Keyword extraction
* Text summarization
* New word discovery
* Sentiment analysis
* Text clustering
* And more...
---
@@ -42,7 +44,7 @@ import jiagu
text = '厦门明天会不会下雨'
words = jiagu.seg(text) # word segmentation
words = jiagu.cut(text) # word segmentation
print(words)
pos = jiagu.pos(words) # part-of-speech tagging
@@ -54,35 +56,38 @@ print(ner)
2. Chinese word segmentation
Usage of the different segmentation modes
Custom segmentation models (separate models for the msr, pku, cnc and other segmentation standards will be provided)
```python3
import jiagu
text = '汉服和服装'
# paths of the standard-specific models
# msr:test/extra_data/model/msr.model
# pku:test/extra_data/model/pku.model
# cnc:test/extra_data/model/cnc.model
words = jiagu.seg(text) # default segmentation
print(words)
jiagu.load_model('test/extra_data/model/cnc.model') # use the CNC (State Language Commission) segmentation standard
words = jiagu.seg([text, text, text], input='batch') # batch segmentation for higher throughput
print(words)
words = jiagu.cut('结婚的和尚未结婚的')
words = jiagu.seg(text, model='mmseg') # segment with the mmseg algorithm
print(list(words))
print(words)
```
Custom segmentation models (separate models for the msr, pku, cnc and other segmentation standards will be provided)
Usage of the different segmentation modes
```python3
import jiagu
# paths of the standard-specific models
# msr:test/extra_data/model/msr.model
# pku:test/extra_data/model/pku.model
# cnc:test/extra_data/model/cnc.model
text = '汉服和服装、知识图谱机器人'
jiagu.load_model('test/extra_data/model/cnc.model') # use the CNC (State Language Commission) segmentation standard
words = jiagu.cut(text) # default segmentation
print(words)
words = jiagu.seg(text) # dictionary-based segmentation
print(words)
words = jiagu.seg('结婚的和尚未结婚的')
# jiagu.load_userdict('dict/user.dict') # load a user dictionary; a file path or a list of words is supported
jiagu.load_userdict(['知识图谱'])
words = jiagu.seg(text) # custom words take effect in dictionary segmentation mode
print(words)
```
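`load_userdict` accepts either a dictionary file path or an in-memory list. Going by the `Segment.load_userdict` implementation added in this patch, list entries may also be `[word, freq]` pairs with an explicit frequency, and dictionary files hold one word per line with an optional tab-separated frequency. A hedged sketch (the file name and frequency value are illustrative):
```python3
import jiagu

# jiagu.load_userdict('dict/user.dict') # file form: one word per line, optionally "word<TAB>freq"
jiagu.load_userdict(['知识图谱', ['思知', 10]]) # list form: plain words or [word, freq] pairs

words = jiagu.seg('思知机器人') # custom words take effect in dictionary segmentation mode
print(words)
```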
@@ -132,8 +137,31 @@ import jiagu
jiagu.findword('input.txt', 'output.txt') # discover new words in the text using information entropy
```
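In this patch the underlying `new_word_find` routine exposes its filtering thresholds (`min_freq`, `min_mtro`, `min_entro`) and `Analyze.findword` forwards them, so they can be tuned per corpus. A hedged sketch using the defaults from the new signature; the comments on what each threshold measures are an interpretation of the parameter names, not documented behaviour:
```python3
import jiagu

# min_freq  - minimum raw frequency for a candidate word
# min_mtro  - presumably the minimum cohesion (mutual-information style) score
# min_entro - presumably the minimum left/right boundary entropy
jiagu.findword('input.txt', 'output.txt', min_freq=10, min_mtro=80, min_entro=3)
```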
7. Sentiment analysis
```python3
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)
```
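Judging from the `Analyze.sentiment` method added in this patch, the call returns a `(label, probability)` pair, so it can also be unpacked directly; the exact label strings depend on the shipped `sentiment.model`:
```python3
label, prob = jiagu.sentiment('很讨厌还是个懒鬼')
print(label, prob) # sentiment label plus its probability
```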
8. Text clustering
```python3
docs = [
    "百度深度学习中文情感分析工具Senta试用及在线测试",
    "情感分析是自然语言处理里面一个热门话题",
    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
    "深度学习实践:从零开始做电影评论文本情感分析",
    "BERT相关论文、文章和代码资源汇总",
    "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
    "自然语言处理工具包spaCy介绍",
    "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
print(cluster)
```
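`text_cluster` takes the parameters of the new `Analyze.text_cluster` method (`features_method`, `method`, `k`, `max_iter`, `eps`, `min_pts`), and the results usually need some tuning. A hedged sketch with `docs` as above, assuming the `'k-means'` and `'dbscan'` method names accepted by `jiagu/cluster/text.py`:
```python3
cluster = jiagu.text_cluster(docs, features_method='tfidf', method='k-means', k=2) # fixed number of clusters
print(cluster)

cluster = jiagu.text_cluster(docs, features_method='count', method='dbscan', eps=0.5, min_pts=2) # density-based; cluster count inferred
print(cluster)
```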
### Evaluation
1. msr benchmark results
1. msr benchmark results (old version)

@@ -5,18 +5,29 @@ import jiagu
text = '厦门明天会不会下雨'
words = jiagu.seg(text) # segmentation; use the model argument to choose the mode: default if omitted, 'mmseg' to use the mmseg algorithm
words = jiagu.cut(text) # word segmentation
print(words)
# words = jiagu.seg(text, model="mmseg") # mmseg segmentation returns a generator; convert it with list()
# print(list(words))
pos = jiagu.pos(words) # part-of-speech tagging
print(pos)
ner = jiagu.ner(text) # named entity recognition
print(ner)
# dictionary-mode segmentation
text = '知识图谱机器人'
words = jiagu.seg(text)
print(words)
# jiagu.load_userdict('dict/user.dict') # load a user dictionary; a file path or a list of words is supported
jiagu.load_userdict(['知识图谱'])
words = jiagu.seg(text)
print(words)
text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
NASA埃姆斯研究中心的科学家拉玛·内曼尼(Rama Nemani)说,“这一长期数据能让我们深入分析地表绿化背后的影响因素。我们一开始以为,植被增加是由于更多二氧化碳排放,导致气候更加温暖、潮湿,适宜生长。”
@@ -25,17 +36,38 @@ NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来
据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
'''
keywords = jiagu.keywords(text, 5) # keywords
keywords = jiagu.keywords(text, 5) # keyword extraction
print(keywords)
summarize = jiagu.summarize(text, 3) # summary
summarize = jiagu.summarize(text, 3) # text summarization
print(summarize)
# iagu.findword('input.txt', 'output.txt') # discover new words in a large corpus using information entropy
# jiagu.findword('input.txt', 'output.txt') # discover new words in a large corpus using information entropy
# knowledge-graph relation extraction
text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
knowledge = jiagu.knowledge(text)
print(knowledge)
# sentiment analysis
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)
# text clustering (parameters need tuning)
docs = [
    "百度深度学习中文情感分析工具Senta试用及在线测试",
    "情感分析是自然语言处理里面一个热门话题",
    "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
    "深度学习实践:从零开始做电影评论文本情感分析",
    "BERT相关论文、文章和代码资源汇总",
    "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
    "自然语言处理工具包spaCy介绍",
    "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
print(cluster)
@@ -15,7 +15,7 @@ any = analyze.Analyze()
init = any.init
# word segmentation
seg = any.cws
seg = any.seg
cws = any.cws
cut = any.cws
@@ -29,7 +29,7 @@ ner = any.ner
# parser
# load user dictionary
# load_userdict
load_userdict = any.load_userdict
# custom segmentation model
load_model = any.load_model
@@ -44,4 +44,10 @@ summarize = any.summarize
findword = any.findword
# knowledge graph
knowledge = any.knowledge
knowledge = any.knowledge
# sentiment analysis
sentiment = any.sentiment
# text clustering
text_cluster = any.text_cluster
@@ -14,7 +14,9 @@ from jiagu import findword
from jiagu import bilstm_crf
from jiagu.textrank import Keywords
from jiagu.textrank import Summarize
from jiagu.segment.nroute import Segment
from jiagu.sentiment.bayes import Bayes
from jiagu.cluster.text import text_cluster as cluster
def add_curr_dir(name):
    return os.path.join(os.path.dirname(__file__), name)
@@ -32,11 +34,19 @@ class Analyze(object):
        self.keywords_model = None
        self.summarize_model = None
        self.seg_nroute = Segment()
        self.sentiment_model = Bayes()
    def init(self):
        self.init_cws()
        self.init_pos()
        self.init_ner()
        self.seg_nroute.init()
    def load_userdict(self, userdict):
        self.seg_nroute.load_userdict(userdict)
    def init_cws(self):
        if self.seg_model is None:
@@ -99,6 +109,9 @@ class Analyze(object):
            sent_words.append(self.__lab2word(text, seg_labels))
        return sent_words
    def seg(self, sentence):
        return self.seg_nroute.seg(sentence, mode="default")
    def cws(self, sentence, input='text', model='default'):
"""中文分词 | |||
@@ -171,9 +184,17 @@ class Analyze(object): | |||
self.summarize_model = Summarize(tol=0.0001) | |||
return self.summarize_model.summarize(text, topsen) | |||
def findword(self, input, output): | |||
findword.new_word_find(input, output) | |||
def findword(self, input_file, output_file, min_freq=10, min_mtro=80, min_entro=3): | |||
findword.new_word_find(input_file, output_file, min_freq, min_mtro, min_entro) | |||
def sentiment(self, text): | |||
words = self.seg(text) | |||
ret, prob = self.sentiment_model.classify(words) | |||
return ret, prob | |||
def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2): | |||
return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg) | |||
def lab2spo(self, text, epp_labels): | |||
subject_list = [] # 存放实体的列表 | |||
object_list = [] | |||
@@ -1,5 +1,4 @@
# -*-coding:utf-8-*-
import jiagu
from collections import Counter
import numpy as np
@@ -10,7 +9,7 @@ def elu_distance(a, b):
    return dist
def count_features(corpus, tokenizer=jiagu.cut):
def count_features(corpus, tokenizer=list):
    """Term-frequency (count) features
    :param corpus: list of str
@@ -36,7 +35,7 @@ def count_features(corpus, tokenizer=jiagu.cut):
    return np.array(features), vocab
def tfidf_features(corpus, tokenizer=jiagu.cut):
def tfidf_features(corpus, tokenizer=list):
    """TF-IDF features of the texts
    :param corpus: list of str
@@ -6,7 +6,8 @@ from .dbscan import DBSCAN
from .kmeans import KMeans
def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
def text_cluster(docs, features_method='tfidf', method="dbscan",
                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
    """Text clustering; currently supports the K-Means and DBSCAN methods
    :param features_method: str
@@ -27,9 +28,9 @@ def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_it
        clustering result
    """
    if features_method == 'tfidf':
        features, names = tfidf_features(docs)
        features, names = tfidf_features(docs, tokenizer)
    elif features_method == 'count':
        features, names = count_features(docs)
        features, names = count_features(docs, tokenizer)
    else:
        raise ValueError('features_method error')
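The new `tokenizer` parameter lets callers plug a real segmenter into the feature extractors instead of the character-level default `list`; `Analyze.text_cluster` above wires in `self.seg` the same way. A hedged, illustrative sketch that is not part of this patch (it assumes the `'k-means'` branch further down this module accepts these arguments):
```python3
import jiagu
from jiagu.cluster.text import text_cluster

docs = ['自然语言处理工具包spaCy介绍', 'BERT相关论文、文章和代码资源汇总', '情感分析是自然语言处理里面一个热门话题']
result = text_cluster(docs, features_method='tfidf', method='k-means', k=2, tokenizer=jiagu.seg)
print(result)
```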
@@ -106,11 +106,7 @@ def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq,
    return entro_dict
def new_word_find(input_file, output_file):
    min_freq = 10
    min_mtro = 80
    min_entro = 3
def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
    word_freq = count_words(input_file)
    total_word = sum(word_freq.values())
@@ -0,0 +1,2 @@
思知
@@ -0,0 +1,194 @@
import re
import os
import sys
from math import log
re_eng = re.compile('[a-zA-Z0-9]', re.U)
re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
re_skip = re.compile("(\r\n|\s)", re.U)
class Segment:
    def __init__(self):
        self.vocab = {}
        self.max_word_len = 0
        self.max_freq = 0
        self.total_freq = 0
        self.initialized = False
    def init(self, vocab_path='dict/jiagu.dict'):
        self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
        self.initialized = True
    def load_vocab(self, vocab_path):
        fin = open(vocab_path, 'r', encoding='utf8')
        for index, line in enumerate(fin):
            line = line.strip()
            if line == '':
                continue
            word_freq_tag = line.split('\t')
            if len(word_freq_tag) == 1:
                word = word_freq_tag[0]
                self.add_vocab(word)
            elif len(word_freq_tag) == 2:
                word = word_freq_tag[0]
                freq = int(word_freq_tag[1])
                self.add_vocab(word, freq)
        fin.close()
    def add_vocab(self, word=None, freq=None, tag=None):
        if freq is None:
            freq = self.max_freq
        if word not in self.vocab:
            self.vocab[word] = 0
        self.vocab[word] += freq
        self.total_freq += freq
        if freq > self.max_freq:
            self.max_freq = freq
        if len(word) > self.max_word_len:
            self.max_word_len = len(word)
    def load_userdict(self, userdict):
        if not self.initialized:
            self.init()
        if isinstance(userdict, str):
            self.load_vocab(userdict)
            return  # a file path was loaded; do not fall through and iterate over its characters
        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    word = item[0]
                    self.add_vocab(word)
                elif len(item) == 2:
                    word = item[0]
                    freq = item[1]
                    self.add_vocab(word, freq)
            elif isinstance(item, str):
                self.add_vocab(word=item)
    def calc_route(self, sentence, DAG, route):
        vocab = self.vocab
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total_freq)
        for idx in range(N - 1, -1, -1):
            route[idx] = max((log(vocab.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x) for x in DAG[idx])
    def create_DAG(self, sentence):
        vocab = self.vocab
        max_word_len = self.max_word_len
        DAG = {}
        N = len(sentence)
        for idx in range(N):
            cand_idx = [idx]
            for i in range(idx+1, idx + min(max_word_len, N - idx), 1):
                cand = sentence[idx: i+1]
                if cand in vocab:
                    cand_idx.append(i)
            DAG[idx] = cand_idx
        return DAG
    def cut_search(self, sentence):
        DAG = self.create_DAG(sentence)
        old_j = -1
        for k, L in DAG.items():
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j
    def cut_vocab(self, sentence):
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if buf:
                yield buf
                buf = ''
            yield l_word
            x = y
        if buf:
            yield buf
            buf = ''
    def cut_words(self, sentence):
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
    def seg_default(self, sentence, mode):
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        cut_all = False
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                for word in cut_block(block):
                    yield word
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
    def seg(self, sentence, mode="default"):
        if not self.initialized:
            self.init()
        return list(self.seg_default(sentence, mode=mode))
if __name__ == '__main__':
    s = Segment()
    # s.load_userdict('dict/user.dict')
    s.load_userdict(['知识图谱'])
    text = '知识图谱机器人hello\nworld¥¥'
    words = s.seg(text)
    print(words)
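A toy walk-through of the DAG construction and max-probability routing in `Segment` above; this sketch is illustrative only (not part of the patch), and the tiny vocabulary and frequencies are made up:
```python3
from jiagu.segment.nroute import Segment

s = Segment()
s.initialized = True  # skip loading the packaged dict/jiagu.dict; use only the toy vocabulary below
for word, freq in [('知识', 10), ('图谱', 10), ('知识图谱', 30), ('机器人', 20)]:
    s.add_vocab(word, freq)

sentence = '知识图谱机器人'
print(s.create_DAG(sentence)) # {0: [0, 1, 3], 1: [1], 2: [2, 3], 3: [3], 4: [4, 6], 5: [5], 6: [6]}
print(s.seg(sentence))        # ['知识图谱', '机器人'] -- the route with the highest summed log frequency wins
```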
@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import os
import sys
import gzip
import marshal
@@ -83,6 +84,9 @@ class Bayes(object):
        self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))
    def classify(self, x):
        if self.d == {}:
            self.load(os.path.join(os.path.dirname(__file__), 'model/sentiment.model'))
        tmp = {}
        for k in self.d:
            tmp[k] = log(self.d[k].getsum()) - log(self.total)
@@ -3,7 +3,7 @@
from setuptools import setup
setup(name='jiagu',
      version='0.1.7',
      version='0.1.8',
      description='Jiagu Natural Language Processing',
      author='Yener(Zheng Wenyu)',
      author_email='help@ownthink.com',
@@ -12,5 +12,7 @@ setup(name='jiagu',
      install_requires=['tensorflow>=1.6.0', 'numpy>=1.12.1'],
      packages=['jiagu'],
      package_dir={'jiagu': 'jiagu'},
      package_data={'jiagu': ['*.*', 'model/*', 'data/*']}
      package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
                              'normal/*', 'segment/*', 'segment/dict/*',
                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
      )