
add text_cluster sentiment

Branch: master · Yener, 5 years ago · commit 5a93561384
15 changed files with 301607 additions and 388079 deletions
  1. README.md (+46, -18)
  2. demo.py (+39, -7)
  3. jiagu/__init__.py (+9, -3)
  4. jiagu/analyze.py (+25, -4)
  5. jiagu/cluster/base.py (+2, -3)
  6. jiagu/cluster/text.py (+4, -3)
  7. jiagu/data/Trie.pkl (BIN)
  8. jiagu/data/chars.dic (+0, -12638)
  9. jiagu/findword.py (+1, -5)
  10. jiagu/segment/dict/jiagu.dict (+301277, -375396)
  11. jiagu/segment/dict/user.dict (+2, -0)
  12. jiagu/segment/nroute.py (+194, -0)
  13. jiagu/sentiment/bayes.py (+4, -0)
  14. jiagu/sentiment/model/sentiment.model (+0, -0)
  15. setup.py (+4, -2)

README.md (+46, -18)

@@ -1,5 +1,5 @@
# Jiagu Natural Language Processing Toolkit
>>> Jiagu is built on BiLSTM and other models trained on large-scale corpora. It provides common NLP functions such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge graph relation extraction, keyword extraction, text summarization and new word discovery. It draws on the strengths and weaknesses of the major toolkits and is given back to the community.
>>> Jiagu is built on BiLSTM and other models trained on large-scale corpora. It provides common NLP functions such as Chinese word segmentation, part-of-speech tagging, named entity recognition, knowledge graph relation extraction, keyword extraction, text summarization, new word discovery, sentiment analysis and text clustering. It draws on the strengths and weaknesses of the major toolkits and is given back to the community.

## Table of Contents
* [Installation](#安装方式)
@@ -17,6 +17,8 @@
* Keyword extraction
* Text summarization
* New word discovery
* Sentiment analysis
* Text clustering
* and more...

---
@@ -42,7 +44,7 @@ import jiagu

text = '厦门明天会不会下雨'

words = jiagu.seg(text) # word segmentation
words = jiagu.cut(text) # word segmentation
print(words)

pos = jiagu.pos(words) # part-of-speech tagging
@@ -54,35 +56,38 @@ print(ner)

2. Chinese word segmentation

Usage of the different segmentation modes
Custom segmentation models (separate msr, pku and cnc segmentation standards will be provided)
```python3
import jiagu

text = '汉服和服装'
# paths of the standalone standard models
# msr: test/extra_data/model/msr.model
# pku: test/extra_data/model/pku.model
# cnc: test/extra_data/model/cnc.model

words = jiagu.seg(text) # default segmentation
print(words)
jiagu.load_model('test/extra_data/model/cnc.model') # use the CNC (State Language Commission) segmentation standard

words = jiagu.seg([text, text, text], input='batch') # batch segmentation, for speed
print(words)
words = jiagu.cut('结婚的和尚未结婚的')

words = jiagu.seg(text, model='mmseg') # segment with the mmseg algorithm
print(list(words))
print(words)
```

Custom segmentation models (separate msr, pku and cnc segmentation standards will be provided)
Usage of the different segmentation modes
```python3
import jiagu

# paths of the standalone standard models
# msr: test/extra_data/model/msr.model
# pku: test/extra_data/model/pku.model
# cnc: test/extra_data/model/cnc.model
text = '汉服和服装、知识图谱机器人'

jiagu.load_model('test/extra_data/model/cnc.model') # use the CNC (State Language Commission) segmentation standard
words = jiagu.cut(text) # default segmentation
print(words)

words = jiagu.seg(text) # dictionary-based segmentation
print(words)

words = jiagu.seg('结婚的和尚未结婚的')
# jiagu.load_userdict('dict/user.dict') # load a user dictionary; a file path or a list of words is supported
jiagu.load_userdict(['知识图谱'])

words = jiagu.seg(text) # custom segmentation; takes effect in dictionary mode
print(words)
```

@@ -132,8 +137,31 @@ import jiagu
jiagu.findword('input.txt', 'output.txt') # new word discovery from text, using information entropy
```

7. Sentiment analysis
```python3
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)
```

8. Text clustering
```python3
docs = [
"百度深度学习中文情感分析工具Senta试用及在线测试",
"情感分析是自然语言处理里面一个热门话题",
"AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
"深度学习实践:从零开始做电影评论文本情感分析",
"BERT相关论文、文章和代码资源汇总",
"将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
"自然语言处理工具包spaCy介绍",
"现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
print(cluster)
```

### Evaluation
1. msr test results
1. msr test results (old version)

![msr](https://github.com/ownthink/evaluation/blob/master/images/2.png)



demo.py (+39, -7)

@@ -5,18 +5,29 @@ import jiagu

text = '厦门明天会不会下雨'

words = jiagu.seg(text) # segmentation; the model parameter selects the mode: default if omitted, 'mmseg' for the mmseg algorithm
words = jiagu.cut(text) # word segmentation
print(words)

# words = jiagu.seg(text, model="mmseg") # mmseg segmentation returns a generator; convert it with list()
# print(list(words))

pos = jiagu.pos(words) # part-of-speech tagging
print(pos)

ner = jiagu.ner(text) # named entity recognition
print(ner)


# dictionary-mode segmentation
text = '知识图谱机器人'
words = jiagu.seg(text)
print(words)

# jiagu.load_userdict('dict/user.dict') # load a user dictionary; a file path or a list of words is supported
jiagu.load_userdict(['知识图谱'])

words = jiagu.seg(text)
print(words)



text = '''
该研究主持者之一、波士顿大学地球与环境科学系博士陈池(音)表示,“尽管中国和印度国土面积仅占全球陆地的9%,但两国为这一绿化过程贡献超过三分之一。考虑到人口过多的国家一般存在对土地过度利用的问题,这个发现令人吃惊。”
NASA埃姆斯研究中心的科学家拉玛·内曼尼(Rama Nemani)说,“这一长期数据能让我们深入分析地表绿化背后的影响因素。我们一开始以为,植被增加是由于更多二氧化碳排放,导致气候更加温暖、潮湿,适宜生长。”
@@ -25,17 +36,38 @@ NASA文章介绍,在中国为全球绿化进程做出的贡献中,有42%来
据观察者网过往报道,2017年我国全国共完成造林736.2万公顷、森林抚育830.2万公顷。其中,天然林资源保护工程完成造林26万公顷,退耕还林工程完成造林91.2万公顷。京津风沙源治理工程完成造林18.5万公顷。三北及长江流域等重点防护林体系工程完成造林99.1万公顷。完成国家储备林建设任务68万公顷。
'''

keywords = jiagu.keywords(text, 5) # keywords
keywords = jiagu.keywords(text, 5) # keyword extraction
print(keywords)

summarize = jiagu.summarize(text, 3) # summary
summarize = jiagu.summarize(text, 3) # text summarization
print(summarize)

# iagu.findword('input.txt', 'output.txt') # new word discovery from a large corpus, using information entropy

# jiagu.findword('input.txt', 'output.txt') # new word discovery from a large corpus, using information entropy


# knowledge graph relation extraction
text = '姚明(Yao Ming),1980年9月12日出生于上海市徐汇区,祖籍江苏省苏州市吴江区震泽镇,前中国职业篮球运动员,司职中锋,现任中职联公司董事长兼总经理。'
knowledge = jiagu.knowledge(text)
print(knowledge)


# sentiment analysis
text = '很讨厌还是个懒鬼'
sentiment = jiagu.sentiment(text)
print(sentiment)


# text clustering (parameters need tuning)
docs = [
"百度深度学习中文情感分析工具Senta试用及在线测试",
"情感分析是自然语言处理里面一个热门话题",
"AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
"深度学习实践:从零开始做电影评论文本情感分析",
"BERT相关论文、文章和代码资源汇总",
"将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
"自然语言处理工具包spaCy介绍",
"现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
]
cluster = jiagu.text_cluster(docs)
print(cluster)

jiagu/__init__.py (+9, -3)

@@ -15,7 +15,7 @@ any = analyze.Analyze()
init = any.init

# word segmentation
seg = any.cws
seg = any.seg
cws = any.cws
cut = any.cws

@@ -29,7 +29,7 @@ ner = any.ner
# parser

# load a user dictionary
# load_userdict
load_userdict = any.load_userdict

# custom segmentation model
load_model = any.load_model
@@ -44,4 +44,10 @@ summarize = any.summarize
findword = any.findword

# knowledge graph
knowledge = any.knowledge
knowledge = any.knowledge

# sentiment analysis
sentiment = any.sentiment

# text clustering
text_cluster = any.text_cluster
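Taken together, the exports above give the following surface, mirroring demo.py (a minimal sketch; it assumes the bundled models and dictionaries ship with the package):

```python3
import jiagu

print(jiagu.cut('厦门明天会不会下雨'))      # model-based segmentation (alias of cws)
print(jiagu.seg('厦门明天会不会下雨'))      # dictionary-based segmentation via the new nroute segmenter
jiagu.load_userdict(['知识图谱'])           # user dictionary: a list of words or a dict file path
print(jiagu.sentiment('很讨厌还是个懒鬼'))  # returns a (label, probability) pair
```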

jiagu/analyze.py (+25, -4)

@@ -14,7 +14,9 @@ from jiagu import findword
from jiagu import bilstm_crf
from jiagu.textrank import Keywords
from jiagu.textrank import Summarize

from jiagu.segment.nroute import Segment
from jiagu.sentiment.bayes import Bayes
from jiagu.cluster.text import text_cluster as cluster

def add_curr_dir(name):
    return os.path.join(os.path.dirname(__file__), name)
@@ -32,11 +34,19 @@ class Analyze(object):

        self.keywords_model = None
        self.summarize_model = None
        self.seg_nroute = Segment()
        self.sentiment_model = Bayes()

    def init(self):
        self.init_cws()
        self.init_pos()
        self.init_ner()
        self.seg_nroute.init()

    def load_userdict(self, userdict):
        self.seg_nroute.load_userdict(userdict)

    def init_cws(self):
        if self.seg_model is None:
@@ -99,6 +109,9 @@ class Analyze(object):
            sent_words.append(self.__lab2word(text, seg_labels))
        return sent_words

    def seg(self, sentence):
        return self.seg_nroute.seg(sentence, mode="default")

    def cws(self, sentence, input='text', model='default'):
        """Chinese word segmentation

@@ -171,9 +184,17 @@ class Analyze(object):
            self.summarize_model = Summarize(tol=0.0001)
        return self.summarize_model.summarize(text, topsen)

    def findword(self, input, output):
        findword.new_word_find(input, output)

    def findword(self, input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
        findword.new_word_find(input_file, output_file, min_freq, min_mtro, min_entro)

    def sentiment(self, text):
        words = self.seg(text)
        ret, prob = self.sentiment_model.classify(words)
        return ret, prob

    def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
        return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)

    def lab2spo(self, text, epp_labels):
        subject_list = []  # list for storing extracted entities
        object_list = []


jiagu/cluster/base.py (+2, -3)

@@ -1,5 +1,4 @@
# -*-coding:utf-8-*-
import jiagu
from collections import Counter
import numpy as np

@@ -10,7 +9,7 @@ def elu_distance(a, b):
    return dist


def count_features(corpus, tokenizer=jiagu.cut):
def count_features(corpus, tokenizer=list):
    """Word-frequency (count) features

    :param corpus: list of str
@@ -36,7 +35,7 @@ def count_features(corpus, tokenizer=jiagu.cut):
    return np.array(features), vocab


def tfidf_features(corpus, tokenizer=jiagu.cut):
def tfidf_features(corpus, tokenizer=list):
    """TF-IDF features of the texts

    :param corpus: list of str
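Dropping `import jiagu` here removes a circular import; the tokenizer is now injected by the caller (text_cluster passes the segmenter through, and `list`, plain character splitting, is only a fallback default). A minimal sketch of calling these helpers directly, assuming they keep returning a feature matrix plus vocabulary as shown above:

```python3
import jiagu
from jiagu.cluster.base import count_features, tfidf_features

corpus = [
    "情感分析是自然语言处理里面一个热门话题",
    "自然语言处理工具包spaCy介绍",
]
X_count, vocab = count_features(corpus, tokenizer=jiagu.cut)  # raw term counts
X_tfidf, vocab = tfidf_features(corpus, tokenizer=jiagu.cut)  # TF-IDF weights
print(X_count.shape, len(vocab))
```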


jiagu/cluster/text.py (+4, -3)

@@ -6,7 +6,8 @@ from .dbscan import DBSCAN
from .kmeans import KMeans


def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
def text_cluster(docs, features_method='tfidf', method="dbscan",
                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
    """Text clustering; K-Means and DBSCAN are currently supported

    :param features_method: str
@@ -27,9 +28,9 @@ def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_it
        the clustering result
    """
    if features_method == 'tfidf':
        features, names = tfidf_features(docs)
        features, names = tfidf_features(docs, tokenizer)
    elif features_method == 'count':
        features, names = count_features(docs)
        features, names = count_features(docs, tokenizer)
    else:
        raise ValueError('features_method error')
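Note the new defaults here (DBSCAN with eps=0.5, min_pts=2, and a caller-supplied tokenizer), while the wrapper in analyze.py still calls with method="k-means" and its own segmenter. A minimal sketch of calling the module directly with an explicit tokenizer; the parameter values are illustrative only and, as demo.py warns, usually need tuning:

```python3
import jiagu
from jiagu.cluster.text import text_cluster

docs = [
    "百度深度学习中文情感分析工具Senta试用及在线测试",
    "情感分析是自然语言处理里面一个热门话题",
    "自然语言处理工具包spaCy介绍",
]
clusters = text_cluster(docs, features_method='tfidf', method='k-means',
                        k=2, tokenizer=jiagu.cut)
print(clusters)
```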



jiagu/data/Trie.pkl (BIN)


jiagu/data/chars.dic (+0, -12638)
File diff suppressed because it is too large


jiagu/findword.py (+1, -5)

@@ -106,11 +106,7 @@ def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq,
    return entro_dict


def new_word_find(input_file, output_file):
    min_freq = 10
    min_mtro = 80
    min_entro = 3

def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
    word_freq = count_words(input_file)
    total_word = sum(word_freq.values())
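With the thresholds promoted to keyword arguments (and forwarded by Analyze.findword above), callers can loosen or tighten new-word discovery. A minimal usage sketch; the file names are placeholders and the threshold values are illustrative:

```python3
import jiagu

# input.txt: a large raw-text corpus, output.txt: discovered words (placeholder paths)
jiagu.findword('input.txt', 'output.txt', min_freq=5, min_mtro=50, min_entro=2)
```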



jiagu/segment/dict/jiagu.dict (+301277, -375396)
File diff suppressed because it is too large


jiagu/segment/dict/user.dict (+2, -0)

@@ -0,0 +1,2 @@
思知


jiagu/segment/nroute.py (+194, -0)

@@ -0,0 +1,194 @@
import re
import os
import sys
from math import log

re_eng = re.compile('[a-zA-Z0-9]', re.U)
re_han = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
re_skip = re.compile("(\r\n|\s)", re.U)

class Segment:
    def __init__(self):
        self.vocab = {}
        self.max_word_len = 0
        self.max_freq = 0
        self.total_freq = 0
        self.initialized = False

    def init(self, vocab_path='dict/jiagu.dict'):
        self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
        self.initialized = True

    def load_vocab(self, vocab_path):
        fin = open(vocab_path, 'r', encoding='utf8')
        for index, line in enumerate(fin):
            line = line.strip()
            if line == '':
                continue
            word_freq_tag = line.split('\t')
            if len(word_freq_tag) == 1:
                word = word_freq_tag[0]
                self.add_vocab(word)
            elif len(word_freq_tag) == 2:
                word = word_freq_tag[0]
                freq = int(word_freq_tag[1])
                self.add_vocab(word, freq)
        fin.close()

    def add_vocab(self, word=None, freq=None, tag=None):
        if freq == None:
            freq = self.max_freq
        if word not in self.vocab:
            self.vocab[word] = 0
        self.vocab[word] += freq
        self.total_freq += freq
        if freq > self.max_freq:
            self.max_freq = freq
        if len(word) > self.max_word_len:
            self.max_word_len = len(word)

    def load_userdict(self, userdict):
        if self.initialized == False:
            self.init()
        if isinstance(userdict, str):
            self.load_vocab(userdict)
            return  # a dictionary file path was given; nothing left to iterate
        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    word = item[0]
                    self.add_vocab(word)
                elif len(item) == 2:
                    word = item[0]
                    freq = item[1]
                    self.add_vocab(word, freq)
            elif isinstance(item, str):
                self.add_vocab(word=item)

    def calc_route(self, sentence, DAG, route):
        vocab = self.vocab
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total_freq)
        for idx in range(N - 1, -1, -1):
            route[idx] = max((log(vocab.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x) for x in DAG[idx])

    def create_DAG(self, sentence):
        vocab = self.vocab
        max_word_len = self.max_word_len
        DAG = {}
        N = len(sentence)
        for idx in range(N):
            cand_idx = [idx]
            for i in range(idx+1, idx + min(max_word_len, N - idx), 1):
                cand = sentence[idx: i+1]
                if cand in vocab:
                    cand_idx.append(i)
            DAG[idx] = cand_idx
        return DAG

    def cut_search(self, sentence):
        DAG = self.create_DAG(sentence)
        old_j = -1
        for k, L in DAG.items():
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j

    def cut_vocab(self, sentence):
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)

        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if buf:
                yield buf
                buf = ''
            yield l_word
            x = y
        if buf:
            yield buf
            buf = ''

    def cut_words(self, sentence):
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''

    def seg_default(self, sentence, mode):
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        cut_all = False
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                for word in cut_block(block):
                    yield word
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x

    def seg(self, sentence, mode="default"):
        if self.initialized == False:
            self.init()
        return list(self.seg_default(sentence, mode=mode))


if __name__ == '__main__':
    s = Segment()
    # s.load_userdict('dict/user.dict')
    s.load_userdict(['知识图谱'])

    text = '知识图谱机器人hello\nworld¥¥'
    words = s.seg(text)
    print(words)
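For readers new to this style of segmenter: nroute.py builds a DAG whose edges are in-vocabulary word candidates, then picks the path with the highest total log frequency by dynamic programming from right to left (calc_route), and finally walks that route to emit words. The toy sketch below is illustrative only and not part of the commit; it replays the same idea on an invented vocabulary:

```python3
from math import log

toy_vocab = {'结婚': 20, '的': 50, '和': 30, '和尚': 10, '尚未': 15, '未': 5}
total = sum(toy_vocab.values())
sentence = '结婚的和尚未结婚的'
N = len(sentence)
max_len = max(len(w) for w in toy_vocab)

# DAG[i] lists every end index j such that sentence[i:j+1] is a known word
DAG = {}
for i in range(N):
    ends = [i]
    for j in range(i + 1, min(i + max_len, N)):
        if sentence[i:j + 1] in toy_vocab:
            ends.append(j)
    DAG[i] = ends

# route[i] = (best log probability of sentence[i:], end index of the first word)
route = {N: (0.0, 0)}
for i in range(N - 1, -1, -1):
    route[i] = max((log(toy_vocab.get(sentence[i:j + 1], 1)) - log(total) + route[j + 1][0], j)
                   for j in DAG[i])

# walk the best route from the left and emit words
i, words = 0, []
while i < N:
    j = route[i][1] + 1
    words.append(sentence[i:j])
    i = j
print(words)  # ['结婚', '的', '和', '尚未', '结婚', '的']
```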



jiagu/sentiment/bayes.py (+4, -0)

@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
import os
import sys
import gzip
import marshal
@@ -83,6 +84,9 @@ class Bayes(object):
        self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))

    def classify(self, x):
        if self.d == {}:
            self.load(os.path.join(os.path.dirname(__file__), 'model/sentiment.model'))
        tmp = {}
        for k in self.d:
            tmp[k] = log(self.d[k].getsum()) - log(self.total)
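Because the model is now loaded lazily on the first classify() call, the top-level wrapper works without an explicit jiagu.init(). A minimal usage sketch; the exact label strings come from the bundled model, so 'negative' below is an assumption:

```python3
import jiagu

label, prob = jiagu.sentiment('很讨厌还是个懒鬼')  # classify a short text
print(label, prob)  # e.g. 'negative' with its probability (label names depend on the model)
```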


jiagu/sentiment/model/1.model → jiagu/sentiment/model/sentiment.model


setup.py (+4, -2)

@@ -3,7 +3,7 @@
from setuptools import setup

setup(name='jiagu',
      version='0.1.7',
      version='0.1.8',
      description='Jiagu Natural Language Processing',
      author='Yener(Zheng Wenyu)',
      author_email='help@ownthink.com',
@@ -12,5 +12,7 @@ setup(name='jiagu',
      install_requires=['tensorflow>=1.6.0', 'numpy>=1.12.1'],
      packages=['jiagu'],
      package_dir={'jiagu': 'jiagu'},
      package_data={'jiagu': ['*.*', 'model/*', 'data/*']}
      package_data={'jiagu': ['*.*', 'cluster/*', 'data/*', 'model/*',
                              'normal/*', 'segment/*', 'segment/dict/*',
                              'sentiment/*', 'sentiment/model/*', 'topic/*']}
      )
