
analyze.py (5.5 kB)

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os

from jiagu import mmseg
from jiagu import findword
from jiagu import perceptron
from jiagu.textrank import Keywords
from jiagu.textrank import Summarize
from jiagu.segment.nroute import Segment
from jiagu.sentiment.bayes import Bayes
from jiagu.cluster.text import text_cluster as cluster


def add_curr_dir(name):
    return os.path.join(os.path.dirname(__file__), name)


class Analyze(object):
    def __init__(self):
        self.seg_model = None
        self.pos_model = None
        self.ner_model = None
        self.kg_model = None
        self.seg_mmseg = None
        self.keywords_model = None
        self.summarize_model = None
        self.seg_nroute = Segment()
        self.sentiment_model = Bayes()

    def init(self):
        self.init_cws()
        self.init_pos()
        self.init_ner()
        self.seg_nroute.init()

    def load_userdict(self, userdict):
        self.seg_nroute.load_userdict(userdict)

    def init_cws(self):
        if self.seg_model is None:
            self.seg_model = perceptron.Perceptron(add_curr_dir('model/cws.model'))

    def load_model(self, model_path):
        self.seg_model = perceptron.Perceptron(model_path)

    def init_pos(self):
        if self.pos_model is None:
            self.pos_model = perceptron.Perceptron(add_curr_dir('model/pos.model'))

    def init_ner(self):
        if self.ner_model is None:
            self.ner_model = perceptron.Perceptron(add_curr_dir('model/ner.model'))

    def init_mmseg(self):
        if self.seg_mmseg is None:
            self.seg_mmseg = mmseg.MMSeg()

    def init_kg(self):
        if self.kg_model is None:
            self.kg_model = perceptron.Perceptron(add_curr_dir('model/kg.model'))

    @staticmethod
    def __lab2word(sentence, labels):
        # Join characters into words according to BMES-style labels:
        # B and M extend the current word, E closes it, and any other
        # label (e.g. S) emits a single-character word.
        sen_len = len(sentence)
        tmp_word = ""
        words = []
        for i in range(sen_len):
            label = labels[i]
            w = sentence[i]
            if label == "B" or label == "M":
                tmp_word += w
            elif label == "E":
                tmp_word += w
                words.append(tmp_word)
                tmp_word = ""
            else:
                if tmp_word != '':
                    words.append(tmp_word)
                    tmp_word = ""
                words.append(w)
        if tmp_word:
            words.append(tmp_word)
        return words

    def cws_text(self, sentence):
        if sentence == '':
            return ['']
        sentence = list(sentence)
        labels = self.seg_model.predict(sentence)
        return self.__lab2word(sentence, labels)

    def seg(self, sentence):
        return self.seg_nroute.seg(sentence, mode="default")

    def cws(self, sentence, model='default'):
        """Chinese word segmentation.

        :param sentence: str
            The text to segment.
        :param model: str
            Segmentation mode: 'default' uses the perceptron model,
            'mmseg' uses the MMSeg algorithm.
        :return: list of words
        """
        if model == 'default':
            self.init_cws()
            words = self.cws_text(sentence)
            return words
        elif model == 'mmseg':
            self.init_mmseg()
            words = self.seg_mmseg.cws(sentence)
            return words
        return []

    def pos(self, words):  # takes a list of words
        self.init_pos()
        labels = self.pos_model.predict(words)
        return labels

    def ner(self, words):  # takes a list of words
        self.init_ner()
        labels = self.ner_model.predict(words)
        return labels

    def knowledge(self, text):  # takes raw text
        self.init_kg()
        words = self.seg(text)
        labels = self.kg_model.predict(words)
        return self.lab2spo(words, labels)

    def keywords(self, text, topkey=5):
        if self.keywords_model is None:
            self.keywords_model = Keywords(tol=0.0001, window=2)
        return self.keywords_model.keywords(text, topkey)

    def summarize(self, text, topsen=5):
        if self.summarize_model is None:
            self.summarize_model = Summarize(tol=0.0001)
        return self.summarize_model.summarize(text, topsen)

    def findword(self, input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
        findword.new_word_find(input_file, output_file, min_freq, min_mtro, min_entro)

    def sentiment(self, text):
        words = self.seg(text)
        ret, prob = self.sentiment_model.classify(words)
        return ret, prob

    def text_cluster(self, docs, features_method='tfidf', method="k-means", k=3, max_iter=100, eps=0.5, min_pts=2):
        return cluster(docs, features_method, method, k, max_iter, eps, min_pts, self.seg)

    def lab2spo(self, words, epp_labels):
        # Decode (subject, predicate, object) triples from per-word labels.
        # Words labeled '实体' (entity) become candidate subjects; other
        # labels carry a predicate name plus a '+'/'-' direction suffix.
        subject_list = []  # candidate subject entities
        object_list = []
        # enumerate keeps the position index in sync even when a
        # continuation label arrives before any entity has been opened.
        for index, (word, ep) in enumerate(zip(words, epp_labels)):
            if ep[0] == 'B' and ep[2:] == '实体':
                subject_list.append([word, ep[2:], index])
            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] == '实体':
                if len(subject_list) == 0:
                    continue
                subject_list[-1][0] += word

            if ep[0] == 'B' and ep[2:] != '实体':
                object_list.append([word, ep[2:], index])
            elif (ep[0] == 'I' or ep[0] == 'E') and ep[2:] != '实体':
                if len(object_list) == 0:
                    return []
                object_list[-1][0] += word

        spo_list = []
        if len(subject_list) == 0 or len(object_list) == 0:
            pass
        elif len(subject_list) == 1:
            entity = subject_list[0]
            for obj in object_list:
                predicate = obj[1][:-1]
                spo_list.append([entity[0], predicate, obj[0]])
        else:
            # With several subjects, pick the nearest one in the direction
            # given by the label suffix: '+' looks after the object,
            # anything else looks before it.
            for obj in object_list:
                entity = []
                predicate = obj[1][:-1]
                direction = obj[1][-1]
                for sub in subject_list:
                    if direction == '+':
                        if sub[2] > obj[2]:
                            entity = sub
                            break
                    else:
                        if sub[2] < obj[2]:
                            entity = sub
                if entity == []:
                    continue
                spo_list.append([entity[0], predicate, obj[0]])
        return spo_list
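For context, here is a minimal usage sketch of the class above. The import path is assumed from the repository layout, the sample sentence and outputs are illustrative only, and the bundled model files (model/cws.model, model/pos.model, model/ner.model) must be present for it to run.

from jiagu.analyze import Analyze  # import path assumed from the repo layout

nlp = Analyze()
nlp.init()  # eagerly load the segmentation, POS, and NER models

text = '厦门明天会不会下雨'
words = nlp.seg(text)   # word list, e.g. ['厦门', '明天', '会不会', '下雨'] (illustrative)
print(words)
print(nlp.pos(words))   # one POS tag per word
print(nlp.ner(words))   # one BIO-style NER label per word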

Jiagu is trained on large-scale corpora. It provides common natural language processing functions including Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new-word discovery, and text clustering. It was built with the strengths and weaknesses of the major existing toolkits in mind, and Jiagu is offered back to the community.
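A short sketch of how those features map onto the Analyze class in analyze.py. The import path, sample texts, and the shape of the printed results are assumptions for illustration, not verified output.

from jiagu.analyze import Analyze  # import path assumed from the repo layout

nlp = Analyze()
nlp.init()  # sentiment and knowledge extraction rely on the internal segmenter

doc = '自然语言处理是计算机科学与人工智能的一个重要方向。它研究人与计算机之间用自然语言进行有效通信的理论和方法。'

print(nlp.keywords(doc, topkey=3))     # top-3 keywords via TextRank
print(nlp.summarize(doc, topsen=1))    # single-sentence TextRank summary
print(nlp.sentiment('这款产品非常好用'))    # (label, probability) from the Bayes model
print(nlp.knowledge('鲁迅的妻子是许广平'))  # [subject, predicate, object] triples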