
textrank.py
# -*- encoding:utf-8 -*-
import sys
from jiagu import utils
from heapq import nlargest
from collections import defaultdict
from itertools import count, product


class Keywords(object):
    def __init__(self,
                 use_stopword=True,
                 stop_words_file=utils.default_stopwords_file(),
                 max_iter=100,
                 tol=0.0001,
                 window=2):
        self.__use_stopword = use_stopword
        self.__max_iter = max_iter  # maximum number of ranking iterations
        self.__tol = tol  # convergence tolerance for the ranking loop
        self.__window = window  # co-occurrence window size
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
    @staticmethod
    def build_vocab(sents):
        # Assign each distinct word a dense integer index and keep the
        # reverse mapping for decoding ranked indices back to words.
        word_index = {}
        index_word = {}
        words_number = 0
        for word_list in sents:
            for word in word_list:
                if word not in word_index:
                    word_index[word] = words_number
                    index_word[words_number] = word
                    words_number += 1
        return word_index, index_word, words_number
    @staticmethod
    def create_graph(sents, words_number, word_index, window=2):
        # Symmetric word co-occurrence matrix: every pair of words that
        # falls within the same window adds weight in both directions.
        graph = [[0.0 for _ in range(words_number)] for _ in range(words_number)]
        for word_list in sents:
            for w1, w2 in utils.combine(word_list, window):
                if w1 in word_index and w2 in word_index:
                    index1 = word_index[w1]
                    index2 = word_index[w2]
                    graph[index1][index2] += 1.0
                    graph[index2][index1] += 1.0
        return graph
    def keywords(self, text, n):
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.psegcut_filter_words(tokens,
                                                      self.__stop_words,
                                                      self.__use_stopword)
        word_index, index_word, words_number = self.build_vocab(sents)
        graph = self.create_graph(sents, words_number,
                                  word_index, window=self.__window)
        scores = utils.weight_map_rank(graph, max_iter=self.__max_iter,
                                       tol=self.__tol)
        # Take the indices of the n highest-scoring words.
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(min(len(sent_selected), n)):
            sent_index.append(sent_selected[i][1])
        return [index_word[i] for i in sent_index]
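
# Usage sketch (assumes the jiagu.utils helpers imported above are on hand;
# the call shape follows the method signature, not a documented example):
#   kw = Keywords(use_stopword=True, window=2)
#   print(kw.keywords(some_chinese_text, 5))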


class Summarize(object):
    def __init__(self, use_stopword=True,
                 stop_words_file=None,
                 dict_path=None,
                 max_iter=100,
                 tol=0.0001):
        if dict_path:
            raise RuntimeError("dict_path is not supported")
        self.__use_stopword = use_stopword
        self.__dict_path = dict_path
        self.__max_iter = max_iter
        self.__tol = tol
        self.__stop_words = set()
        self.__stop_words_file = utils.default_stopwords_file()
        if stop_words_file:
            self.__stop_words_file = stop_words_file
        if use_stopword:
            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
                for word in f:
                    self.__stop_words.add(word.strip())
    def filter_dictword(self, sents):
        # Drop words that are missing from self.__word2vec (a word-vector
        # lookup that must be loaded elsewhere; nothing in this file sets it).
        _sents = []
        dele = set()
        for sentence in sents:
            for word in sentence:
                if word not in self.__word2vec:
                    dele.add(word)
            if sentence:
                _sents.append([word for word in sentence if word not in dele])
        return _sents
    def summarize(self, text, n):
        text = text.replace('\n', '')
        text = text.replace('\r', '')
        text = utils.as_text(text)
        tokens = utils.cut_sentences(text)
        sentences, sents = utils.cut_filter_words(tokens, self.__stop_words, self.__use_stopword)
        graph = self.create_graph(sents)
        scores = utils.weight_map_rank(graph, self.__max_iter, self.__tol)
        # Return the n highest-ranked original sentences.
        sent_selected = nlargest(n, zip(scores, count()))
        sent_index = []
        for i in range(min(n, len(sent_selected))):
            sent_index.append(sent_selected[i][1])
        return [sentences[i] for i in sent_index]
    @staticmethod
    def create_graph(word_sent):
        # Fully connected sentence graph; edge weights are pairwise
        # sentence similarities.
        num = len(word_sent)
        board = [[0.0 for _ in range(num)] for _ in range(num)]
        for i, j in product(range(num), repeat=2):
            if i != j:
                board[i][j] = utils.sentences_similarity(word_sent[i], word_sent[j])
        return board
    def compute_similarity_by_avg(self, sents_1, sents_2):
        # Cosine similarity of the two sentences' average word vectors
        # (also depends on self.__word2vec being loaded elsewhere).
        if len(sents_1) == 0 or len(sents_2) == 0:
            return 0.0
        vec1 = self.__word2vec[sents_1[0]]
        for word1 in sents_1[1:]:
            vec1 = vec1 + self.__word2vec[word1]
        vec2 = self.__word2vec[sents_2[0]]
        for word2 in sents_2[1:]:
            vec2 = vec2 + self.__word2vec[word2]
        similarity = utils.cosine_similarity(vec1 / len(sents_1),
                                             vec2 / len(sents_2))
        return similarity
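
# Usage sketch (same caveat as for Keywords above):
#   summarizer = Summarize(use_stopword=True)
#   print(summarizer.summarize(some_chinese_document, 3))  # top-3 sentences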


class TextRank:
    d = 0.85  # damping factor, as in PageRank

    def __init__(self):
        self.graph = defaultdict(list)

    def add_edge(self, start, end, weight=1):
        # Undirected edge: record it on both endpoints.
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        ws = defaultdict(float)
        out_sum = defaultdict(float)
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef  # uniform initial score
            out_sum[n] = sum((e[2] for e in out), 0.0)
        # Stable node order keeps results reproducible across runs.
        sorted_keys = sorted(self.graph.keys())
        for x in range(10):  # fixed 10 iterations, no convergence test
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # e = (n, neighbor, weight): weighted TextRank update.
                    s += e[2] / out_sum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s
        # Rescale scores to roughly [0, 1]; min starts at the largest
        # float and max at the smallest positive float.
        min_rank, max_rank = sys.float_info.max, sys.float_info.min
        for w in ws.values():
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w
        for n, w in ws.items():
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
        return ws
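
The TextRank class above uses only the standard library, so it can be tried in isolation. A minimal sketch, assuming the module is importable as written (its own import of jiagu.utils must resolve); the node names and weights are invented for illustration:

    from textrank import TextRank  # adjust the import to where this file lives

    tr = TextRank()
    tr.add_edge('深度', '学习', weight=2)
    tr.add_edge('学习', '模型', weight=1)
    tr.add_edge('深度', '模型', weight=1)
    scores = tr.rank()
    for word, score in sorted(scores.items(), key=lambda kv: kv[1], reverse=True):
        print(word, round(score, 3))

Keywords and Summarize additionally rely on the helpers in jiagu.utils (sentence cutting, similarity, and weight_map_rank), so they are normally exercised through the library's entry points rather than standalone.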

Jiagu is trained on large-scale corpora. It provides common Chinese NLP functions: word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new-word discovery, and text clustering. It was built with the strengths and weaknesses of the major toolkits in mind, and is offered back to the community.
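
Going by the project's published interface, typical calls look roughly like the following (the entry-point names follow the jiagu README; the input text is a placeholder):

    import jiagu

    text = '...'  # any Chinese document

    words = jiagu.seg(text)          # word segmentation
    print(jiagu.pos(words))          # part-of-speech tags for the tokens
    print(jiagu.ner(words))          # named entity labels
    print(jiagu.keywords(text, 5))   # top-5 keywords (Keywords class above)
    print(jiagu.summarize(text, 3))  # 3-sentence summary (Summarize class above)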