
utils.py
# -*- encoding:utf-8 -*-
"""
* Copyright (C) 2017 OwnThink.
*
* Name : utils.py - parsing helpers
* Author : zengbin93 <zeng_bin8888@163.com>
* Version : 0.01
* Description : Common utility functions
"""
import os
import math

import jiagu
import numpy as np


def default_stopwords_file():
    d = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(d, 'data/stopwords.txt')


sentence_delimiters = ['。', '?', '!', '…']
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                     'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']


def as_text(v):
    """Return v as a unicode string."""
    if v is None:
        return None
    elif isinstance(v, bytes):
        return v.decode('utf-8', errors='ignore')
    elif isinstance(v, str):
        return v
    else:
        raise ValueError('Unknown type %r' % type(v))


def is_text(v):
    return isinstance(v, str)
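

# Illustrative sketch, not part of the original file: as_text normalizes
# bytes to str and passes str through unchanged, so downstream code can
# assume unicode text.
#   >>> as_text(b'\xe4\xbd\xa0\xe5\xa5\xbd')
#   '你好'
#   >>> is_text(as_text('你好'))
#   True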


def cut_sentences(sentence):
    tmp = []
    for ch in sentence:  # walk over every character in the string
        tmp.append(ch)
        if ch in sentence_delimiters:
            yield ''.join(tmp)
            tmp = []
    if tmp:  # only yield a trailing fragment if it is non-empty
        yield ''.join(tmp)
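

# Illustrative sketch, not part of the original file: cut_sentences splits
# on the delimiters defined above and keeps any trailing fragment.
#   >>> list(cut_sentences('天气不错。出去走走吧!好的'))
#   ['天气不错。', '出去走走吧!', '好的']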


def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
    sentences = []
    sents = []
    for sent in cutted_sentences:
        sentences.append(sent)
        if use_stopwords:
            # split the sentence into words and drop stopwords
            sents.append([word for word in jiagu.cut(sent)
                          if word and word not in stopwords])
        else:
            sents.append([word for word in jiagu.cut(sent) if word])
    return sentences, sents


def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
    sents = []
    sentences = []
    for sent in cutted_sentences:
        sentences.append(sent)
        word_list = jiagu.seg(sent)
        word_list = [word for word in word_list if len(word) > 0]
        if use_stopwords:
            word_list = [word.strip() for word in word_list
                         if word.strip() not in stopwords]
        sents.append(word_list)
    return sentences, sents
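

# Illustrative sketch, not part of the original file (assumes the jiagu
# package is importable; tokenization depends on its model). Both helpers
# are typically fed from cut_sentences and return the raw sentences plus
# one token list per sentence:
#   sentences, sents = cut_filter_words(
#       cut_sentences('天气不错。出去走走吧!'), stopwords=set())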


def weight_map_rank(weight_graph, max_iter, tol):
    # initialize every sentence's score to 0.5 and its previous score to 0.0
    scores = [0.5 for _ in range(len(weight_graph))]
    old_scores = [0.0 for _ in range(len(weight_graph))]
    denominator = get_degree(weight_graph)
    # iterate until the scores stop changing, or max_iter is exceeded
    count = 0
    while different(scores, old_scores, tol):
        for i in range(len(weight_graph)):
            old_scores[i] = scores[i]
        # recompute every sentence's score from the previous iteration's
        # scores, as in the standard TextRank update
        for i in range(len(weight_graph)):
            scores[i] = get_score(weight_graph, old_scores, denominator, i)
        count += 1
        if count > max_iter:
            break
    return scores


def get_degree(weight_graph):
    length = len(weight_graph)
    denominator = [0.0 for _ in range(length)]
    for j in range(length):
        for k in range(length):
            denominator[j] += weight_graph[j][k]
        if denominator[j] == 0:
            denominator[j] = 1.0
    return denominator


def get_score(weight_graph, scores, denominator, i):
    """One TextRank update for sentence i.
    :param weight_graph: adjacency matrix of edge weights
    :param scores: sentence scores from the previous iteration
    :param denominator: weighted out-degree of each sentence
    :param i: int, the i-th sentence
    :return: float
    """
    length = len(weight_graph)
    d = 0.85
    added_score = 0.0
    for j in range(length):
        # weight_graph[j][i] is the weight of the edge from sentence j to i,
        # scaled by j's current score and divided by j's out-degree
        fraction = weight_graph[j][i] * scores[j]
        added_score += fraction / denominator[j]
    weighted_score = (1 - d) + d * added_score
    return weighted_score


def different(scores, old_scores, tol=0.0001):
    flag = False
    for i in range(len(scores)):
        if math.fabs(scores[i] - old_scores[i]) >= tol:
            flag = True
            break
    return flag
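

# Illustrative sketch, not part of the original file: with the update above,
# weight_map_rank iterates the damped TextRank recurrence
#     score(i) = 0.15 + 0.85 * sum_j(w[j][i] * score(j) / out_degree(j))
# to a fixed point. For a 3-sentence chain where the middle sentence is
# linked to both neighbours, the middle one ends up with the highest score:
#   >>> g = [[0, 1, 0],
#   ...      [1, 0, 1],
#   ...      [0, 1, 0]]
#   >>> weight_map_rank(g, max_iter=100, tol=1e-6)
#   [0.770..., 1.459..., 0.770...]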


def cosine_similarity(vec1, vec2):
    """Compute the cosine similarity of two vectors.
    :param vec1: list or np.array
    :param vec2: list or np.array
    :return: float
    """
    tx = np.array(vec1)
    ty = np.array(vec2)
    cos1 = np.sum(tx * ty)
    cos21 = np.sqrt(np.sum(tx ** 2))
    cos22 = np.sqrt(np.sum(ty ** 2))
    cosine_value = cos1 / float(cos21 * cos22)
    return cosine_value
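

# Illustrative sketch, not part of the original file: for vec1 = [1, 0, 1]
# and vec2 = [1, 1, 0] the dot product is 1 and both norms are sqrt(2),
# so cosine_similarity(vec1, vec2) returns approximately 0.5.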


def combine(word_list, window=2):
    """Yield every pair of words that co-occur within a sliding window."""
    if window < 2:
        window = 2
    for x in range(1, window):
        if x >= len(word_list):
            break
        word_list2 = word_list[x:]
        res = zip(word_list, word_list2)
        for r in res:
            yield r
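

# Illustrative sketch, not part of the original file: with window=3 the
# generator yields each word paired with its next two successors.
#   >>> list(combine(['a', 'b', 'c', 'd'], window=3))
#   [('a', 'b'), ('b', 'c'), ('c', 'd'), ('a', 'c'), ('b', 'd')]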


def sentences_similarity(s1, s2):
    """Compute the similarity of two sentences.
    :param s1: list of words
    :param s2: list of words
    :return: float
    """
    counter = 0
    for sent in s1:
        if sent in s2:
            counter += 1
    if counter == 0:
        return 0
    return counter / (math.log(len(s1) + len(s2)))
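

# Illustrative sketch, not part of the original file: two three-word
# sentences sharing two words score 2 / log(6), roughly 1.12.
#   >>> sentences_similarity(['我', '喜欢', '苹果'], ['我', '讨厌', '苹果'])
#   1.1162...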

Jiagu is trained on large-scale corpora. It provides common Chinese NLP functionality, including word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new word discovery, and text clustering. It was built with the strengths and weaknesses of the major existing tools in mind, and Jiagu is offered back to the community.
