You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

utils.py 6.0 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
5 years ago
5 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215
# -*- encoding:utf-8 -*-
import os
import jiagu  # third-party Chinese NLP toolkit; provides word segmentation via jiagu.seg()
import math
  5. def default_stopwords_file():
  6. d = os.path.dirname(os.path.realpath(__file__))
  7. return os.path.join(d, 'data/stopwords.txt')
# Characters treated as end-of-sentence delimiters by cut_sentences().
sentence_delimiters = ['。', '?', '!', '…']
# Part-of-speech tags (jiagu/ICTCLAS-style codes) — presumably a whitelist of
# tags whose words are kept as keyword candidates; verify against callers.
allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                     'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
  11. def as_text(v):
  12. """生成unicode字符串"""
  13. if v is None:
  14. return None
  15. elif isinstance(v, bytes):
  16. return v.decode('utf-8', errors='ignore')
  17. elif isinstance(v, str):
  18. return v
  19. else:
  20. raise ValueError('Unknown type %r' % type(v))
  21. def is_text(v):
  22. return isinstance(v, str)
  23. def cut_sentences(sentence):
  24. tmp = []
  25. for ch in sentence: # 遍历字符串中的每一个字
  26. tmp.append(ch)
  27. if ch in sentence_delimiters:
  28. yield ''.join(tmp)
  29. tmp = []
  30. yield ''.join(tmp)
  31. def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
  32. sentences = []
  33. sents = []
  34. for sent in cutted_sentences:
  35. sentences.append(sent)
  36. if use_stopwords:
  37. sents.append([word for word in jiagu.seg(sent) if word and word not in stopwords]) # 把句子分成词语
  38. else:
  39. sents.append([word for word in jiagu.seg(sent) if word])
  40. return sentences, sents
  41. def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
  42. sents = []
  43. sentences = []
  44. for sent in cutted_sentences:
  45. sentences.append(sent)
  46. word_list = jiagu.seg(sent)
  47. word_list = [word for word in word_list if len(word) > 0]
  48. if use_stopwords:
  49. word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
  50. sents.append(word_list)
  51. return sentences, sents
  52. def weight_map_rank(weight_graph, max_iter, tol):
  53. # 初始分数设置为0.5
  54. # 初始化每个句子的分子和老分数
  55. scores = [0.5 for _ in range(len(weight_graph))]
  56. old_scores = [0.0 for _ in range(len(weight_graph))]
  57. denominator = get_degree(weight_graph)
  58. # 开始迭代
  59. count = 0
  60. while different(scores, old_scores, tol):
  61. for i in range(len(weight_graph)):
  62. old_scores[i] = scores[i]
  63. # 计算每个句子的分数
  64. for i in range(len(weight_graph)):
  65. scores[i] = get_score(weight_graph, denominator, i)
  66. count += 1
  67. if count > max_iter:
  68. break
  69. return scores
  70. def get_degree(weight_graph):
  71. length = len(weight_graph)
  72. denominator = [0.0 for _ in range(len(weight_graph))]
  73. for j in range(length):
  74. for k in range(length):
  75. denominator[j] += weight_graph[j][k]
  76. if denominator[j] == 0:
  77. denominator[j] = 1.0
  78. return denominator
  79. def get_score(weight_graph, denominator, i):
  80. """
  81. :param weight_graph:
  82. :param denominator:
  83. :param i: int
  84. 第i个句子
  85. :return: float
  86. """
  87. length = len(weight_graph)
  88. d = 0.85
  89. added_score = 0.0
  90. for j in range(length):
  91. # [j,i]是指句子j指向句子i
  92. fraction = weight_graph[j][i] * 1.0
  93. # 除以j的出度
  94. added_score += fraction / denominator[j]
  95. weighted_score = (1 - d) + d * added_score
  96. return weighted_score
  97. def different(scores, old_scores, tol=0.0001):
  98. flag = False
  99. for i in range(len(scores)):
  100. if math.fabs(scores[i] - old_scores[i]) >= tol: # 原始是0.0001
  101. flag = True
  102. break
  103. return flag
  104. def combine(word_list, window=2):
  105. if window < 2:
  106. window = 2
  107. for x in range(1, window):
  108. if x >= len(word_list):
  109. break
  110. word_list2 = word_list[x:]
  111. res = zip(word_list, word_list2)
  112. for r in res:
  113. yield r
  114. def sentences_similarity(s1, s2):
  115. """计算两个句子的相似度
  116. :param s1: list
  117. :param s2: list
  118. :return: float
  119. """
  120. counter = 0
  121. for sent in s1:
  122. if sent in s2:
  123. counter += 1
  124. if counter == 0:
  125. return 0
  126. return counter / (math.log(len(s1) + len(s2)))
  127. # --------------------------------------------------------------------
  128. def is_chinese(uchar):
  129. """判断一个字符是否是汉字"""
  130. assert len(uchar) == 1, "uchar 只能是单个字符"
  131. if u'\u4e00' <= uchar <= u'\u9fa5':
  132. return True
  133. else:
  134. return False
  135. def is_number(uchar):
  136. """判断一个字符是否是数字"""
  137. assert len(uchar) == 1, "uchar 只能是单个字符"
  138. if u'\u0030' <= uchar <= u'\u0039':
  139. return True
  140. else:
  141. return False
  142. def is_alphabet(uchar):
  143. """判断一个字符是否是英文字母"""
  144. assert len(uchar) == 1, "uchar 只能是单个字符"
  145. if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
  146. return True
  147. else:
  148. return False
  149. def B2Q(uchar):
  150. """单字符半角转全角"""
  151. assert len(uchar) == 1, "uchar 只能是单个字符"
  152. inside_code = ord(uchar)
  153. if inside_code < 0x0020 or inside_code > 0x7e:
  154. # 不是半角字符就返回原来的字符
  155. return uchar
  156. if inside_code == 0x0020:
  157. # 除了空格其他的全角半角的公式为:半角=全角-0xfee0
  158. inside_code = 0x3000
  159. else:
  160. inside_code += 0xfee0
  161. return chr(inside_code)
  162. def Q2B(uchar):
  163. """单字符全角转半角"""
  164. assert len(uchar) == 1, "uchar 只能是单个字符"
  165. inside_code = ord(uchar)
  166. if inside_code == 0x3000:
  167. inside_code = 0x0020
  168. else:
  169. inside_code -= 0xfee0
  170. if inside_code < 0x0020 or inside_code > 0x7e:
  171. # 转完之后不是半角字符返回原来的字符
  172. return uchar
  173. return chr(inside_code)

Jiagu使用大规模语料训练而成,提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具的优缺点制作,将Jiagu回馈给大家。