You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

utils.py 5.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216
  1. # -*- encoding:utf-8 -*-
  2. import os
  3. import jiagu
  4. import math
  5. def default_stopwords_file():
  6. d = os.path.dirname(os.path.realpath(__file__))
  7. return os.path.join(d, 'data/stopwords.txt')
  8. sentence_delimiters = ['。', '?', '!', '…']
  9. allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
  10. 'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
  11. def as_text(v):
  12. """生成unicode字符串"""
  13. if v is None:
  14. return None
  15. elif isinstance(v, bytes):
  16. return v.decode('utf-8', errors='ignore')
  17. elif isinstance(v, str):
  18. return v
  19. else:
  20. raise ValueError('Unknown type %r' % type(v))
  21. def is_text(v):
  22. return isinstance(v, str)
  23. def cut_sentences(sentence):
  24. tmp = []
  25. for ch in sentence: # 遍历字符串中的每一个字
  26. tmp.append(ch)
  27. if ch in sentence_delimiters:
  28. yield ''.join(tmp)
  29. tmp = []
  30. if len(tmp) > 0: # 如以定界符结尾的文本的文本信息会在循环中返回,无需再次传递
  31. yield ''.join(tmp)
  32. def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
  33. sentences = []
  34. sents = []
  35. for sent in cutted_sentences:
  36. sentences.append(sent)
  37. if use_stopwords:
  38. sents.append([word for word in jiagu.seg(sent) if word and word not in stopwords]) # 把句子分成词语
  39. else:
  40. sents.append([word for word in jiagu.seg(sent) if word])
  41. return sentences, sents
  42. def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
  43. sents = []
  44. sentences = []
  45. for sent in cutted_sentences:
  46. sentences.append(sent)
  47. word_list = jiagu.seg(sent)
  48. word_list = [word for word in word_list if len(word) > 0]
  49. if use_stopwords:
  50. word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
  51. sents.append(word_list)
  52. return sentences, sents
  53. def weight_map_rank(weight_graph, max_iter, tol):
  54. # 初始分数设置为0.5
  55. # 初始化每个句子的分子和老分数
  56. scores = [0.5 for _ in range(len(weight_graph))]
  57. old_scores = [0.0 for _ in range(len(weight_graph))]
  58. denominator = get_degree(weight_graph)
  59. # 开始迭代
  60. count = 0
  61. while different(scores, old_scores, tol):
  62. for i in range(len(weight_graph)):
  63. old_scores[i] = scores[i]
  64. # 计算每个句子的分数
  65. for i in range(len(weight_graph)):
  66. scores[i] = get_score(weight_graph, denominator, i)
  67. count += 1
  68. if count > max_iter:
  69. break
  70. return scores
  71. def get_degree(weight_graph):
  72. length = len(weight_graph)
  73. denominator = [0.0 for _ in range(len(weight_graph))]
  74. for j in range(length):
  75. for k in range(length):
  76. denominator[j] += weight_graph[j][k]
  77. if denominator[j] == 0:
  78. denominator[j] = 1.0
  79. return denominator
  80. def get_score(weight_graph, denominator, i):
  81. """
  82. :param weight_graph:
  83. :param denominator:
  84. :param i: int
  85. 第i个句子
  86. :return: float
  87. """
  88. length = len(weight_graph)
  89. d = 0.85
  90. added_score = 0.0
  91. for j in range(length):
  92. # [j,i]是指句子j指向句子i
  93. fraction = weight_graph[j][i] * 1.0
  94. # 除以j的出度
  95. added_score += fraction / denominator[j]
  96. weighted_score = (1 - d) + d * added_score
  97. return weighted_score
  98. def different(scores, old_scores, tol=0.0001):
  99. flag = False
  100. for i in range(len(scores)):
  101. if math.fabs(scores[i] - old_scores[i]) >= tol: # 原始是0.0001
  102. flag = True
  103. break
  104. return flag
  105. def combine(word_list, window=2):
  106. if window < 2:
  107. window = 2
  108. for x in range(1, window):
  109. if x >= len(word_list):
  110. break
  111. word_list2 = word_list[x:]
  112. res = zip(word_list, word_list2)
  113. for r in res:
  114. yield r
  115. def sentences_similarity(s1, s2):
  116. """计算两个句子的相似度
  117. :param s1: list
  118. :param s2: list
  119. :return: float
  120. """
  121. counter = 0
  122. for sent in s1:
  123. if sent in s2:
  124. counter += 1
  125. if counter == 0:
  126. return 0
  127. return counter / (math.log(len(s1) + len(s2)))
  128. # --------------------------------------------------------------------
  129. def is_chinese(uchar):
  130. """判断一个字符是否是汉字"""
  131. assert len(uchar) == 1, "uchar 只能是单个字符"
  132. if u'\u4e00' <= uchar <= u'\u9fa5':
  133. return True
  134. else:
  135. return False
  136. def is_number(uchar):
  137. """判断一个字符是否是数字"""
  138. assert len(uchar) == 1, "uchar 只能是单个字符"
  139. if u'\u0030' <= uchar <= u'\u0039':
  140. return True
  141. else:
  142. return False
  143. def is_alphabet(uchar):
  144. """判断一个字符是否是英文字母"""
  145. assert len(uchar) == 1, "uchar 只能是单个字符"
  146. if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
  147. return True
  148. else:
  149. return False
  150. def B2Q(uchar):
  151. """单字符半角转全角"""
  152. assert len(uchar) == 1, "uchar 只能是单个字符"
  153. inside_code = ord(uchar)
  154. if inside_code < 0x0020 or inside_code > 0x7e:
  155. # 不是半角字符就返回原来的字符
  156. return uchar
  157. if inside_code == 0x0020:
  158. # 除了空格其他的全角半角的公式为:半角=全角-0xfee0
  159. inside_code = 0x3000
  160. else:
  161. inside_code += 0xfee0
  162. return chr(inside_code)
  163. def Q2B(uchar):
  164. """单字符全角转半角"""
  165. assert len(uchar) == 1, "uchar 只能是单个字符"
  166. inside_code = ord(uchar)
  167. if inside_code == 0x3000:
  168. inside_code = 0x0020
  169. else:
  170. inside_code -= 0xfee0
  171. if inside_code < 0x0020 or inside_code > 0x7e:
  172. # 转完之后不是半角字符返回原来的字符
  173. return uchar
  174. return chr(inside_code)

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。