You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

findword.py 3.7 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. # -*- encoding:utf-8 -*-
  2. import re
  3. from math import log
  4. from collections import Counter
  5. max_word_len = 6
  6. re_chinese = re.compile(u"[\w]+", re.U)
  7. def count_words(input_file):
  8. word_freq = Counter()
  9. fin = open(input_file, 'r', encoding='utf8')
  10. for index, line in enumerate(fin):
  11. words = []
  12. for sentence in re_chinese.findall(line):
  13. length = len(sentence)
  14. for i in range(length):
  15. words += [sentence[i: j + i] for j in range(1, min(length - i + 1, max_word_len + 1))]
  16. word_freq.update(words)
  17. fin.close()
  18. return word_freq
  19. def lrg_info(word_freq, total_word, min_freq, min_mtro):
  20. l_dict = {}
  21. r_dict = {}
  22. for word, freq in word_freq.items():
  23. if len(word) < 3:
  24. continue
  25. left_word = word[:-1]
  26. right_word = word[1:]
  27. def __update_dict(side_dict, side_word):
  28. side_word_freq = word_freq[side_word]
  29. if side_word_freq > min_freq:
  30. mul_info1 = side_word_freq * total_word / (word_freq[side_word[1:]] * word_freq[side_word[0]])
  31. mul_info2 = side_word_freq * total_word / (word_freq[side_word[-1]] * word_freq[side_word[:-1]])
  32. mul_info = min(mul_info1, mul_info2)
  33. if mul_info > min_mtro:
  34. if side_word in side_dict:
  35. side_dict[side_word].append(freq)
  36. else:
  37. side_dict[side_word] = [side_word_freq, freq]
  38. __update_dict(l_dict, left_word)
  39. __update_dict(r_dict, right_word)
  40. return l_dict, r_dict
  41. def cal_entro(r_dict):
  42. entro_r_dict = {}
  43. for word in r_dict:
  44. m_list = r_dict[word]
  45. r_list = m_list[1:]
  46. entro_r = 0
  47. sum_r_list = sum(r_list)
  48. for rm in r_list:
  49. entro_r -= rm / sum_r_list * log(rm / sum_r_list, 2)
  50. entro_r_dict[word] = entro_r
  51. return entro_r_dict
  52. def entro_lr_fusion(entro_r_dict, entro_l_dict):
  53. entro_in_rl_dict = {}
  54. entro_in_r_dict = {}
  55. entro_in_l_dict = entro_l_dict.copy()
  56. for word in entro_r_dict:
  57. if word in entro_l_dict:
  58. entro_in_rl_dict[word] = [entro_l_dict[word], entro_r_dict[word]]
  59. entro_in_l_dict.pop(word)
  60. else:
  61. entro_in_r_dict[word] = entro_r_dict[word]
  62. return entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict
  63. def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro):
  64. entro_dict = {}
  65. for word in entro_in_rl_dict:
  66. if entro_in_rl_dict[word][0] > min_entro and entro_in_rl_dict[word][1] > min_entro:
  67. entro_dict[word] = word_freq[word]
  68. for word in entro_in_l_dict:
  69. if entro_in_l_dict[word] > min_entro:
  70. entro_dict[word] = word_freq[word]
  71. for word in entro_in_r_dict:
  72. if entro_in_r_dict[word] > min_entro:
  73. entro_dict[word] = word_freq[word]
  74. return entro_dict
  75. def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
  76. word_freq = count_words(input_file)
  77. total_word = sum(word_freq.values())
  78. l_dict, r_dict = lrg_info(word_freq, total_word, min_freq, min_mtro)
  79. entro_r_dict = cal_entro(l_dict)
  80. entro_l_dict = cal_entro(r_dict)
  81. entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
  82. entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro)
  83. result = sorted(entro_dict.items(), key=lambda x: x[1], reverse=True)
  84. with open(output_file, 'w', encoding='utf-8') as kf:
  85. for w, m in result:
  86. kf.write(w + '\t%d\n' % m)

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、情感分析、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家