You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-'), and can be up to 35 characters long.

mmseg.py 3.9 kB

6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. import os
  4. import pickle
  5. from math import log
  6. from collections import defaultdict
  7. def add_curr_dir(name):
  8. return os.path.join(os.path.dirname(__file__), name)
  9. class Trie(object):
  10. def __init__(self):
  11. self.root = {}
  12. self.value = "value"
  13. self.trie_file_path = os.path.join(os.path.dirname(__file__), "data/Trie.pkl")
  14. def get_matches(self, word):
  15. ret = []
  16. node = self.root
  17. for c in word:
  18. if c not in node:
  19. break
  20. node = node[c]
  21. if "value" in node:
  22. ret.append(node["value"])
  23. return ret
  24. def load(self):
  25. with open(self.trie_file_path, "rb") as f:
  26. data = pickle.load(f)
  27. self.root = data
  28. class Chunk:
  29. def __init__(self, words_list, chrs):
  30. # self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."]
  31. self.best_word = words_list[0]
  32. self.words_num = len(words_list)
  33. self.length = 0
  34. self.entropy = 0
  35. length_square = 0
  36. for word in words_list:
  37. word_length = len(word)
  38. self.length += word_length
  39. self.entropy += log(chrs.get(word, 1))
  40. length_square += word_length * word_length
  41. self.mean = self.length / self.words_num
  42. self.var = length_square / self.words_num - (self.length / self.words_num) * (self.length / self.words_num)
  43. def __lt__(self, other):
  44. return (self.length, self.mean, -self.var, self.entropy) < \
  45. (other.length, other.mean, -other.var, other.entropy)
  46. class MMSeg:
  47. def __init__(self):
  48. # 加载词语字典
  49. trie = Trie()
  50. trie.load()
  51. self.words_dic = trie
  52. # 加载字频字典
  53. self.chrs_dic = self._load_freq(filename="data/chars.dic")
  54. def _load_freq(self, filename):
  55. chrs_dic = defaultdict()
  56. with open(add_curr_dir(filename), "r", encoding="utf-8") as f:
  57. for line in f:
  58. if line:
  59. key, value = line.strip().split(" ")
  60. chrs_dic.setdefault(key, int(value))
  61. return chrs_dic
  62. def __get_start_words(self, sentence):
  63. if sentence:
  64. match_words = self.words_dic.get_matches(sentence)
  65. return match_words if match_words else [sentence[0]]
  66. else:
  67. return False
  68. def __get_chunks(self, sentence):
  69. # 获取chunk,每个chunk中最多三个词
  70. first_match_words = self.__get_start_words(sentence)
  71. for word_one in first_match_words:
  72. word_one_length = len(word_one)
  73. second_match_words = self.__get_start_words(sentence[word_one_length:])
  74. if second_match_words:
  75. for word_two in second_match_words:
  76. word_two_length = len(word_two) + word_one_length
  77. third_match_words = self.__get_start_words(sentence[word_two_length:])
  78. if third_match_words:
  79. for word_three in third_match_words:
  80. yield (Chunk([word_one, word_two, word_three], self.chrs_dic))
  81. else:
  82. yield (Chunk([word_one, word_two], self.chrs_dic))
  83. else:
  84. yield (Chunk([word_one], self.chrs_dic))
  85. def cws(self, sentence):
  86. """
  87. :param sentence: 输入的数据
  88. :return: 返回的分词生成器
  89. """
  90. while sentence:
  91. chunks = self.__get_chunks(sentence)
  92. word = max(chunks).best_word
  93. sentence = sentence[len(word):]
  94. yield word
  95. if __name__ == "__main__":
  96. mmseg = MMSeg()
  97. print(list(mmseg.cws("武汉市长江大桥上的日落非常好看,很喜欢看日出日落。")))
  98. print(list(mmseg.cws("人要是行干一行行一行.")))

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。