You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

mmseg.py 4.0 kB

6 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121
  1. #!/usr/bin/env python
  2. # encoding: utf-8
  3. """
  4. * Copyright (C) 2018 OwnThink.
  5. *
  6. * Name : mmseg.py
  7. * Author : Leo <1162441289@qq.com>
  8. * Version : 0.01
  9. * Description : mmseg分词方法,目前算法比较耗时,仍在优化中
  10. """
  11. import os
  12. import pickle
  13. from math import log
  14. from collections import defaultdict
  15. def add_curr_dir(name):
  16. return os.path.join(os.path.dirname(__file__), name)
  17. class Trie(object):
  18. def __init__(self):
  19. self.root = {}
  20. self.value = "value"
  21. self.trie_file_path = os.path.join(os.path.dirname(__file__), "data/Trie.pkl")
  22. def get_matches(self, word):
  23. ret = []
  24. node = self.root
  25. for c in word:
  26. if c not in node:
  27. break
  28. node = node[c]
  29. if self.value in node:
  30. ret.append(node[self.value])
  31. return ret
  32. def load(self):
  33. with open(self.trie_file_path, "rb") as f:
  34. data = pickle.load(f)
  35. self.root = data
  36. class Chunk:
  37. def __init__(self, words_list, chrs, word_freq):
  38. # self.sentence_sep = ['?', '!', ';', '?', '!', '。', ';', '……', '…', ",", ",", "."]
  39. self.words = words_list
  40. self.lens_list = map(lambda x: len(x), words_list)
  41. self.length = sum(self.lens_list)
  42. self.mean = float(self.length) / len(words_list)
  43. self.var = sum(map(lambda x: (x - self.mean) ** 2, self.lens_list)) / len(self.words)
  44. self.entropy = sum([log(float(chrs.get(x, 1))) for x in words_list])
  45. # 计算词频信息熵
  46. self.word_entropy = sum([log(float(word_freq.get(x, 1))) for x in words_list])
  47. def __lt__(self, other):
  48. return (self.length, self.mean, -self.var, self.entropy, self.word_entropy) < \
  49. (other.length, other.mean, -other.var, other.entropy, other.word_entropy)
  50. class MMSeg:
  51. def __init__(self):
  52. # 加载词语字典
  53. trie = Trie()
  54. trie.load()
  55. self.words_dic = trie
  56. # 加载字频字典
  57. self.chrs_dic = self._load_freq(filename="data/chars.dic")
  58. # 加载词频字典
  59. self.word_freq = self._load_freq(filename="data/words.dic")
  60. def _load_freq(self, filename):
  61. chrs_dic = defaultdict()
  62. with open(add_curr_dir(filename), "r", encoding="utf-8") as f:
  63. for line in f:
  64. if line:
  65. key, value = line.strip().split(" ")
  66. chrs_dic.setdefault(key, int(value))
  67. return chrs_dic
  68. def __get_start_words(self, sentence):
  69. match_words = self.words_dic.get_matches(sentence)
  70. if sentence:
  71. if not match_words:
  72. return [sentence[0]]
  73. else:
  74. return match_words
  75. else:
  76. return False
  77. def __get_chunks(self, sentence):
  78. # 获取chunk,每个chunk中最多三个词
  79. ret = []
  80. def _iter_chunk(sentence, num, tmp_seg_words):
  81. match_words = self.__get_start_words(sentence)
  82. if (not match_words or num == 0) and tmp_seg_words:
  83. ret.append(Chunk(tmp_seg_words, self.chrs_dic, self.word_freq))
  84. else:
  85. for word in match_words:
  86. _iter_chunk(sentence[len(word):], num - 1, tmp_seg_words + [word])
  87. _iter_chunk(sentence, num=3, tmp_seg_words=[])
  88. return ret
  89. def cws(self, sentence):
  90. """
  91. :param sentence: 输入的数据
  92. :return: 返回的分词生成器
  93. """
  94. while sentence:
  95. chunks = self.__get_chunks(sentence)
  96. word = max(chunks).words[0]
  97. sentence = sentence[len(word):]
  98. yield word
  99. if __name__ == "__main__":
  100. mmseg = MMSeg()
  101. print(list(mmseg.cws("武汉市长江大桥上的日落非常好看,很喜欢看日出日落。")))
  102. print(list(mmseg.cws("人要是行干一行行一行.")))

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具的优缺点制作,将Jiagu回馈给大家。

Contributors (1)