
nroute.py 8.1 kB

import re
import os
import sys
from math import log

from jiagu.perceptron import Perceptron

# Single ASCII letters/digits (buffered into one token during the cut).
re_eng = re.compile('[a-zA-Z0-9]', re.U)
# Blocks of han characters, letters, digits and a few word-internal symbols.
re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
# Whitespace and line breaks.
re_skip = re.compile(r"(\r\n|\s)", re.U)
class Segment:
    def __init__(self):
        self.vocab = {}          # word -> frequency
        self.max_word_len = 0    # length of the longest word in the vocabulary
        self.max_freq = 0        # highest single-word frequency seen so far
        self.total_freq = 0      # sum of all frequencies (probability normalizer)
        self.initialized = False
        self.model = None        # character-based CWS model (Perceptron)
    def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict',
             model_path='model/cws.model'):
        self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
        self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab))
        self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path))
        self.initialized = True
    def load_vocab(self, vocab_path):
        # Each line is "word" or "word<TAB>freq"; blank lines are skipped.
        with open(vocab_path, 'r', encoding='utf8') as fin:
            for line in fin:
                line = line.strip()
                if line == '':
                    continue
                word_freq_tag = line.split('\t')
                if len(word_freq_tag) == 1:
                    word = word_freq_tag[0]
                    self.add_vocab(word)
                elif len(word_freq_tag) == 2:
                    word = word_freq_tag[0]
                    freq = int(word_freq_tag[1])
                    self.add_vocab(word, freq)
    def add_vocab(self, word=None, freq=None, tag=None):
        # Add a word or increase its frequency; a missing freq defaults to the
        # current maximum frequency.
        if freq is None:
            freq = self.max_freq
        if word not in self.vocab:
            self.vocab[word] = 0
        self.vocab[word] += freq
        self.total_freq += freq
        if freq > self.max_freq:
            self.max_freq = freq
        if len(word) > self.max_word_len:
            self.max_word_len = len(word)

    def del_vocab(self, word=None, freq=None, tag=None):
        # Decrease a word's frequency; remove the word entirely when freq is None
        # or at least as large as the stored frequency.
        if word not in self.vocab:
            return None
        vocab_freq = self.vocab[word]
        if freq is None or vocab_freq <= freq:
            del self.vocab[word]
            self.total_freq -= vocab_freq
        else:
            self.vocab[word] -= freq
            self.total_freq -= freq
        # Note: self.max_freq and self.max_word_len are not recomputed here.
    def load_userdict(self, userdict):
        # userdict is either a path to a dictionary file or an iterable of
        # words / [word, freq] pairs.
        if not self.initialized:
            self.init()
        if isinstance(userdict, str):
            self.load_vocab(userdict)
            return
        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    word = item[0]
                    self.add_vocab(word)
                elif len(item) == 2:
                    word = item[0]
                    freq = item[1]
                    self.add_vocab(word, freq)
            elif isinstance(item, str):
                self.add_vocab(word=item)

    def del_userdict(self, userdict):
        # Mirror of load_userdict for removing words / [word, freq] pairs.
        if not self.initialized:
            self.init()
        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    word = item[0]
                    self.del_vocab(word)
                elif len(item) == 2:
                    word = item[0]
                    freq = item[1]
                    self.del_vocab(word, freq)
            elif isinstance(item, str):
                self.del_vocab(word=item)
    def calc_route(self, sentence, DAG, route):
        # Dynamic programming from right to left: route[idx] = (score, end), where
        # end is the word boundary that maximizes log P(word) plus the best score
        # of the remaining suffix.
        vocab = self.vocab
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total_freq)
        for idx in range(N - 1, -1, -1):
            route[idx] = max((log(vocab.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
                             for x in DAG[idx])

    def create_DAG(self, sentence):
        # DAG[idx] lists every end index x such that sentence[idx:x+1] is in the
        # vocabulary; idx itself is always a candidate, so single characters are
        # always reachable.
        vocab = self.vocab
        max_word_len = self.max_word_len
        DAG = {}
        N = len(sentence)
        for idx in range(N):
            cand_idx = [idx]
            for i in range(idx + 1, idx + min(max_word_len, N - idx)):
                cand = sentence[idx:i + 1]
                if cand in vocab:
                    cand_idx.append(i)
            DAG[idx] = cand_idx
        return DAG
    def cut_search(self, sentence):
        # Search-engine style cut: yield every dictionary word found in the
        # sentence, overlapping candidates included.
        DAG = self.create_DAG(sentence)
        old_j = -1
        for k, L in DAG.items():
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j

    def cut_vocab(self, sentence):
        # Cut along the maximum-probability route computed by calc_route.
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if buf:
                yield buf
                buf = ''
            yield l_word
            x = y
        if buf:
            yield buf
            buf = ''
    def cut_words(self, sentence):
        # Like cut_vocab, but consecutive single ASCII letters/digits are buffered
        # and emitted as one token (e.g. '341' instead of '3', '4', '1').
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
    def model_cut(self, sentence):
        # Segment with the character-based sequence-labeling model (B/M/E/S tags).
        if sentence == '':
            return ['']
        sentence = list(sentence)
        labels = self.model.predict(sentence)
        return self.__lab2word(sentence, labels)

    def __lab2word(self, sentence, labels):
        # Convert a B/M/E/S label sequence back into a list of words.
        sen_len = len(sentence)
        tmp_word = ""
        words = []
        for i in range(sen_len):
            label = labels[i]
            w = sentence[i]
            if label == "B":
                tmp_word += w
            elif label == "M":
                tmp_word += w
            elif label == "E":
                tmp_word += w
                words.append(tmp_word)
                tmp_word = ""
            else:  # "S" or any other tag: the character stands alone
                if tmp_word != '':
                    words.append(tmp_word)
                    tmp_word = ""
                words.append(w)
        if tmp_word:
            words.append(tmp_word)
        return words
    def seg_default(self, sentence):
        # Split the text into han/alnum blocks and everything else; segment the
        # former with cut_words and pass whitespace and other characters through.
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        cut_all = False
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                for word in cut_block(block):
                    yield word
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
    def seg_new_word(self, sentence):
        # "probe" mode: compare the dictionary-based cut with the model-based cut,
        # temporarily add promising new words to the vocabulary, cut again, and
        # then remove them.
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        cut_all = False
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                words1 = list(cut_block(block))   # dictionary-based segmentation
                words2 = self.model_cut(block)    # model-based segmentation
                # Candidate new words: n-grams over words1 that the model also
                # produced. Conflicting candidates and candidates longer than 4
                # characters are skipped; everything added here is removed again
                # after the cut.
                new_word = []
                length = len(words1)
                for n in range(3):
                    can_limit = length - n + 1
                    for i in range(0, can_limit):
                        ngram = ''.join(words1[i:i + n])
                        word_len = len(ngram)
                        if word_len > 4 or word_len == 1:
                            continue
                        if ngram in words2 and ngram not in words1:
                            new_word.append([ngram, 1])
                self.load_userdict(new_word)
                for word in cut_block(block):
                    yield word
                # Remove the temporary dictionary entries.
                self.del_userdict(new_word)
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    elif not cut_all:
                        for xx in x:
                            yield xx
                    else:
                        yield x
    def seg(self, sentence, mode="default"):
        # mode="default": dictionary-based cut; mode="probe": additionally use the
        # CWS model to discover new words.
        if not self.initialized:
            self.init()
        if mode == 'probe':
            return list(self.seg_new_word(sentence))
        else:
            return list(self.seg_default(sentence))
if __name__ == '__main__':
    s = Segment()
    # s.load_userdict('dict/user.dict')
    # s.load_userdict(['知识图谱'])
    # text = '辽宁省铁岭市西丰县房木镇潭清村东屯'  # bug
    # text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
    # text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室'
    # text = '北京市西城区茶马街8号院1号楼15层1502'
    # text = '西藏自治区林芝市米林县羌纳乡羌渡岗村'
    # text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05'
    # text = '深圳市福田区福强路中港城裙楼6E部分602-A'  # bug
    # text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室'
    # text = '五常市向阳镇致富村庆丰营屯'
    # text = '中牟县中兴路与益民巷交叉口路南'
    # text = '黄山市屯溪区华馨路38号二楼'
    text = '银川市金凤区北京中路福宁城11-1-号'
    # Probe mode adds new words to the dictionary on the fly; conflicting ones are
    # skipped, and every temporary entry is removed again after the cut.
    # words = s.seg(text)
    # print(words)
    words = s.seg(text, 'probe')
    print('----------------')
    print(words)
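
To make the dictionary-based path concrete, below is a minimal, standalone sketch of the DAG plus maximum-probability-route idea that create_DAG, calc_route and cut_vocab implement. The vocabulary and frequency counts are made up for illustration and are not taken from jiagu.dict.

from math import log

# Toy vocabulary: word -> frequency (illustrative counts only).
vocab = {'南京': 3, '南京市': 4, '市': 2, '长江': 5, '大桥': 4, '长江大桥': 2,
         '南': 1, '京': 1, '长': 1, '江': 1, '大': 1, '桥': 1}
total = sum(vocab.values())
sentence = '南京市长江大桥'
N = len(sentence)

# Build the DAG: for each start index, every end index that forms a known word.
DAG = {i: [j for j in range(i, N) if sentence[i:j + 1] in vocab] or [i] for i in range(N)}

# Right-to-left dynamic programming, as in calc_route: pick the word end that
# maximizes log P(word) plus the best score of the remaining suffix.
route = {N: (0.0, 0)}
for i in range(N - 1, -1, -1):
    route[i] = max((log(vocab.get(sentence[i:j + 1], 1)) - log(total) + route[j + 1][0], j)
                   for j in DAG[i])

# Read the best segmentation off the route table, as in cut_vocab.
i, words = 0, []
while i < N:
    j = route[i][1] + 1
    words.append(sentence[i:j])
    i = j
print(words)  # ['南京市', '长江大桥'] with these toy counts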

Jiagu is trained on large-scale corpora. It provides common natural language processing features such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge graph relation extraction, keyword extraction, text summarization, new word discovery, and text clustering. It was built with the strengths and weaknesses of the major toolkits in mind, and is given back to the community.
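
As a rough illustration of how the library is meant to be consumed, here is a minimal usage sketch. The top-level functions jiagu.seg, jiagu.pos and jiagu.ner are assumed from the project README rather than defined in this file, so treat them as assumptions.

import jiagu

text = '厦门明天会不会下雨'
words = jiagu.seg(text)      # word segmentation (backed by the module above)
print(words)
print(jiagu.pos(words))      # part-of-speech tags, one per word
print(jiagu.ner(words))      # named entity labels, one per word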