|
- # -*- encoding:utf-8 -*-
- """
- * Copyright (C) 2017 OwnThink.
- *
- * Name : findword.py - 新词发现
- * Author : Yener <yener@ownthink.com>
- * Version : 0.01
- * Description : 新词发现算法实现
- special thanks to
- http://www.matrix67.com/blog/archives/5044
- https://github.com/zoulala/New_words_find
- """
- import re
- from math import log
- from collections import Counter
-
# Longest fragment, in characters, that count_words extracts from a sentence.
max_word_len = 6
# Matches maximal runs of Unicode word characters (letters, digits, underscore).
# Despite the name it is not restricted to Chinese; punctuation and whitespace
# are what split a line into "sentences".
# A raw string is used so that ``\w`` is not treated as an (invalid) string
# escape, which newer Python versions warn about.
re_chinese = re.compile(r"[\w]+", re.U)
-
-
def count_words(input_file):
    """Count every character n-gram of up to ``max_word_len`` characters.

    Each line of the file is split into runs of word characters with
    ``re_chinese``; for every start position in a run, all fragments of
    length 1 .. ``max_word_len`` that fit inside the run are counted.

    Args:
        input_file: Path of a UTF-8 encoded text file.

    Returns:
        A ``Counter`` mapping each fragment to its number of occurrences.
    """
    word_freq = Counter()
    # The context manager closes the file even if reading or decoding raises.
    with open(input_file, 'r', encoding='utf8') as fin:
        for line in fin:
            for sentence in re_chinese.findall(line):
                length = len(sentence)
                for start in range(length):
                    # Fragments may not run past the end of the sentence.
                    longest = min(length - start, max_word_len)
                    word_freq.update(
                        sentence[start:start + size]
                        for size in range(1, longest + 1)
                    )
    return word_freq
-
-
def _record_extension(neighbor_freqs, fragment, extended_freq, word_freq,
                      total_word, min_freq, min_mtro):
    """Record ``extended_freq`` under ``fragment`` if the fragment qualifies.

    A fragment qualifies when its own frequency is strictly above
    ``min_freq`` and its cohesion score is strictly above ``min_mtro``.
    The score is ``freq(fragment) * total_word / (freq(a) * freq(b))``,
    taken as the minimum over two splits of the fragment: first character
    versus the rest, and everything but the last character versus the last.
    """
    fragment_freq = word_freq[fragment]
    if not fragment_freq > min_freq:
        return

    head_split = fragment_freq * total_word / (
        word_freq[fragment[1:]] * word_freq[fragment[0]])
    tail_split = fragment_freq * total_word / (
        word_freq[fragment[-1]] * word_freq[fragment[:-1]])
    if not min(head_split, tail_split) > min_mtro:
        return

    # The first list element is the fragment's own frequency; every further
    # element is the frequency of one one-character extension of it.
    neighbor_freqs.setdefault(fragment, [fragment_freq]).append(extended_freq)


def lrg_info(word_freq, total_word, min_freq, min_mtro):
    """Collect neighbour-frequency lists for candidate fragments.

    Every fragment of at least three characters is viewed as a
    one-character extension of two shorter fragments: its prefix
    (``word[:-1]``, extended on the right) and its suffix (``word[1:]``,
    extended on the left). The extension's frequency is recorded under each
    shorter fragment that passes the frequency and cohesion thresholds.

    Args:
        word_freq: Fragment -> count mapping. It is indexed directly, so it
            must either contain every sub-fragment or return 0 for missing
            keys (as a ``Counter`` does).
        total_word: Total number of counted fragments, used to scale the
            cohesion score.
        min_freq: A candidate's frequency must be strictly greater than this.
        min_mtro: A candidate's cohesion score must be strictly greater
            than this.

    Returns:
        A tuple ``(l_dict, r_dict)``. ``l_dict`` is keyed by prefixes and
        holds ``[prefix_freq, freq_of_extension, ...]`` (extensions on the
        right); ``r_dict`` is keyed by suffixes and holds the same layout
        for extensions on the left.
    """
    l_dict = {}
    r_dict = {}
    for word, freq in word_freq.items():
        if len(word) < 3:
            continue
        _record_extension(l_dict, word[:-1], freq, word_freq,
                          total_word, min_freq, min_mtro)
        _record_extension(r_dict, word[1:], freq, word_freq,
                          total_word, min_freq, min_mtro)
    return l_dict, r_dict
-
-
def cal_entro(r_dict):
    """Compute the neighbour entropy (base 2) of every fragment.

    Args:
        r_dict: Mapping from fragment to a list whose first element is the
            fragment's own frequency and whose remaining elements are the
            frequencies of its recorded extensions.

    Returns:
        A dict mapping each fragment to its entropy. Occurrences of the
        fragment that are not covered by any recorded extension are each
        treated as a distinct neighbour with probability ``1 / frequency``.
    """
    entropies = {}
    for fragment, (total, *neighbor_counts) in r_dict.items():
        entropy = 0

        # Occurrences with no recorded extension.
        uncovered = total - sum(neighbor_counts)
        if uncovered > 0:
            entropy -= 1 / total * log(1 / total, 2) * uncovered

        for count in neighbor_counts:
            share = count / total
            entropy -= share * log(share, 2)

        entropies[fragment] = entropy
    return entropies
-
-
def entro_lr_fusion(entro_r_dict, entro_l_dict):
    """Partition fragments by which entropy tables contain them.

    Args:
        entro_r_dict: Fragment -> entropy for one side.
        entro_l_dict: Fragment -> entropy for the other side.

    Returns:
        A tuple of three dicts:
        fragments present in both inputs, mapped to
        ``[entro_l_dict value, entro_r_dict value]``;
        fragments present only in ``entro_l_dict``;
        fragments present only in ``entro_r_dict``.
        Neither input is modified.
    """
    in_both = {}
    only_right = {}
    # Work on a copy: shared fragments are removed from it as they are found,
    # so whatever remains at the end exists only on the left side.
    only_left = entro_l_dict.copy()
    for fragment, right_entropy in entro_r_dict.items():
        try:
            left_entropy = only_left.pop(fragment)
        except KeyError:
            only_right[fragment] = right_entropy
        else:
            in_both[fragment] = [left_entropy, right_entropy]
    return in_both, only_left, only_right
-
-
def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro):
    """Keep the fragments whose entropy exceeds ``min_entro``.

    Args:
        entro_in_rl_dict: Fragment -> two-element sequence of entropies;
            both must be strictly greater than ``min_entro``.
        entro_in_l_dict: Fragment -> single entropy (only one side known).
        entro_in_r_dict: Fragment -> single entropy (only the other side
            known).
        word_freq: Fragment -> frequency mapping used for the result values.
        min_entro: Exclusive lower bound for an entropy to pass.

    Returns:
        A dict mapping every accepted fragment to its frequency. Note that a
        fragment with only one known entropy is accepted on that side alone.
    """
    entro_dict = {}
    for word, entropies in entro_in_rl_dict.items():
        if entropies[0] > min_entro and entropies[1] > min_entro:
            entro_dict[word] = word_freq[word]

    # The two one-sided tables are filtered by the same rule.
    for one_sided in (entro_in_l_dict, entro_in_r_dict):
        for word, entropy in one_sided.items():
            if entropy > min_entro:
                entro_dict[word] = word_freq[word]

    return entro_dict
-
-
def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
    """Discover candidate words in a text file and write them to a file.

    The pipeline counts character n-grams, keeps fragments that pass the
    frequency and cohesion thresholds, computes their neighbour entropies
    and finally keeps the fragments whose entropy passes ``min_entro``.

    Args:
        input_file: Path of the UTF-8 text file to analyse.
        output_file: Path of the UTF-8 result file. One line is written per
            accepted fragment, as ``fragment<TAB>frequency``, sorted by
            frequency in descending order.
        min_freq: A fragment must occur strictly more often than this.
        min_mtro: A fragment's cohesion score must be strictly greater
            than this.
        min_entro: A fragment's neighbour entropy (base 2) must be strictly
            greater than this.
    """
    word_freq = count_words(input_file)
    total_word = sum(word_freq.values())

    l_dict, r_dict = lrg_info(word_freq, total_word, min_freq, min_mtro)

    # l_dict is keyed by prefixes and stores the frequencies of their
    # right-hand extensions, so it yields the right-side entropy (and
    # r_dict, symmetrically, the left-side entropy).
    entro_r_dict = cal_entro(l_dict)
    entro_l_dict = cal_entro(r_dict)

    entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
    entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro)
    result = sorted(entro_dict.items(), key=lambda item: item[1], reverse=True)

    with open(output_file, 'w', encoding='utf-8') as kf:
        kf.writelines('%s\t%d\n' % (word, freq) for word, freq in result)
|