# OpenI / Jiagu
# -*- encoding:utf-8 -*-
"""
 * Copyright (C) 2017 OwnThink.
 *
 * Name        : findword.py - 新词发现
 * Author      : Yener <yener@ownthink.com>
 * Version     : 0.01
 * Description : 新词发现算法实现
 special thanks to 
 http://www.matrix67.com/blog/archives/5044
 https://github.com/zoulala/New_words_find
"""
import re
from math import log
from collections import Counter

# Longest character n-gram (in characters) that count_words extracts.
max_word_len = 6
# Matches maximal runs of Unicode word characters (letters, digits and
# underscore -- despite the name it is not restricted to Chinese). Anything
# between two matches acts as a segment boundary.
# Written as a raw string so that "\w" reaches the regex engine untouched
# instead of being parsed as an (invalid) string escape sequence.
re_chinese = re.compile(r"[\w]+", re.U)


def count_words(input_file):
    """Count every character n-gram of length 1..max_word_len in a text file.

    Each line of the UTF-8 encoded file is split into segments with
    ``re_chinese``; for every start position inside a segment, all substrings
    of up to ``max_word_len`` characters that fit in the segment are counted.

    Args:
        input_file: Path of the UTF-8 text file to read.

    Returns:
        A ``collections.Counter`` mapping each n-gram to its number of
        occurrences.
    """
    word_freq = Counter()
    # The context manager closes the handle even if decoding or counting
    # raises part-way through the file.
    with open(input_file, 'r', encoding='utf8') as fin:
        for line in fin:
            for sentence in re_chinese.findall(line):
                length = len(sentence)
                # Feed the n-grams straight into the counter instead of
                # accumulating them in a temporary per-line list.
                word_freq.update(
                    sentence[start:start + size]
                    for start in range(length)
                    for size in range(1, min(length - start, max_word_len) + 1)
                )
    return word_freq


def _collect_extension(table, candidate, ext_freq, word_freq, total_word,
                       min_freq, min_mtro):
    """Record ``ext_freq`` under ``candidate`` if the candidate qualifies.

    A candidate qualifies when its own frequency exceeds ``min_freq`` and the
    smaller of its two cohesion ratios exceeds ``min_mtro``. The ratio for a
    split of the candidate into parts ``a`` and ``b`` is
    ``freq(candidate) * total_word / (freq(a) * freq(b))``; the two splits
    examined are first character | rest and all-but-last | last character.
    """
    if candidate in table:
        # Already accepted earlier. The thresholds depend only on the
        # candidate itself, so the check does not need to be repeated.
        table[candidate].append(ext_freq)
        return

    cand_freq = word_freq[candidate]
    if cand_freq <= min_freq:
        return

    scaled = cand_freq * total_word
    head_split = scaled / (word_freq[candidate[1:]] * word_freq[candidate[0]])
    tail_split = scaled / (word_freq[candidate[-1]] * word_freq[candidate[:-1]])
    if min(head_split, tail_split) > min_mtro:
        # First slot holds the candidate's own frequency, the following
        # slots the frequencies of its one-character extensions.
        table[candidate] = [cand_freq, ext_freq]


def lrg_info(word_freq, total_word, min_freq, min_mtro):
    """Collect candidate words together with the counts of their extensions.

    Every n-gram of at least three characters is treated as a one-character
    extension of two shorter candidates: the n-gram without its last
    character and the n-gram without its first character. Candidates that
    pass the frequency and cohesion thresholds are stored with the
    frequencies of all n-grams that extend them.

    Args:
        word_freq: Mapping from n-gram to frequency.
        total_word: Sum of all frequencies in ``word_freq``; used to scale
            the cohesion ratio.
        min_freq: A candidate must occur strictly more often than this.
        min_mtro: The candidate's minimum cohesion ratio must be strictly
            greater than this.

    Returns:
        A tuple ``(l_dict, r_dict)``. ``l_dict`` is keyed by the n-gram
        minus its last character, ``r_dict`` by the n-gram minus its first
        character. Each value is a list ``[candidate_freq, ext_freq, ...]``.
    """
    l_dict = {}
    r_dict = {}
    for word, freq in word_freq.items():
        if len(word) < 3:
            # Dropping a character must leave a candidate of length >= 2.
            continue
        _collect_extension(l_dict, word[:-1], freq, word_freq, total_word,
                           min_freq, min_mtro)
        _collect_extension(r_dict, word[1:], freq, word_freq, total_word,
                           min_freq, min_mtro)
    return l_dict, r_dict


def cal_entro(r_dict):
    """Compute a base-2 neighbour entropy for every candidate word.

    Args:
        r_dict: Mapping from candidate word to a list whose first element is
            the candidate's frequency and whose remaining elements are the
            frequencies of its observed extensions.

    Returns:
        A dict mapping each candidate word to its entropy. Occurrences of
        the candidate that are not covered by any listed extension are each
        counted as a separate outcome with probability ``1 / frequency``.
    """
    entropies = {}
    for word, counts in r_dict.items():
        total = counts[0]
        neighbours = counts[1:]

        entropy = 0
        # Occurrences with no recorded extension.
        leftover = total - sum(neighbours)
        if leftover > 0:
            entropy -= 1 / total * log(1 / total, 2) * leftover

        for count in neighbours:
            entropy -= count / total * log(count / total, 2)

        entropies[word] = entropy
    return entropies


def entro_lr_fusion(entro_r_dict, entro_l_dict):
    """Partition candidate words by which entropy tables contain them.

    Args:
        entro_r_dict: Mapping from word to its right-side entropy.
        entro_l_dict: Mapping from word to its left-side entropy.

    Returns:
        A tuple of three dicts:
        words present in both inputs, mapped to ``[left, right]`` entropy;
        words present only in ``entro_l_dict``, mapped to their entropy;
        words present only in ``entro_r_dict``, mapped to their entropy.
        Neither input is modified.
    """
    both_sides = {}
    right_only = {}
    # Start from a copy of the left table and remove every shared word.
    left_only = entro_l_dict.copy()
    for word, right_entropy in entro_r_dict.items():
        if word not in entro_l_dict:
            right_only[word] = right_entropy
            continue
        both_sides[word] = [left_only.pop(word), right_entropy]
    return both_sides, left_only, right_only


def entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro):
    """Keep the candidate words whose entropy exceeds ``min_entro``.

    Args:
        entro_in_rl_dict: Mapping from word to a two-element sequence of
            entropies; both must be strictly greater than ``min_entro``.
        entro_in_l_dict: Mapping from word to a single entropy value.
        entro_in_r_dict: Mapping from word to a single entropy value.
        word_freq: Mapping from word to frequency, used for the result
            values.
        min_entro: Exclusive lower bound applied to every entropy.

    Returns:
        A dict mapping each accepted word to ``word_freq[word]``. Two-sided
        words come first, then left-only, then right-only words.
    """
    entro_dict = {}

    for word, pair in entro_in_rl_dict.items():
        if pair[0] > min_entro and pair[1] > min_entro:
            entro_dict[word] = word_freq[word]

    # Words known on one side only are judged on that single entropy.
    for one_sided in (entro_in_l_dict, entro_in_r_dict):
        for word, entropy in one_sided.items():
            if entropy > min_entro:
                entro_dict[word] = word_freq[word]

    return entro_dict


def new_word_find(input_file, output_file, min_freq=10, min_mtro=80, min_entro=3):
    """Discover candidate new words in a text file and write them to disk.

    The pipeline counts character n-grams, keeps candidates that pass the
    frequency and cohesion thresholds, computes their neighbour entropies,
    filters on entropy, and writes the survivors sorted by descending
    frequency.

    Args:
        input_file: Path of the UTF-8 text file to analyse.
        output_file: Path of the UTF-8 file to write; each line has the form
            ``word<TAB>frequency``.
        min_freq: A candidate must occur strictly more often than this.
        min_mtro: Exclusive lower bound on a candidate's cohesion ratio.
        min_entro: Exclusive lower bound on a candidate's neighbour entropy.

    The threshold defaults reproduce the values that were previously
    hard-coded in this function.
    """
    word_freq = count_words(input_file)
    total_word = sum(word_freq.values())

    l_dict, r_dict = lrg_info(word_freq, total_word, min_freq, min_mtro)

    # l_dict is keyed by an n-gram minus its LAST character, so its counts
    # describe what follows the candidate, i.e. the right-side entropy.
    # r_dict is the mirror image and yields the left-side entropy.
    entro_r_dict = cal_entro(l_dict)
    entro_l_dict = cal_entro(r_dict)

    entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict = entro_lr_fusion(entro_r_dict, entro_l_dict)
    entro_dict = entro_filter(entro_in_rl_dict, entro_in_l_dict, entro_in_r_dict, word_freq, min_entro)
    # The sort is stable, so words with equal frequency keep the order in
    # which the filter produced them.
    result = sorted(entro_dict.items(), key=lambda x: x[1], reverse=True)

    with open(output_file, 'w', encoding='utf-8') as kf:
        kf.writelines('%s\t%d\n' % (w, m) for w, m in result)