From fb03abeb67d4bf6de2d0884d285d333cf4780ffc Mon Sep 17 00:00:00 2001
From: dirtdust
Date: Sun, 4 Aug 2019 14:47:30 +0800
Subject: [PATCH] [add]-upload LDA module

---
 jiagu/lda | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 146 insertions(+)
 create mode 100644 jiagu/lda

diff --git a/jiagu/lda b/jiagu/lda
new file mode 100644
index 0000000..fb37aed
--- /dev/null
+++ b/jiagu/lda
@@ -0,0 +1,146 @@
+# -*- coding: utf-8 -*-
+
+import glob
+import jiagu
+import numpy as np
+from random import random
+
+
+def normalize(vec):
+    total = sum(vec)
+    assert(abs(total) > 1e-6)
+    for i in range(len(vec)):
+        assert(vec[i] >= 0)
+        vec[i] = float(vec[i]) / total
+
+
+def get_prob(vec, prob):
+    assert (len(vec) == len(prob))
+    # normalize the distribution, then sample one element of vec proportionally to it
+    normalize(prob)
+    r = random()
+    index = -1
+    while r > 0:
+        index = index + 1
+        r = r - prob[index]
+    return vec[index]
+
+
+class Document(object):
+    def __init__(self, filename):
+        self.doc_name = filename[:-4]
+        self.__load_document(filename)
+
+    def __load_document(self, filename):
+        """
+        Load one article; by default each file contains a single article.
+        :param filename: path to a *.txt file
+        :return: self.document   the full article text
+                 self.words_list all words in the article (segmented by jiagu)
+        """
+        try:
+            doc_file = open(filename, "r", encoding="utf-8")
+            self.document = ""
+            self.words_list = []
+            for line in doc_file:
+                if line:
+                    line = line.strip().replace("\t", "")
+                    self.document += line
+                    self.words_list.extend(jiagu.seg(line))
+        except Exception as e:
+            print("Failed to load file, error: {}".format(e))
+
+
+class Corpus(object):
+    def __init__(self, filepath):
+        self.Documents = []
+        self.filepath = filepath
+        self._build_corpus()
+
+    def _build_corpus(self):
+        """
+        Load every article in the target folder and build the vocabulary.
+        :return: True on success, False if the folder contains no .txt files
+        """
+        vocabulary = set()
+        files = glob.glob(self.filepath + "/*.txt")
+        if len(files) > 0:
+            for each in files:
+                target = Document(each)
+                self.Documents.append(target)
+                for word in target.words_list:
+                    vocabulary.add(word)
+            self.vocabulary = list(vocabulary)
+            return True
+        else:
+            print("No .txt files found in the target folder!")
+            return False
+
+
+class LdaModel(object):
+    def __init__(self, filepath, number_of_topics, alpha=50, beta=0.1, iteration=3):
+        self.alpha = alpha
+        self.beta = beta
+        self.iteration = iteration
+        self.corpus = Corpus(filepath)
+        self.number_of_topics = number_of_topics
+        self.__initialize_all()
+
+    def __initialize_all(self):
+        print("LDA Initializing... "
+              "\nnumber of topics : {}, iteration : {}".format(self.number_of_topics, self.iteration))
+        self.number_of_documents = len(self.corpus.Documents)
+        assert(self.number_of_documents > self.number_of_topics)
+        self.document_topic_counts = np.zeros([self.number_of_documents, self.number_of_topics], dtype=int)
+        self.topic_word_counts = np.zeros([self.number_of_topics, len(self.corpus.vocabulary)], dtype=int)
+        self.current_word_topic_assignments = []
+        self.topic_counts = np.zeros(self.number_of_topics)
+        self.doc_name = dict()
+        for d_index, document in enumerate(self.corpus.Documents):
+            self.doc_name.setdefault(d_index, document.doc_name)
+            word_topic_assignments = []
+            for word in document.words_list:
+                if word in self.corpus.vocabulary:
+                    w_index = self.corpus.vocabulary.index(word)
+                    starting_topic_index = np.random.randint(self.number_of_topics)
+                    word_topic_assignments.append(starting_topic_index)
+                    self.document_topic_counts[d_index, starting_topic_index] += 1
+                    self.topic_word_counts[starting_topic_index, w_index] += 1
+                    self.topic_counts[starting_topic_index] += 1
+            self.current_word_topic_assignments.append(np.array(word_topic_assignments))
+
+        for iteration in range(self.iteration):
+            print("Iteration #" + str(iteration + 1) + "...")
+            for d_index, document in enumerate(self.corpus.Documents):
+                for w, word in enumerate(document.words_list):
+                    if word in self.corpus.vocabulary:
+                        w_index = self.corpus.vocabulary.index(word)
+                        current_topic_index = self.current_word_topic_assignments[d_index][w]
+                        self.document_topic_counts[d_index, current_topic_index] -= 1
+                        self.topic_word_counts[current_topic_index, w_index] -= 1
+                        self.topic_counts[current_topic_index] -= 1
+                        topic_distribution = (self.topic_word_counts[:, w_index] + self.beta) * \
+                                             (self.document_topic_counts[d_index] + self.alpha) / \
+                                             (self.topic_counts + len(self.corpus.vocabulary) * self.beta)
+                        new_topic_index = get_prob(range(self.number_of_topics), topic_distribution)
+                        self.current_word_topic_assignments[d_index][w] = new_topic_index
+                        self.document_topic_counts[d_index, new_topic_index] += 1
+                        self.topic_word_counts[new_topic_index, w_index] += 1
+                        self.topic_counts[new_topic_index] += 1
+        print("LDA training finished!\n")
+
+    def get_document_topic(self):
+        for d_index, topic in enumerate(np.argmax(self.document_topic_counts, axis=1)):
+            print("this is file {}, topic : #{}".format(self.doc_name.get(d_index), topic))
+
+    def get_word_topic(self, topN=10):
+        for row in (self.topic_word_counts.argsort(axis=1)[:, -topN:]):
+            print(list(map(lambda x: self.corpus.vocabulary[x], row)))
+
+
+if __name__ == "__main__":
+    filepath = "documents"
+    number_of_topics = 3
+    test = LdaModel(filepath, number_of_topics)
+    test.get_document_topic()
+    test.get_word_topic()
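For reviewers who want to exercise the patch locally, here is a minimal sketch of a smoke test. It assumes the new file has been saved under an importable name such as lda.py (the patch adds it without an extension), that jiagu and numpy are installed, and that a documents/ folder sits next to it; the folder name and sample texts below are illustrative only, not part of the patch.

    # minimal smoke test for the LDA module (all names below are illustrative)
    import os

    os.makedirs("documents", exist_ok=True)
    samples = {
        "sports.txt": "足球 比赛 球员 进球 教练 球队",
        "finance.txt": "股票 市场 投资 基金 收益 银行",
        "tech.txt": "人工智能 算法 模型 数据 训练 芯片",
        "travel.txt": "旅游 景点 酒店 机票 假期 攻略",
    }
    for name, text in samples.items():
        with open(os.path.join("documents", name), "w", encoding="utf-8") as f:
            f.write(text)

    from lda import LdaModel  # assumes the patched file was saved as lda.py

    # four documents with two topics, so the assert
    # number_of_documents > number_of_topics holds
    model = LdaModel("documents", number_of_topics=2, iteration=5)
    model.get_document_topic()      # dominant topic per document
    model.get_word_topic(topN=5)    # top-5 words per topic

Running the file directly as a script instead would hit the __main__ block, which uses the "documents" folder with 3 topics, so at least 4 .txt files are needed in that case as well.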