diff --git a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py
index 809d8de..bfa2219 100644
--- a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py
+++ b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py
@@ -72,6 +72,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
@@ -85,13 +86,25 @@ for item in items:
     title = item.get("title")
     content = item.get("content")
 
-    '''
-    if title is not None and len(title) > 0:
-        docArr.append(title)
-    '''
+    # if title is not None and len(title) > 0:
+    #     docArr.append(title)
 
     if content is not None and len(content) > 0:
         docArr.append(content)
+'''
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
 # pprint(docs[:10])  # 展示靠前的十篇文章的分词效果,注意,每篇文章变成了有一连串词汇组成的list(列表)
@@ -190,6 +203,7 @@ km.fit(X)
 print("完成所耗费时间:%0.3fs" % (time() - t0))
 print()
 
+'''
 print("Homogeneity值: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
 print("Completeness值: %0.3f" % metrics.completeness_score(labels, km.labels_))
 print("V-measure值: %0.3f" % metrics.v_measure_score(labels, km.labels_))
@@ -197,8 +211,8 @@ print("Adjusted Rand-Index值: %.3f"
       % metrics.adjusted_rand_score(labels, km.labels_))
 print("Silhouette Coefficient值: %0.3f"
       % metrics.silhouette_score(X, km.labels_, sample_size=1000))
-
 print()
+'''
 
 #用训练好的聚类模型反推文档的所属的主题类别
 label_prediction = km.predict(X)
diff --git a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py
index 552254a..0662cfe 100644
--- a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py
+++ b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py
@@ -47,6 +47,7 @@ def loadData(filepath):
     fopen.close()
     return json.loads(fileread)
 
+'''
 items = loadData("./dataset/data.json")
 
 docArr = []
@@ -54,25 +55,49 @@ for item in items:
     title = item.get("title")
     content = item.get("content")
 
-    '''
-    if title is not None and len(title) > 0:
-        docArr.append(title)
-    '''
+    # if title is not None and len(title) > 0:
+    #     docArr.append(title)
 
     if content is not None and len(content) > 0:
         docArr.append(content)
+'''
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
 # pprint(docs[:10])  # 展示靠前的十篇文章的分词效果,注意,每篇文章变成了有一连串词汇组成的list(列表)
-
 print("%d 个文档" % len(docs))
 print()
 
+sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # 过滤语句中的停用词
 
 
-import gensim
-sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # 过滤语句中的停用词
+'''
+from smart_open import smart_open
+class MySentences(object):
+    def __init__(self, filename):
+        self.filename = filename
+    def __iter__(self):
+        for line in smart_open(self.filename, 'r', encoding='utf-8'):
+            line = line.lower()  #对每一行文本中的英文词汇小写化
+            yield [i.strip() for i in jieba.lcut(line) if i not in stwlist and len(i) > 1]  #在载入文本的同时,对其中的语句进行分词处理,且去掉停用词和长度小于1的语句
+
+sentences = MySentences('./data-艺术.txt')  # 内存友好的迭代器
+'''
+
+import gensim
 
 
 # 在这些语句上训练word2vec模型
 model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=2)
@@ -82,7 +107,7 @@ model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5
 
 #获取model里面的说有关键词
 keys=model.wv.index_to_key
-print(keys)
+# print(keys)
 print(len(keys))
 
 #获取词对于的词向量
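Note: the patch keeps the smart_open-based MySentences iterator commented out and trains Word2Vec on the in-memory sentences list instead. For reference, below is a minimal, self-contained sketch of the streaming variant under the same assumptions as the diff (a one-document-per-line corpus at ./data-艺术.txt, the dataset/stopwords_zh.txt stopword list, and the same hyperparameters); the SentenceStream and load_stopwords names are illustrative and not part of the repository.

```python
# Hypothetical sketch of a memory-friendly word2vec pipeline, mirroring the
# commented-out MySentences block in the diff. File paths and hyperparameters
# are assumptions taken from the patch, not a confirmed final script.
import jieba
from gensim.models import Word2Vec


def load_stopwords(path):
    # One stopword per line, matching how stwlist is built in the scripts.
    with open(path, 'r', encoding='utf-8') as f:
        return set(line.strip() for line in f)


class SentenceStream:
    """Yield one tokenized document per line without loading the whole corpus."""

    def __init__(self, filename, stopwords):
        self.filename = filename
        self.stopwords = stopwords

    def __iter__(self):
        with open(self.filename, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip().lower()  # lowercase any English tokens
                if not line:
                    continue
                # Tokenize with jieba; drop stopwords and single-character tokens.
                yield [tok for tok in jieba.lcut(line)
                       if tok not in self.stopwords and len(tok) > 1]


if __name__ == '__main__':
    stopwords = load_stopwords('dataset/stopwords_zh.txt')
    sentences = SentenceStream('./data-艺术.txt', stopwords)
    # Same hyperparameters as the in-memory version in the diff.
    model = Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=2)
    print(len(model.wv.index_to_key))
```

Using a class with __iter__ rather than a plain generator matters here: gensim walks the corpus once to build the vocabulary and again for each training epoch, so the corpus object must be re-iterable.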