
Update the text clustering methods

master
wangsheng, 3 years ago
commit a3f240d07d
2 changed files with 52 additions and 13 deletions

  1. +19 -5  自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py
  2. +33 -8  自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py

+19 -5  自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py

@@ -72,6 +72,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):

stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]

'''
def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')
    fileread = fopen.read()
@@ -85,13 +86,25 @@ for item in items:
    title = item.get("title")
    content = item.get("content")

    '''
    if title is not None and len(title) > 0:
        docArr.append(title)
    '''
    # if title is not None and len(title) > 0:
    # docArr.append(title)

    if content is not None and len(content) > 0:
        docArr.append(content)
'''

def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')

    arr = []
    for line in fopen.readlines():
        if len(line) > 0:
            arr.append(line)

    fopen.close()
    return arr

docArr = loadData("./data-艺术.txt")

docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
# pprint(docs[:10])  # show the segmentation results for the first ten articles; note that each article becomes a sequence of tokens
@@ -190,6 +203,7 @@ km.fit(X)
print("完成所耗费时间:%0.3fs" % (time() - t0))
print()

'''
print("Homogeneity值: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness值: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure值: %0.3f" % metrics.v_measure_score(labels, km.labels_))
@@ -197,8 +211,8 @@ print("Adjusted Rand-Index值: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient值: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()
'''

# Use the trained clustering model to infer the cluster (topic) each document belongs to
label_prediction = km.predict(X)
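
For orientation, here is a minimal sketch of the TF-IDF + KMeans flow that main_tfidf.py builds up around these hunks. It reuses the file's loadData and stwlist, but substitutes a plain TfidfVectorizer for the file's NumberNormalizingVectorizer subclass, and the cluster count is an illustrative assumption rather than a value taken from the commit.

# Sketch only (not the committed code): TF-IDF vectors over the line-per-document corpus, then KMeans.
# Assumptions: plain TfidfVectorizer instead of NumberNormalizingVectorizer, and n_clusters=5.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

raw_docs = loadData("./data-艺术.txt")                     # one document per line
docs = [" ".join(jieba.lcut(doc)) for doc in raw_docs]     # space-joined tokens for the vectorizer
vectorizer = TfidfVectorizer(stop_words=stwlist)           # stwlist comes from dataset/stopwords_zh.txt
X = vectorizer.fit_transform(docs)                         # sparse matrix of shape (n_docs, n_terms)
km = KMeans(n_clusters=5)                                  # assumed cluster count
km.fit(X)
label_prediction = km.predict(X)                           # cluster id for every document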


+33 -8  自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py

@@ -47,6 +47,7 @@ def loadData(filepath):
    fopen.close()
    return json.loads(fileread)

'''
items = loadData("./dataset/data.json")

docArr = []
@@ -54,25 +55,49 @@ for item in items:
    title = item.get("title")
    content = item.get("content")

    '''
    if title is not None and len(title) > 0:
        docArr.append(title)
    '''
    # if title is not None and len(title) > 0:
    # docArr.append(title)

    if content is not None and len(content) > 0:
        docArr.append(content)
'''

def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')

    arr = []
    for line in fopen.readlines():
        if len(line) > 0:
            arr.append(line)

    fopen.close()
    return arr

docArr = loadData("./data-艺术.txt")

docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
# pprint(docs[:10])  # show the segmentation results for the first ten articles; note that each article becomes a sequence of tokens


print("%d 个文档" % len(docs))
print()

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence

import gensim

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence
'''
from smart_open import smart_open
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for line in smart_open(self.filename, 'r', encoding='utf-8'):
            line = line.lower()  # lowercase any English words in the line
            yield [i.strip() for i in jieba.lcut(line) if i not in stwlist and len(i) > 1]  # segment each line as it is read, dropping stopwords and single-character tokens

sentences = MySentences('./data-艺术.txt')  # memory-friendly iterator
'''

import gensim

# Train a word2vec model on these sentences
model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=2)
@@ -82,7 +107,7 @@ model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5

# Get all the vocabulary words contained in the model
keys=model.wv.index_to_key
print(keys)
# print(keys)
print(len(keys))

# Get the word vector corresponding to each word
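
As a quick usage note, the trained model can be queried through gensim 4.x's KeyedVectors interface (model.wv). The sketch below only illustrates the API; the word picked for the query (keys[0]) is an arbitrary choice, not something taken from the commit.

# Sketch: inspecting the trained Word2Vec model via model.wv (gensim 4.x API).
keys = model.wv.index_to_key                    # vocabulary, ordered by frequency
vec = model.wv[keys[0]]                         # 200-dimensional vector (vector_size=200 above)
print(vec.shape)                                # (200,)
print(model.wv.most_similar(keys[0], topn=5))   # five nearest words by cosine similarity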

