
Update the text clustering methods

master
wangsheng, 3 years ago
commit a3f240d07d
2 changed files with 52 additions and 13 deletions

  1. +19 -5  自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py
  2. +33 -8  自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py

+19 -5  自然语言处理/文本聚类与关键字抽取/文本聚类/main_tfidf.py

@@ -72,6 +72,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):

stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]

'''
def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')
    fileread = fopen.read()
@@ -85,13 +86,25 @@ for item in items:
    title = item.get("title")
    content = item.get("content")

    '''
    if title is not None and len(title) > 0:
        docArr.append(title)
    '''
    # if title is not None and len(title) > 0:
    # docArr.append(title)

    if content is not None and len(content) > 0:
        docArr.append(content)
'''

def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')

    arr = []
    for line in fopen.readlines():
        if len(line) > 0:
            arr.append(line)

    fopen.close()
    return arr

docArr = loadData("./data-艺术.txt")

docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
# pprint(docs[:10])  # show the segmentation results for the first ten articles; note that each article becomes a sequence of tokens
@@ -190,6 +203,7 @@ km.fit(X)
print("完成所耗费时间:%0.3fs" % (time() - t0))
print()

'''
print("Homogeneity值: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness值: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure值: %0.3f" % metrics.v_measure_score(labels, km.labels_))
@@ -197,8 +211,8 @@ print("Adjusted Rand-Index值: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient值: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

print()
'''

# Use the trained clustering model to infer the cluster (topic) each document belongs to
label_prediction = km.predict(X)
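
For orientation, here is a minimal sketch of the TF-IDF + KMeans flow that main_tfidf.py builds up around these hunks. It reuses the file's loadData and stwlist, but substitutes a plain TfidfVectorizer for the file's NumberNormalizingVectorizer subclass, and the cluster count is an illustrative assumption rather than a value taken from the commit.

# Sketch only (not the committed code): TF-IDF vectors over the line-per-document corpus, then KMeans.
# Assumptions: plain TfidfVectorizer instead of NumberNormalizingVectorizer, and n_clusters=5.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

raw_docs = loadData("./data-艺术.txt")                     # one document per line
docs = [" ".join(jieba.lcut(doc)) for doc in raw_docs]     # space-joined tokens for the vectorizer
vectorizer = TfidfVectorizer(stop_words=stwlist)           # stwlist comes from dataset/stopwords_zh.txt
X = vectorizer.fit_transform(docs)                         # sparse matrix of shape (n_docs, n_terms)
km = KMeans(n_clusters=5)                                  # assumed cluster count
km.fit(X)
label_prediction = km.predict(X)                           # cluster id for every document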


+33 -8  自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_1.py

@@ -47,6 +47,7 @@ def loadData(filepath):
    fopen.close()
    return json.loads(fileread)

'''
items = loadData("./dataset/data.json")

docArr = []
@@ -54,25 +55,49 @@ for item in items:
    title = item.get("title")
    content = item.get("content")

    '''
    if title is not None and len(title) > 0:
        docArr.append(title)
    '''
    # if title is not None and len(title) > 0:
    # docArr.append(title)

    if content is not None and len(content) > 0:
        docArr.append(content)
'''

def loadData(filepath):
    fopen = open(filepath, 'r', encoding='utf-8')

    arr = []
    for line in fopen.readlines():
        if len(line) > 0:
            arr.append(line)

    fopen.close()
    return arr

docArr = loadData("./data-艺术.txt")

docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
# pprint(docs[:10])  # show the segmentation results for the first ten articles; note that each article becomes a sequence of tokens


print("%d 个文档" % len(docs))
print()

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence

import gensim

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence
'''
from smart_open import smart_open
class MySentences(object):
    def __init__(self, filename):
        self.filename = filename
    def __iter__(self):
        for line in smart_open(self.filename, 'r', encoding='utf-8'):
            line = line.lower()  # lowercase any English words in the line
            yield [i.strip() for i in jieba.lcut(line) if i not in stwlist and len(i) > 1]  # segment each line as it is read, dropping stopwords and single-character tokens

sentences = MySentences('./data-艺术.txt')  # memory-friendly iterator
'''

import gensim

# Train a word2vec model on these sentences
model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=2)
@@ -82,7 +107,7 @@ model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5

# Get all the vocabulary words contained in the model
keys=model.wv.index_to_key
print(keys)
# print(keys)
print(len(keys))

# Get the word vector corresponding to each word
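
As a quick usage note, the trained model can be queried through gensim 4.x's KeyedVectors interface (model.wv). The sketch below only illustrates the API; the word picked for the query (keys[0]) is an arbitrary choice, not something taken from the commit.

# Sketch: inspecting the trained Word2Vec model via model.wv (gensim 4.x API).
keys = model.wv.index_to_key                    # vocabulary, ordered by frequency
vec = model.wv[keys[0]]                         # 200-dimensional vector (vector_size=200 above)
print(vec.shape)                                # (200,)
print(model.wv.most_similar(keys[0], topn=5))   # five nearest words by cosine similarity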

