
New experiments with word segmentation and clustering

master · wangsheng · 3 years ago · commit 8e7721b6b5
4 changed files with 82 additions and 2 deletions

  1. 自然语言处理/中文分词/flashtext_cut.py (+1, -1)
  2. 自然语言处理/中文分词/jieba_cut.py (+49, -0)
  3. 自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py (+16, -1)
  4. 自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py (+16, -0)

自然语言处理/中文分词/flashtext_cut.py (+1, -1)

@@ -29,7 +29,7 @@ def wordCut():
         if len(keywords_found) >= 4:
             tempStr += ' '.join(keywords_found) + '\n'
 
-    fd = open('./word-cut.txt', 'w', encoding='utf-8')
+    fd = open('./flashtext-cut.txt', 'w', encoding='utf-8')
     fd.write(tempStr)
     fd.close()



自然语言处理/中文分词/jieba_cut.py (+49, -0)

@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+from flashtext import KeywordProcessor  # imported but not used in this script
+import pandas as pd
+import jieba
+
+def loadKeyWord():
+    data = pd.read_table('./word-lib.txt',
+                         header=None,                 # the source file has no header row
+                         names=['index', 'keyword'],  # custom column names
+                         sep=',',                     # word-lib.txt is comma-separated
+                         engine='python')
+    keywords = data['keyword']
+    for keyword in keywords:
+        jieba.add_word(str(keyword))
+
+def wordCut():
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+def wordLibCut():
+    # Register the custom word library before segmenting.
+    loadKeyWord()
+
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-wordlib-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+# wordCut()
+wordLibCut()
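Note: the point of loadKeyWord() is that jieba.add_word() keeps registered terms intact during segmentation. A minimal sketch of the effect (the sample term is a made-up stand-in for a word-lib.txt entry):

import jieba

text = '印象派画展下周开幕'
print(jieba.lcut(text))       # default segmentation may split the compound term
jieba.add_word('印象派画展')    # register a custom dictionary entry
print(jieba.lcut(text))       # the registered term now survives as one token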


自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py (+16, -1)

@@ -41,11 +41,13 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
     fopen.close()
     return json.loads(fileread)
+'''
 
 '''
 items = loadData("./dataset/data.json")
@@ -61,7 +63,20 @@ for item in items:
     if content is not None and len(content) > 0:
         docArr.append(content)
 '''
-docArr = loadData("./dataset/data.json")
+# docArr = loadData("./dataset/data.json")
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
 # pprint(docs[:10])  # preview the segmentation of the first ten documents; note each document has become a list of tokens
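Note: the rewritten loadData() swaps the JSON corpus for a plain-text file with one document per line; the segmented docs then feed the vectorizer. A hedged sketch of that downstream step, using scikit-learn's stock TfidfVectorizer as a stand-in for the script's NumberNormalizingVectorizer subclass (sample documents are made up):

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docArr = ['城市雕塑的公共性', '水彩画的材料与技法', '民乐演出的现场体验']  # stand-ins for data-艺术.txt lines
docs = [' '.join(jieba.lcut(doc)) for doc in docArr]

vectorizer = TfidfVectorizer()      # the script subclasses this and filters stop words
X = vectorizer.fit_transform(docs)  # sparse document-term matrix for clustering
print(X.shape)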


自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py (+16, -0)

@@ -42,6 +42,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
@@ -49,6 +50,21 @@ def loadData(filepath):
     return json.loads(fileread)
 
 docArr = loadData("./dataset/data.json")
+'''
+
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 
 keys = []
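Note: the clustering code in main_word2vec_*.py is not shown in this diff, so the following is only an assumed sketch of a common word2vec-plus-KMeans pipeline over the same line-per-document corpus (gensim and scikit-learn; sample documents are made up):

import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

docArr = ['油画展览开幕', '水彩作品欣赏', '音乐会门票发售', '交响乐团巡演']  # stand-in lines
sentences = [jieba.lcut(doc) for doc in docArr]

# Train a small word2vec model, then average word vectors per document.
model = Word2Vec(sentences, vector_size=50, min_count=1, seed=1)

def doc_vector(tokens):
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

X = np.vstack([doc_vector(s) for s in sentences])
labels = KMeans(n_clusters=2, n_init=10).fit_predict(X)  # e.g. art vs. music
print(labels)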

