
New experiments with word segmentation and clustering

master · wangsheng · 3 years ago · commit 8e7721b6b5
4 changed files with 82 additions and 2 deletions

  1. 自然语言处理/中文分词/flashtext_cut.py (+1, -1)
  2. 自然语言处理/中文分词/jieba_cut.py (+49, -0)
  3. 自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py (+16, -1)
  4. 自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py (+16, -0)

自然语言处理/中文分词/flashtext_cut.py (+1, -1)

@@ -29,7 +29,7 @@ def wordCut():
         if len(keywords_found) >= 4:
             tempStr += ' '.join(keywords_found) + '\n'
 
-    fd = open('./word-cut.txt', 'w', encoding='utf-8')
+    fd = open('./flashtext-cut.txt', 'w', encoding='utf-8')
     fd.write(tempStr)
     fd.close()



自然语言处理/中文分词/jieba_cut.py (+49, -0)

@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+from flashtext import KeywordProcessor  # imported but not used in this script
+import pandas as pd
+import jieba
+
+def loadKeyWord():
+    data = pd.read_table('./word-lib.txt',
+                         header=None,                 # the source file has no header row
+                         names=['index', 'keyword'],  # custom column names
+                         sep=',',                     # word-lib.txt is comma-separated
+                         engine='python')
+    keywords = data['keyword']
+    for keyword in keywords:
+        jieba.add_word(str(keyword))
+
+def wordCut():
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+def wordLibCut():
+    # Register the custom word library before segmenting.
+    loadKeyWord()
+
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-wordlib-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+# wordCut()
+wordLibCut()
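Note: the point of loadKeyWord() is that jieba.add_word() keeps registered terms intact during segmentation. A minimal sketch of the effect (the sample term is a made-up stand-in for a word-lib.txt entry):

import jieba

text = '印象派画展下周开幕'
print(jieba.lcut(text))       # default segmentation may split the compound term
jieba.add_word('印象派画展')    # register a custom dictionary entry
print(jieba.lcut(text))       # the registered term now survives as one token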


自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py (+16, -1)

@@ -41,11 +41,13 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
     fopen.close()
     return json.loads(fileread)
+'''
 
 '''
 items = loadData("./dataset/data.json")
@@ -61,7 +63,20 @@ for item in items:
     if content is not None and len(content) > 0:
         docArr.append(content)
 '''
-docArr = loadData("./dataset/data.json")
+# docArr = loadData("./dataset/data.json")
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
 # pprint(docs[:10])  # preview the segmentation of the first ten documents; note each document has become a list of tokens
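Note: the rewritten loadData() swaps the JSON corpus for a plain-text file with one document per line; the segmented docs then feed the vectorizer. A hedged sketch of that downstream step, using scikit-learn's stock TfidfVectorizer as a stand-in for the script's NumberNormalizingVectorizer subclass (sample documents are made up):

import jieba
from sklearn.feature_extraction.text import TfidfVectorizer

docArr = ['城市雕塑的公共性', '水彩画的材料与技法', '民乐演出的现场体验']  # stand-ins for data-艺术.txt lines
docs = [' '.join(jieba.lcut(doc)) for doc in docArr]

vectorizer = TfidfVectorizer()      # the script subclasses this and filters stop words
X = vectorizer.fit_transform(docs)  # sparse document-term matrix for clustering
print(X.shape)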


自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py (+16, -0)

@@ -42,6 +42,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
@@ -49,6 +50,21 @@ def loadData(filepath):
     return json.loads(fileread)
 
 docArr = loadData("./dataset/data.json")
+'''
+
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 
 keys = []
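Note: the clustering code in main_word2vec_*.py is not shown in this diff, so the following is only an assumed sketch of a common word2vec-plus-KMeans pipeline over the same line-per-document corpus (gensim and scikit-learn; sample documents are made up):

import numpy as np
import jieba
from gensim.models import Word2Vec
from sklearn.cluster import KMeans

docArr = ['油画展览开幕', '水彩作品欣赏', '音乐会门票发售', '交响乐团巡演']  # stand-in lines
sentences = [jieba.lcut(doc) for doc in docArr]

# Train a small word2vec model, then average word vectors per document.
model = Word2Vec(sentences, vector_size=50, min_count=1, seed=1)

def doc_vector(tokens):
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

X = np.vstack([doc_vector(s) for s in sentences])
labels = KMeans(n_clusters=2, n_init=10).fit_predict(X)  # e.g. art vs. music
print(labels)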

