diff --git a/自然语言处理/中文分词/flashtext_cut.py b/自然语言处理/中文分词/flashtext_cut.py
index 520e448..723863b 100644
--- a/自然语言处理/中文分词/flashtext_cut.py
+++ b/自然语言处理/中文分词/flashtext_cut.py
@@ -29,7 +29,7 @@ def wordCut():
         if len(keywords_found) >= 4:
             tempStr += ' '.join(keywords_found) + '\n'
 
-    fd = open('./word-cut.txt', 'w', encoding='utf-8')
+    fd = open('./flashtext-cut.txt', 'w', encoding='utf-8')
     fd.write(tempStr)
     fd.close()
 
diff --git a/自然语言处理/中文分词/jieba_cut.py b/自然语言处理/中文分词/jieba_cut.py
new file mode 100644
index 0000000..097a995
--- /dev/null
+++ b/自然语言处理/中文分词/jieba_cut.py
@@ -0,0 +1,49 @@
+# -*- coding: utf-8 -*-
+
+from flashtext import KeywordProcessor
+import pandas as pd
+import jieba
+
+def loadKeyWord():
+    data = pd.read_table('./word-lib.txt',
+                         header=None,                  # the source file has no header row
+                         names=['index', 'keyword'],   # assign custom column names
+                         sep=',',                      # the source file is comma-separated
+                         engine='python')
+    keywords = data['keyword']
+    for keyword in keywords:
+        jieba.add_word(str(keyword))
+
+def wordCut():
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+def wordLibCut():
+    loadKeyWord()
+
+    fopen = open("./data-艺术.txt", 'r', encoding='utf-8')
+
+    tempStr = ''
+    for line in fopen.readlines():
+        if len(line) > 0:
+            tempStr += " ".join(jieba.lcut(line)) + '\n'
+
+    fopen.close()
+
+    fd = open('./jieba-wordlib-cut.txt', 'w', encoding='utf-8')
+    fd.write(tempStr)
+    fd.close()
+
+# wordCut()
+wordLibCut()
+
diff --git a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py
index 8ecbeea..c074be8 100644
--- a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py
+++ b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_2.py
@@ -41,11 +41,13 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
     fopen.close()
     return json.loads(fileread)
+'''
 
 '''
 items = loadData("./dataset/data.json")
@@ -61,7 +63,20 @@ for item in items:
     if content is not None and len(content) > 0:
         docArr.append(content)
 '''
-docArr = loadData("./dataset/data.json")
+# docArr = loadData("./dataset/data.json")
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
 # pprint(docs[:10])  # preview the segmentation of the first ten documents; note that each document becomes a list of tokens
diff --git a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py
index 70db6b8..3ccc39d 100644
--- a/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py
+++ b/自然语言处理/文本聚类与关键字抽取/文本聚类/main_word2vec_3.py
@@ -42,6 +42,7 @@ class NumberNormalizingVectorizer(TfidfVectorizer):
 
 stwlist=[line.strip() for line in open('dataset/stopwords_zh.txt', 'r',encoding='utf-8').readlines()]
 
+'''
 def loadData(filepath):
     fopen = open(filepath, 'r', encoding='utf-8')
     fileread = fopen.read()
@@ -49,6 +50,21 @@ def loadData(filepath):
     return json.loads(fileread)
 
 docArr = loadData("./dataset/data.json")
+'''
+
+
+def loadData(filepath):
+    fopen = open(filepath, 'r', encoding='utf-8')
+
+    arr = []
+    for line in fopen.readlines():
+        if len(line) > 0:
+            arr.append(line)
+
+    fopen.close()
+    return arr
+
+docArr = loadData("./data-艺术.txt")
 
 
 keys = []
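
Note (not part of the patch): a minimal sketch of the behaviour wordLibCut() relies on. loadKeyWord() simply calls jieba.add_word() once per entry in word-lib.txt; the sample sentence and the term "元宇宙艺术展" below are illustrative assumptions, not taken from the repository's data.

# -*- coding: utf-8 -*-
import jieba

sentence = "美术馆推出元宇宙艺术展"       # sample sentence; "元宇宙艺术展" is an assumed domain term

print(" ".join(jieba.lcut(sentence)))    # with only the default dictionary, the term is usually split apart

jieba.add_word("元宇宙艺术展")           # the same call loadKeyWord() issues for every keyword in word-lib.txt
print(" ".join(jieba.lcut(sentence)))    # the term should now come out as a single token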