@@ -47,6 +47,7 @@ def loadData(filepath):
    fopen.close()
    return json.loads(fileread)

'''
items = loadData("./dataset/data.json")

docArr = []

@@ -54,25 +55,49 @@ for item in items:
    title = item.get("title")
    content = item.get("content")
'''
    if title is not None and len(title) > 0:
        docArr.append(title)
'''
    # if title is not None and len(title) > 0:
    #     docArr.append(title)
    if content is not None and len(content) > 0:
        docArr.append(content)
'''

def loadData(filepath):
    # Read the corpus line by line, skipping blank lines.
    # (The original len(line) > 0 test never filtered anything, because each
    # line still carries its trailing newline; strip it first.)
    arr = []
    with open(filepath, 'r', encoding='utf-8') as fopen:
        for line in fopen:
            line = line.strip()
            if len(line) > 0:
                arr.append(line)
    return arr

docArr = loadData("./data-艺术.txt")

docs = [" ".join(jieba.lcut(doc)) for doc in docArr]
# pprint(docs[:10])  # Show the segmentation result for the first ten documents; note that each document is now a single string of space-separated tokens
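
# For reference, jieba.lcut returns a plain Python list of tokens; a tiny
# illustrative check (actual output depends on jieba's dictionary):
# print(jieba.lcut("我爱艺术"))  # e.g. ['我', '爱', '艺术']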

print("%d documents" % len(docs))
print()

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence

import gensim

sentences = [[word for word in document.strip().split() if word not in stwlist] for document in docs]  # filter stopwords out of each sentence
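
# Note: stwlist (the stopword list) is assumed to be defined earlier in the
# full script. A minimal sketch of building it from a one-word-per-line
# stopword file (hypothetical path) could look like:
# with open('./stopwords.txt', 'r', encoding='utf-8') as f:
#     stwlist = set(line.strip() for line in f)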

'''
from smart_open import smart_open

class MySentences(object):
    def __init__(self, filename):
        self.filename = filename

    def __iter__(self):
        for line in smart_open(self.filename, 'r', encoding='utf-8'):
            line = line.lower()  # lowercase any English words in the line
            yield [i.strip() for i in jieba.lcut(line) if i not in stwlist and len(i) > 1]  # segment each line as it is read, dropping stopwords and single-character tokens

sentences = MySentences('./data-艺术.txt')  # memory-friendly iterator
'''
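
# If the corpus file were already whitespace-tokenized, one sentence per line,
# gensim's built-in streaming reader would be an alternative to the iterator
# sketched above ('./data-艺术-seg.txt' is a hypothetical pre-segmented file):
# from gensim.models.word2vec import LineSentence
# sentences = LineSentence('./data-艺术-seg.txt')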

import gensim

# Train a word2vec model on these sentences
model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5, workers=2)
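
# A quick sanity check on the trained model (a sketch, not from the original;
# "艺术" is only an example query and may be absent from the vocabulary if
# min_count filtered it out):
# if "艺术" in model.wv:
#     print(model.wv.most_similar("艺术", topn=5))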

@@ -82,7 +107,7 @@ model = gensim.models.Word2Vec(sentences, vector_size=200, window=5, min_count=5

# Get all the keywords (the vocabulary) stored in the model
keys = model.wv.index_to_key
print(keys)
# print(keys)
print(len(keys))

# Get the word vector corresponding to a word
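# The original section ends at the comment above; a minimal sketch of the
# lookup itself (an assumption, not the author's code), using the first
# vocabulary entry as an example word:
vec = model.wv[keys[0]]
print(vec.shape)  # (200,) given vector_size=200 above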