|
|
@@ -0,0 +1,49 @@ |
|
|
|
# -*- coding: utf-8 -*- |
|
|
|
|
|
|
|
from flashtext import KeywordProcessor |
|
|
|
import pandas as pd |
|
|
|
import jieba |
|
|
|
|
|
|
|
def loadKeyWord(path='./word-lib.txt'):
    """Register every keyword from a word-library file with jieba.

    The file is parsed as comma-separated text with no header row, and its
    columns are named ``index`` and ``keyword``. Each non-missing value of
    the ``keyword`` column is converted to ``str`` and passed to
    ``jieba.add_word``.

    Args:
        path: Location of the word-library file. Defaults to
            ``./word-lib.txt``, the path this function previously
            hard-coded.
    """
    data = pd.read_table(
        path,
        header=None,                 # do not treat the first row as a header
        names=['index', 'keyword'],  # column names assigned on load
        sep=',',                     # fields are comma-separated
        engine='python',
    )

    # A row with a missing keyword field is parsed as NaN; skip those rows so
    # the literal string "nan" is never registered as a dictionary word.
    for keyword in data['keyword'].dropna():
        jieba.add_word(str(keyword))
|
|
|
|
|
|
|
def wordCut(src="./data-艺术.txt", dst='./jieba-cut.txt'):
    """Segment a UTF-8 text file with jieba and write the result to a file.

    Every input line is tokenised with ``jieba.lcut``; its tokens are joined
    with single spaces and a newline is appended. The segmented lines are
    then written, in order, to the output file (UTF-8, overwritten).

    Args:
        src: Path of the text file to segment. Defaults to the path this
            function previously hard-coded.
        dst: Path of the file that receives the segmented text. Defaults to
            the path this function previously hard-coded.
    """
    # Context managers guarantee both handles are closed even when reading,
    # segmentation, or writing raises.
    with open(src, 'r', encoding='utf-8') as source:
        # Lines are handed to jieba exactly as read (not stripped) so the
        # output stays identical to what earlier runs produced. Lines yielded
        # by a file are never empty, so no emptiness check is needed.
        segmented = [" ".join(jieba.lcut(line)) + '\n' for line in source]

    # The output file is opened only after the whole input was processed, so
    # a failure above never truncates an existing result file.
    with open(dst, 'w', encoding='utf-8') as target:
        target.writelines(segmented)
|
|
|
|
|
|
|
def wordLibCut(src="./data-艺术.txt", dst='./jieba-wordlib-cut.txt'):
    """Segment a UTF-8 text file with jieba after loading the keyword library.

    ``loadKeyWord()`` is called first, so the custom keywords are registered
    with jieba before any line is segmented. Every input line is then
    tokenised with ``jieba.lcut``; its tokens are joined with single spaces
    and a newline is appended. The segmented lines are written, in order, to
    the output file (UTF-8, overwritten).

    Args:
        src: Path of the text file to segment. Defaults to the path this
            function previously hard-coded.
        dst: Path of the file that receives the segmented text. Defaults to
            the path this function previously hard-coded.
    """
    loadKeyWord()

    # Context managers guarantee both handles are closed even when reading,
    # segmentation, or writing raises.
    with open(src, 'r', encoding='utf-8') as source:
        # Lines are handed to jieba exactly as read (not stripped) so the
        # output stays identical to what earlier runs produced. Lines yielded
        # by a file are never empty, so no emptiness check is needed.
        segmented = [" ".join(jieba.lcut(line)) + '\n' for line in source]

    # The output file is opened only after the whole input was processed, so
    # a failure above never truncates an existing result file.
    with open(dst, 'w', encoding='utf-8') as target:
        target.writelines(segmented)
|
|
|
|
|
|
|
# Script entry: run the segmentation that first loads the custom keyword
# library. Call wordCut() here instead to segment without that library.
wordLibCut()
|
|
|
|