# -*- coding: utf-8 -*-
"""Segment a Chinese text file with jieba, with and without a custom lexicon.

Running this module segments ``./data-艺术.txt`` after registering the
keywords from ``./word-lib.txt`` and writes the result to
``./jieba-wordlib-cut.txt``.
"""
from flashtext import KeywordProcessor
import pandas as pd
import jieba


def _cut_file(src_path, dst_path):
    """Segment every line of *src_path* with jieba and write it to *dst_path*.

    Tokens of each input line are joined with single spaces and a newline is
    appended after each segmented line. The output file is only opened once
    the whole input has been read and segmented, so a failure while reading
    leaves any existing output file untouched.
    """
    with open(src_path, 'r', encoding='utf-8') as src:
        # Lines yielded by a file always contain at least the line terminator,
        # so no emptiness check is needed here.
        segmented = [" ".join(jieba.lcut(line)) + '\n' for line in src]

    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.write("".join(segmented))


def loadKeyWord():
    """Register every keyword listed in ``./word-lib.txt`` with jieba.

    The file is read as two comma-separated columns, ``index`` and
    ``keyword``; only the ``keyword`` column is used.
    """
    data = pd.read_table(
        './word-lib.txt',
        header=None,  # the file has no header row to import
        names=['index', 'keyword'],  # column names assigned here
        sep=',',  # fields are split on commas
        engine='python',
    )
    # Rows without a keyword field are parsed as NaN; skip them so the
    # literal string "nan" is not added to the dictionary.
    for keyword in data['keyword'].dropna():
        jieba.add_word(str(keyword))


def wordCut():
    """Segment ``./data-艺术.txt`` into ``./jieba-cut.txt``.

    Uses jieba as currently configured; no keywords are loaded here.
    """
    _cut_file("./data-艺术.txt", './jieba-cut.txt')


def wordLibCut():
    """Segment ``./data-艺术.txt`` into ``./jieba-wordlib-cut.txt``.

    The keywords from ``./word-lib.txt`` are registered with jieba first.
    """
    loadKeyWord()
    _cut_file("./data-艺术.txt", './jieba-wordlib-cut.txt')


# Call wordCut() here instead to produce the segmentation without the
# custom keyword list.
wordLibCut()