|
- # -*- coding: utf-8 -*-
-
- from flashtext import KeywordProcessor
- import pandas as pd
- import jieba
-
def loadKeyWord():
    """Add every keyword listed in ``./word-lib.txt`` to jieba's dictionary.

    The file is read as header-less, comma-separated rows whose two columns
    are named ``index`` and ``keyword``. Only the ``keyword`` column is used.

    Returns:
        None. The effect is the calls to ``jieba.add_word``.
    """
    data = pd.read_table(
        './word-lib.txt',
        header=None,                 # the file has no header row
        names=['index', 'keyword'],  # column names assigned here
        sep=',',                     # fields are comma-separated
        # Keep keywords as text: without this, numeric-looking entries are
        # type-inferred and e.g. "007" would be registered as "7".
        dtype={'keyword': str},
        engine='python',
    )
    # Rows with no keyword parse as NaN; skip them so the literal string
    # "nan" is never added to the dictionary.
    for keyword in data['keyword'].dropna():
        jieba.add_word(keyword)
-
def _cut_file(src_path, dst_path):
    """Segment each line of ``src_path`` with jieba and write it to ``dst_path``.

    Every input line (trailing newline included) is passed to ``jieba.lcut``;
    the resulting tokens are joined with single spaces and a newline is
    appended. Both files are opened as UTF-8.

    Args:
        src_path: Path of the text file to segment.
        dst_path: Path of the file that receives the segmented text; it is
            overwritten if it already exists.
    """
    # Read and segment everything first, so the output file is only created
    # once the input has been processed successfully.
    with open(src_path, 'r', encoding='utf-8') as src:
        # Lines yielded by a file always contain at least their newline, so
        # no emptiness check is needed here.
        segmented = [" ".join(jieba.lcut(line)) + '\n' for line in src]

    with open(dst_path, 'w', encoding='utf-8') as dst:
        dst.writelines(segmented)


def wordCut():
    """Segment ``./data-艺术.txt`` into ``./jieba-cut.txt``.

    Uses jieba in whatever state it currently is; this function does not
    load the keyword library itself.
    """
    _cut_file("./data-艺术.txt", './jieba-cut.txt')


def wordLibCut():
    """Segment ``./data-艺术.txt`` into ``./jieba-wordlib-cut.txt``.

    Calls ``loadKeyWord()`` first, so the keywords from the word library are
    added to jieba before segmentation.
    """
    loadKeyWord()
    _cut_file("./data-艺术.txt", './jieba-wordlib-cut.txt')
-
# Script entry: only the pass that first loads the keyword library is run.
# The plain pass is kept disabled; uncomment the next line to run it as well.
# wordCut()
wordLibCut()
|