You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

jieba_cut.py 1.3 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849
  1. # -*- coding: utf-8 -*-
  2. from flashtext import KeywordProcessor
  3. import pandas as pd
  4. import jieba
  def loadKeyWord():
      """Register every keyword listed in ``./word-lib.txt`` with jieba.

      The file is read as comma-separated text with no header row; its two
      columns are named ``index`` and ``keyword``.  Each non-missing value in
      the ``keyword`` column is passed to ``jieba.add_word`` as a string.
      """
      data = pd.read_table(
          './word-lib.txt',
          header=None,  # the file has no header row to import
          names=['index', 'keyword'],  # column names assigned here
          sep=',',  # fields are comma-separated
          engine='python',
      )
      # A row without a keyword field is parsed as NaN; str(NaN) would register
      # the literal word "nan" with jieba, so such rows are dropped first.
      for keyword in data['keyword'].dropna():
          jieba.add_word(str(keyword))
  def _segment_file(src_path, dst_path):
      """Segment a UTF-8 text file line by line with jieba.

      Each non-empty line of ``src_path`` is split with ``jieba.lcut`` and
      written to ``dst_path`` as space-separated tokens, one output line per
      input line.  The output file is only opened after the whole input has
      been segmented, so a failure while reading or segmenting leaves any
      existing output untouched.
      """
      with open(src_path, 'r', encoding='utf-8') as src:
          segmented = [
              " ".join(jieba.lcut(text)) + '\n'
              # Strip the line terminator before segmenting: otherwise jieba
              # returns "\n" as a token and every output line ends in " \n\n".
              for text in (line.rstrip('\n') for line in src)
              if text  # skip blank lines
          ]
      with open(dst_path, 'w', encoding='utf-8') as dst:
          dst.writelines(segmented)


  def wordCut():
      """Segment ``./data-艺术.txt`` with jieba's current dictionary.

      The result is written to ``./jieba-cut.txt``.
      """
      _segment_file("./data-艺术.txt", './jieba-cut.txt')


  def wordLibCut():
      """Segment ``./data-艺术.txt`` after loading the custom keyword list.

      ``loadKeyWord()`` is called first so the keywords are registered with
      jieba before segmentation.  The result is written to
      ``./jieba-wordlib-cut.txt``.
      """
      loadKeyWord()
      _segment_file("./data-艺术.txt", './jieba-wordlib-cut.txt')
  # Script entry: run the dictionary-assisted segmentation.  The run without
  # the custom keyword list is left disabled; uncomment it to produce both
  # output files.
  # wordCut()
  wordLibCut()