分词优化

3 years ago · 240016d1ff
--- a/自然语言处理/data_and_result/compare.py
+++ b/自然语言处理/data_and_result/compare.py
@@ -28,13 +28,13 @@ chaji=chaji[chaji==1]
 chaji.to_csv('./compare-chaji-result.txt', header = False)
 '''

 data1 = pd.read_table('./result-艺术-7字-标点符号混乱化.txt',
 data1 = pd.read_table('./result-艺术-7字.txt',
                      header=None,                    # 表示不要导入原文件内的表头
                      names=['keyword','count'],   #自定义列名
                       sep=',',                     # split columns on ','; the earlier note about a '::' separator did not match this call
                      engine= 'python')

 data2 = pd.read_table('./result-艺术-7字-去重.txt',
 data2 = pd.read_table('./result-所有-7字.txt',
                      header=None,                    # 表示不要导入原文件内的表头
                      names=['keyword','count'],   #自定义列名
                       sep=',',                     # split columns on ','; the earlier note about a '::' separator did not match this call
--- a/自然语言处理/中文分词/flashtext_cut.py
+++ b/自然语言处理/中文分词/flashtext_cut.py
@@ -6,7 +6,7 @@ import pandas as pd
 def loadKeyWord(keyword_processor):
    data = pd.read_table('./result.txt',
                      header=None,                    # 表示不要导入原文件内的表头
                      names=['keyword','count'],   #自定义列名
                      names=['index','keyword'],   #自定义列名
                       sep=',',                     # split columns on ','; the earlier note about a '::' separator did not match this call
                      engine= 'python')
    keywords = data['keyword']
--- a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
+++ b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
@@ -33,7 +33,9 @@ def isChinese(word):
 notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
 notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']
 def notStartEnd(word):
    if word[0] not in notStartChar and word[-1] not in notEndChar: 
    if (word[0] == '《' and word[-1] != '》') or (word[0] != '《' and word[-1] == '》'):
        return False
    elif word[0] not in notStartChar and word[-1] not in notEndChar: 
        return True
    return False