compare.py、filter.py上传

3 years ago · f785b4aa97
--- a/自然语言处理/data_and_result/compare.py
+++ b/自然语言处理/data_and_result/compare.py
@@ -0,0 +1,59 @@

 import pandas as pd

 # NOTE: the triple-quoted block below is an inert string literal and is never
 # executed. It preserves an earlier variant of this script that compared
 # './result-艺术-4字.txt' with './result-艺术-7字.txt' on both keyword and count.
 '''
 data1 = pd.read_table('./result-艺术-4字.txt',
                      header=None,                    # 表示不要导入原文件内的表头
                      names=['keyword','count'],   #自定义列名
                      sep=',',                     # 原文件的分隔符是'::'，此处是按此分隔符将数据导入
                      engine= 'python',
                      index_col = 'keyword')

 data2 = pd.read_table('./result-艺术-7字.txt',
                      header=None,                    # 表示不要导入原文件内的表头
                      names=['keyword','count'],   #自定义列名
                      sep=',',                     # 原文件的分隔符是'::'，此处是按此分隔符将数据导入
                      engine= 'python',
                      index_col = 'keyword')

 jiaoji = pd.merge(data1, data2, on=['keyword','count'])
 jiaoji.to_csv('./compare-jiaoji-result.txt', header = False)

 bingji = pd.merge(data1, data2, on=['keyword','count'], how='outer')
 bingji.to_csv('./compare-bingji-result.txt', header = False)

 chaji=data1.append(data2)
 chaji=pd.Series(chaji.index).value_counts()
 chaji=chaji[chaji==1]
 chaji.to_csv('./compare-chaji-result.txt', header = False)
 '''

 # Load the two keyword lists to compare. Each input line is parsed as a
 # comma-separated "keyword,count" record.
 data1 = pd.read_table('./result-艺术-7字-标点符号混乱化.txt',
                       header=None,                 # do not treat the first row as a header
                       names=['keyword', 'count'],  # custom column names
                       sep=',',                     # split each line on commas
                       engine='python')

 data2 = pd.read_table('./result-艺术-7字-去重.txt',
                       header=None,                 # do not treat the first row as a header
                       names=['keyword', 'count'],  # custom column names
                       sep=',',                     # split each line on commas
                       engine='python')

 # Only the keywords are compared from here on; the counts are discarded.
 del data1['count']
 del data2['count']

 # jiaoji (intersection): inner join, i.e. keywords present in both inputs.
 # to_csv keeps its default index=True, so the row index is the first column.
 jiaoji = pd.merge(data1, data2, on=['keyword'])
 jiaoji.to_csv('./compare-jiaoji-result.txt', header=False)

 # bingji (union): outer join, i.e. keywords present in either input.
 bingji = pd.merge(data1, data2, on=['keyword'], how='outer')
 bingji.to_csv('./compare-bingji-result.txt', header=False)

 # chaji (difference), first pass: stack both inputs and keep only keywords
 # that occur exactly once overall. keep=False drops every copy of a repeated
 # keyword, so a keyword repeated inside a single input is dropped as well.
 # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
 chaji = pd.concat([data2, data1])
 chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
 chaji.to_csv('./compare-chaji-result.txt', header=False)


 # chaji, second pass: stack the intersection on top of data1 and again keep
 # keywords that occur exactly once, which leaves keywords found only in data1.
 # NOTE(review): this writes to the same path as the first pass, so only this
 # second result survives on disk. Confirm that the overwrite is intended.
 chaji = pd.concat([jiaoji, data1])
 chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
 chaji.to_csv('./compare-chaji-result.txt', header=False)
--- a/自然语言处理/data_and_result/filter.py
+++ b/自然语言处理/data_and_result/filter.py
@@ -0,0 +1,18 @@

 # Bracket and separator characters of interest. This list is not referenced
 # by the code visible in this file.
 filterChar = ['《', '》', '「', '」', '【', '】', '(', ')', '[', ']', '·', '・', '•']

 # Characters a word must not begin with (closing brackets and dot separators).
 notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
 # Characters a word must not end with (opening brackets and dot separators).
 notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']


 def notStartEnd(word):
     """Return True when ``word`` has an acceptable first and last character.

     A word is acceptable when its first character is not listed in
     ``notStartChar`` and its last character is not listed in ``notEndChar``.

     Args:
         word: The candidate string to check.

     Returns:
         bool: True if both boundary characters are allowed, otherwise False.
         An empty ``word`` has no boundary characters to validate and is
         rejected with False rather than raising ``IndexError``.
     """
     if not word:
         return False
     return word[0] not in notStartChar and word[-1] not in notEndChar

 # Ad-hoc check of notStartEnd: print the verdict for each sample word.
 for _sample in (
     '》阿斯顿',        # starts with a forbidden closing bracket -> False
     '】根据国际【',    # forbidden at both ends -> False
     '《国际经济】',    # opening bracket first, closing bracket last -> True
     '国际经济',        # no boundary punctuation -> True
 ):
     result = notStartEnd(_sample)
     print(result)