Browse Source

compare.py、filter.py上传

master
wangsheng 3 years ago
parent
commit
f785b4aa97
2 changed files with 77 additions and 0 deletions
  1. +59
    -0
      自然语言处理/data_and_result/compare.py
  2. +18
    -0
      自然语言处理/data_and_result/filter.py

+ 59
- 0
自然语言处理/data_and_result/compare.py View File

@@ -0,0 +1,59 @@

import pandas as pd

'''
data1 = pd.read_table('./result-艺术-4字.txt',
header=None, # 表示不要导入原文件内的表头
names=['keyword','count'], #自定义列名
sep=',', # 原文件的分隔符是'::',此处是按此分隔符将数据导入
engine= 'python',
index_col = 'keyword')

data2 = pd.read_table('./result-艺术-7字.txt',
header=None, # 表示不要导入原文件内的表头
names=['keyword','count'], #自定义列名
sep=',', # 原文件的分隔符是'::',此处是按此分隔符将数据导入
engine= 'python',
index_col = 'keyword')

jiaoji = pd.merge(data1, data2, on=['keyword','count'])
jiaoji.to_csv('./compare-jiaoji-result.txt', header = False)

bingji = pd.merge(data1, data2, on=['keyword','count'], how='outer')
bingji.to_csv('./compare-bingji-result.txt', header = False)

chaji=data1.append(data2)
chaji=pd.Series(chaji.index).value_counts()
chaji=chaji[chaji==1]
chaji.to_csv('./compare-chaji-result.txt', header = False)
'''

def _read_keywords(path):
    """Load a headerless ``keyword,count`` text file and keep only ``keyword``.

    Args:
        path: Path of the comma-separated result file to read.

    Returns:
        A DataFrame with a single ``keyword`` column and the default
        integer index.
    """
    table = pd.read_table(
        path,
        header=None,  # do not treat the first line of the file as a header
        names=['keyword', 'count'],  # column names assigned on load
        sep=',',  # fields are split on commas
        engine='python',
    )
    # Only the keywords are compared; the counts are discarded.
    del table['count']
    return table


data1 = _read_keywords('./result-艺术-7字-标点符号混乱化.txt')
data2 = _read_keywords('./result-艺术-7字-去重.txt')

# Intersection ("jiaoji"): keywords that occur in both files.
jiaoji = pd.merge(data1, data2, on=['keyword'])
jiaoji.to_csv('./compare-jiaoji-result.txt', header=False)

# Union ("bingji"): keywords that occur in either file.
bingji = pd.merge(data1, data2, on=['keyword'], how='outer')
bingji.to_csv('./compare-bingji-result.txt', header=False)

# Difference ("chaji"), first variant: stack both tables and drop every
# keyword that occurs more than once, leaving keywords seen exactly once.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
chaji = pd.concat([data2, data1])
chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
chaji.to_csv('./compare-chaji-result.txt', header=False)

# Difference, second variant: stack the intersection with data1 and again
# keep only the keywords that occur exactly once.
# NOTE(review): this writes to the same path as the variant above, so the
# first result is overwritten -- confirm whether both outputs are wanted.
chaji = pd.concat([jiaoji, data1])
chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
chaji.to_csv('./compare-chaji-result.txt', header=False)

+ 18
- 0
自然语言处理/data_and_result/filter.py View File

@@ -0,0 +1,18 @@

# Full set of bracket/separator characters; not referenced by the code
# visible in this file.
filterChar = ['《', '》', '「', '」', '【', '】', '(', ')', '[', ']', '·', '・', '•']

# Characters a word must not begin with.
notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
# Characters a word must not end with.
notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']


def notStartEnd(word):
    """Report whether *word* has acceptable first and last characters.

    Args:
        word: The string to check.

    Returns:
        ``True`` when *word* neither begins with an entry of
        ``notStartChar`` nor ends with an entry of ``notEndChar``;
        ``False`` otherwise. An empty string has no offending boundary
        character and therefore yields ``True`` instead of raising
        ``IndexError``.
    """
    # startswith/endswith accept a tuple of candidates and are safe on '',
    # unlike indexing word[0] / word[-1].
    bad_start = word.startswith(tuple(notStartChar))
    bad_end = word.endswith(tuple(notEndChar))
    return not (bad_start or bad_end)

# Run notStartEnd over four sample words and print each verdict in order.
for result in map(notStartEnd, ('》阿斯顿', '】根据国际【', '《国际经济】', '国际经济')):
    print(result)

Loading…
Cancel
Save