# Reconstructed from a git format-patch whose line breaks had been lost.
#
#   commit : f785b4aa978132eda76a77bc41f1ff2dfe029fbb
#   author : wangsheng
#   date   : Wed, 29 Sep 2021 14:37:13 +0800
#   subject: compare.py、filter.py上传  ("upload compare.py and filter.py")
#
# The patch creates two scripts under 自然语言处理/data_and_result/:
#   compare.py - set-style comparison of two keyword result files
#   filter.py  - boundary-character check for candidate words
# Both are kept below, each in its own section.

import os

import pandas as pd


# ---------------------------------------------------------------------------
# compare.py
# ---------------------------------------------------------------------------
# NOTE: a disabled earlier variant (kept in a triple-quoted string) compared
# the 4-character and 7-character result files, merging on both keyword and
# count. It was dead code and has been removed.

def _read_keywords(path):
    """Read a headerless ``keyword,count`` file and return only ``keyword``.

    Args:
        path: Path of a comma-separated text file with two fields per line.

    Returns:
        A DataFrame with a single ``keyword`` column and the default index.
    """
    frame = pd.read_table(
        path,
        header=None,                 # the file has no header row
        names=['keyword', 'count'],  # assign our own column names
        sep=',',                     # fields are comma-separated
        engine='python',
    )
    # Only the keywords are compared; the counts are discarded.
    return frame.drop(columns='count')


def compare_keyword_files(
    path1='./result-艺术-7字-标点符号混乱化.txt',
    path2='./result-艺术-7字-去重.txt',
    out_dir='.',
):
    """Compare the keywords of two result files and write the comparisons.

    Three headerless CSV files (index, keyword) are written into ``out_dir``:

    * ``compare-jiaoji-result.txt`` - inner merge on ``keyword``
      (intersection).
    * ``compare-bingji-result.txt`` - outer merge on ``keyword`` (union).
    * ``compare-chaji-result.txt``  - difference, see the note in the body.

    Args:
        path1: First ``keyword,count`` file.
        path2: Second ``keyword,count`` file.
        out_dir: Directory that receives the three result files.
    """
    data1 = _read_keywords(path1)
    data2 = _read_keywords(path2)

    # Intersection: keywords present in both files.
    jiaoji = pd.merge(data1, data2, on=['keyword'])
    jiaoji.to_csv(os.path.join(out_dir, 'compare-jiaoji-result.txt'),
                  header=False)

    # Union: keywords present in either file.
    bingji = pd.merge(data1, data2, on=['keyword'], how='outer')
    bingji.to_csv(os.path.join(out_dir, 'compare-bingji-result.txt'),
                  header=False)

    chaji_path = os.path.join(out_dir, 'compare-chaji-result.txt')

    # Symmetric difference: keywords occurring exactly once across both files.
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
    # like append, it keeps each frame's original index.
    chaji = pd.concat([data2, data1])
    chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
    chaji.to_csv(chaji_path, header=False)

    # NOTE(review): this second result is written to the SAME path and so
    # always overwrites the symmetric difference above. It keeps keywords that
    # occur exactly once in (intersection + data1), i.e. the rows of data1
    # whose keyword is absent from data2 (assuming keywords are unique within
    # data1). Confirm which of the two outputs is actually wanted.
    chaji = pd.concat([jiaoji, data1])
    chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
    chaji.to_csv(chaji_path, header=False)


# ---------------------------------------------------------------------------
# filter.py
# ---------------------------------------------------------------------------

# Bracket, quote and bullet characters of interest (not referenced below).
filterChar = ['《', '》', '「', '」', '【', '】', '(', ')', '[', ']', '·', '・', '•']

# Characters a word must not start with (closing marks and bullets).
notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
# Characters a word must not end with (opening marks and bullets).
notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']


def notStartEnd(word):
    """Check that ``word`` has no forbidden first or last character.

    Args:
        word: The candidate word to check.

    Returns:
        True when the first character is not in ``notStartChar`` and the last
        character is not in ``notEndChar``; False otherwise. An empty ``word``
        has no boundary characters and therefore returns True (previously it
        raised IndexError); callers that must reject empty words should test
        for that separately.
    """
    if not word:
        return True
    return word[0] not in notStartChar and word[-1] not in notEndChar


if __name__ == '__main__':
    compare_keyword_files()

    # Smoke checks from filter.py; prints False, False, True, True.
    for sample in ('》阿斯顿', '】根据国际【', '《国际经济】', '国际经济'):
        result = notStartEnd(sample)
        print(result)