From f785b4aa978132eda76a77bc41f1ff2dfe029fbb Mon Sep 17 00:00:00 2001
From: wangsheng <ak@126.com>
Date: Wed, 29 Sep 2021 14:37:13 +0800
Subject: [PATCH] =?UTF-8?q?compare.py=E3=80=81filter.py=E4=B8=8A=E4=BC=A0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 自然语言处理/data_and_result/compare.py | 59 +++++++++++++++++++++++++++
 自然语言处理/data_and_result/filter.py  | 18 ++++++++
 2 files changed, 77 insertions(+)
 create mode 100644 自然语言处理/data_and_result/compare.py
 create mode 100644 自然语言处理/data_and_result/filter.py

diff --git a/自然语言处理/data_and_result/compare.py b/自然语言处理/data_and_result/compare.py
new file mode 100644
index 0000000..68f49c5
--- /dev/null
+++ b/自然语言处理/data_and_result/compare.py
@@ -0,0 +1,59 @@
+
+import pandas as pd
+# The triple-quoted string below is never assigned or used, so it has no effect at run time; it only keeps an earlier, disabled version of the comparison.
+'''
+data1 = pd.read_table('./result-艺术-4字.txt',
+                      header=None,                    # 表示不要导入原文件内的表头
+                      names=['keyword','count'],   #自定义列名
+                      sep=',',                     # 原文件的分隔符是'::'，此处是按此分隔符将数据导入
+                      engine= 'python',
+                      index_col = 'keyword')
+
+data2 = pd.read_table('./result-艺术-7字.txt',
+                      header=None,                    # 表示不要导入原文件内的表头
+                      names=['keyword','count'],   #自定义列名
+                      sep=',',                     # 原文件的分隔符是'::'，此处是按此分隔符将数据导入
+                      engine= 'python',
+                      index_col = 'keyword')
+
+jiaoji = pd.merge(data1, data2, on=['keyword','count'])
+jiaoji.to_csv('./compare-jiaoji-result.txt', header = False)
+
+bingji = pd.merge(data1, data2, on=['keyword','count'], how='outer')
+bingji.to_csv('./compare-bingji-result.txt', header = False)
+
+chaji=data1.append(data2)
+chaji=pd.Series(chaji.index).value_counts()
+chaji=chaji[chaji==1]
+chaji.to_csv('./compare-chaji-result.txt', header = False)
+'''
+
+data1 = pd.read_table('./result-艺术-7字-标点符号混乱化.txt',
+                      header=None,                 # do not treat the first line as a header
+                      names=['keyword', 'count'],  # column names assigned here
+                      sep=',',                     # fields are split on commas
+                      engine='python')
+# Second input, parsed with the same options as the first.
+data2 = pd.read_table('./result-艺术-7字-去重.txt',
+                      header=None,                 # do not treat the first line as a header
+                      names=['keyword', 'count'],  # column names assigned here
+                      sep=',',                     # fields are split on commas
+                      engine='python')
+# Only the keywords are compared below, so the count columns are dropped.
+del data1['count']
+del data2['count']
+# jiaoji (intersection): inner merge keeps keywords found in both inputs.
+jiaoji = pd.merge(data1, data2, on=['keyword'])
+jiaoji.to_csv('./compare-jiaoji-result.txt', header=False)
+# bingji (union): outer merge keeps keywords found in either input.
+bingji = pd.merge(data1, data2, on=['keyword'], how='outer')
+bingji.to_csv('./compare-bingji-result.txt', header=False)
+# chaji (difference): stack both frames, then drop every keyword that occurs more than once.
+chaji = pd.concat([data2, data1])  # DataFrame.append was removed in pandas 2.0
+chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
+chaji.to_csv('./compare-chaji-result.txt', header=False)
+# NOTE(review): the lines below write to the same path as the lines above, so the
+# data2-vs-data1 result is overwritten and only this one remains on disk; confirm intent.
+chaji = pd.concat([jiaoji, data1])
+chaji = chaji.drop_duplicates(subset=['keyword'], keep=False)
+chaji.to_csv('./compare-chaji-result.txt', header=False)
diff --git a/自然语言处理/data_and_result/filter.py b/自然语言处理/data_and_result/filter.py
new file mode 100644
index 0000000..8d17b60
--- /dev/null
+++ b/自然语言处理/data_and_result/filter.py
@@ -0,0 +1,18 @@
+
+filterChar = ['《', '》', '「', '」', '【', '】', '(', ')', '[', ']', '·', '・', '•']  # not referenced by any other line of this file
+# notStartEnd() rejects a word whose first char is in notStartChar or whose last char is in notEndChar.
+notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
+notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']
+def notStartEnd(word):
+    """Return True if word is non-empty, its first char is not in notStartChar and its last char is not in notEndChar."""
+    # Guard first: indexing an empty string raises IndexError, so an empty word is reported as False.
+    return bool(word) and word[0] not in notStartChar and word[-1] not in notEndChar
+
+# Smoke check for notStartEnd(): prints one verdict per sample word.
+# '》阿斯顿' and '】根据国际【' both begin with a closing bracket -> False.
+# '《国际经济】' begins with an opening bracket and ends with a closing one -> True.
+# '国际经济' contains none of the listed characters -> True.
+samples = ('》阿斯顿', '】根据国际【', '《国际经济】', '国际经济')
+for sample in samples:
+    result = notStartEnd(sample)
+    print(result)