From 21c4f16be5ce82e4572ead094be6cdaab1ee0a08 Mon Sep 17 00:00:00 2001
From: wangsheng
Date: Wed, 29 Sep 2021 09:54:17 +0800
Subject: [PATCH] =?UTF-8?q?=E8=8B=8F=E5=89=91=E6=9E=97=E5=88=86=E8=AF=8D?=
 =?UTF-8?q?=EF=BC=8C=E5=8E=BB=E6=8E=89=E3=80=8B=E5=BC=80=E5=A4=B4=E6=88=96?=
 =?UTF-8?q?=E3=80=8A=E7=BB=93=E5=B0=BE=E7=9A=84=E5=80=99=E9=80=89=E8=AF=8D?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Review notes (this section is ignored by `git am`):
- Subject, decoded and translated: "Su Jianlin word segmentation: drop
  candidate words that start with a closing bracket such as U+300B or end
  with an opening bracket such as U+300A".
- What the patch does: adds notStartEnd(word), which returns True only when
  the first character of `word` is not in notStartChar and the last
  character is not in notEndChar, and ANDs it into the existing isChinese()
  candidate filter.
- Fix applied in review: the added condition read `word[len(work) - 1]`;
  `work` is undefined (typo for `word`) and raised NameError when evaluated.
  It now reads `word[-1]`. The number of added lines is unchanged, so the
  hunk headers and the diffstat below are still accurate; the post-image
  blob id on the `index` line predates this fix.
- NOTE(review): this patch was recovered from a copy whose newlines had been
  collapsed. Line breaks and indentation of the context lines (including the
  placement of the trailing comments in the second hunk) are a best-effort
  reconstruction - confirm against the target file, or apply with
  `git apply --ignore-whitespace`.
- notStartEnd() indexes word[0] and word[-1], so it assumes a non-empty
  string; the visible caller iterates `tt.index` - confirm it never yields
  an empty entry.

 .../短语挖掘与新词发现/苏剑林/main_sujianlin.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
index bcb5a01..5c9ad2f 100644
--- a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
+++ b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
@@ -30,6 +30,13 @@ def isChinese(word):
             return True
     return False
 
+notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
+notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']
+def notStartEnd(word):
+    if word[0] not in notStartChar and word[-1] not in notEndChar:
+        return True
+    return False
+
 for m in range(2, max_sep+1):
     print(u'正在生成%s字词...'%m)
     t.append([])
@@ -44,7 +51,7 @@ for m in range(2, max_sep+1):
     qq = np.array(tt.index)
     qqfilter = []
     for word in qq:
-        qqfilter.append(isChinese(word))
+        qqfilter.append(isChinese(word) and notStartEnd(word))
     qq = qq[qqfilter] #非汉字过滤
     tt = tt[qq]
     #非汉字过滤 end