From 0c9fc24a86714f5511509a0aeab85f182115fcc2 Mon Sep 17 00:00:00 2001
From: wangsheng
Date: Tue, 28 Sep 2021 16:15:50 +0800
Subject: [PATCH] =?UTF-8?q?=E9=9D=9E=E6=B1=89=E5=AD=97=E8=BF=87=E6=BB=A4?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../苏剑林/main_sujianlin.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
index 07c5632..03f3f5b 100644
--- a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
+++ b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
@@ -24,6 +24,15 @@ t.append(pd.Series(list(s)).value_counts()) #逐字统计
 tsum = t[0].sum() #统计总字数
 rt = [] #保存结果用
 
+def isChinese(word):
+    """Return True if every character of ``word`` lies in U+4E00..U+9FFF.
+
+    That range is the basic CJK Unified Ideographs block, so digits, Latin
+    letters, punctuation and whitespace all make the result False. An empty
+    ``word`` gives True because it contains no offending character.
+    """
+    return all(u'\u4e00' <= ch <= u'\u9fff' for ch in word)
+
 for m in range(2, max_sep+1):
     print(u'正在生成%s字词...'%m)
     t.append([])
@@ -33,6 +42,12 @@ for m in range(2, max_sep+1):
     t[m-1] = pd.Series(t[m-1]).value_counts() #逐词统计
     t[m-1] = t[m-1][t[m-1] > min_count] #最小次数筛选
     tt = t[m-1][:]
+
+    # Keep only candidates made up entirely of Chinese characters; the
+    # explicit dtype keeps the mask boolean even when tt is empty.
+    keep = np.array([isChinese(word) for word in tt.index], dtype=bool)
+    tt = tt[keep]
+
     for k in range(m-1):
         qq = np.array(list(map(lambda ms: tsum*t[m-1][ms]/t[m-2-k][ms[:m-1-k]]/t[k][ms[m-1-k:]], tt.index))) > min_support #最小支持度筛选。
         tt = tt[qq]