|
|
@@ -24,6 +24,12 @@ t.append(pd.Series(list(s)).value_counts()) #逐字统计 |
|
|
|
tsum = t[0].sum() # sum of all counts held in t[0] (original note: total number of characters) |
|
|
|
rt = [] # used to store the results |
|
|
|
|
|
|
|
def isChinese(word):
    """Return True when every character of *word* is a Chinese character.

    A character counts as Chinese when it lies in the range
    U+4E00..U+9FFF (inclusive). Anything else, such as digits, Latin
    letters or punctuation, makes the result False.

    Args:
        word: The string whose characters are checked one by one.

    Returns:
        False if at least one character falls outside the range, True
        otherwise. An empty *word* contains no offending character and
        therefore yields True.
    """
    return all('\u4e00' <= ch <= '\u9fff' for ch in word)
|
|
|
|
|
|
|
# Process every candidate word length m, from 2 up to and including max_sep.
for m in range(2, max_sep+1): |
|
|
|
# Progress message; the text reads "generating %s-character words...".
print(u'正在生成%s字词...'%m) |
|
|
|
# Append an empty slot to t for this word length
# (presumably the entry later addressed as t[m-1] -- confirm against the full file).
t.append([]) |
|
|
@@ -33,6 +39,16 @@ for m in range(2, max_sep+1): |
|
|
|
t[m-1] = pd.Series(t[m-1]).value_counts() # count occurrences of each word |
|
|
|
t[m-1] = t[m-1][t[m-1] > min_count] # minimum-count filter: keep entries whose count exceeds min_count |
|
|
|
# Start the candidate set tt from a full slice of t[m-1].
tt = t[m-1][:] |
|
|
|
|
|
|
|
# Non-Chinese filter: start
# Keep only the candidates for which isChinese() is true, i.e. the labels in
# tt's index that consist solely of Chinese characters.
qq = np.array(tt.index)
qq = qq[[isChinese(word) for word in qq]]  # boolean mask over the labels
tt = tt[qq]
# Non-Chinese filter: end
|
|
|
|
|
|
|
# For each k, every remaining candidate ms is split into the prefix
# ms[:m-1-k] and the suffix ms[m-1-k:], and the ratio
#   tsum * t[m-1][ms] / t[m-2-k][prefix] / t[k][suffix]
# is computed. Only candidates whose ratio exceeds min_support are kept,
# so a candidate must pass the test for every k to stay in tt.
# (Assumes each ms has m characters, giving a prefix of length m-1-k and
# a suffix of length k+1 -- confirm against the code that builds t[m-1].)
for k in range(m-1): |
|
|
|
qq = np.array(list(map(lambda ms: tsum*t[m-1][ms]/t[m-2-k][ms[:m-1-k]]/t[k][ms[m-1-k:]], tt.index))) > min_support # minimum-support filter |
|
|
|
tt = tt[qq] |
|
|
|