|
|
@@ -30,6 +30,13 @@ def isChinese(word): |
|
|
|
return True |
|
|
|
return False |
|
|
|
|
|
|
|
# Characters a candidate word must not begin with: closing brackets/quotes
# and middle-dot style separators.
notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']

# Characters a candidate word must not end with: opening brackets/quotes
# and middle-dot style separators.
notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']


def notStartEnd(word):
    """Return True if *word* has acceptable first and last characters.

    A word is rejected when its first character is listed in
    ``notStartChar`` or its last character is listed in ``notEndChar``.

    Args:
        word: The candidate string to check.

    Returns:
        bool: True when both boundary characters are acceptable, False
        otherwise. An empty *word* has no boundary characters to accept,
        so it is rejected (False) instead of raising IndexError.
    """
    if not word:
        return False
    # word[-1] is the last character; the original indexed with
    # len(work) - 1, where "work" was a typo for "word" (NameError).
    return word[0] not in notStartChar and word[-1] not in notEndChar
|
|
|
|
|
|
|
for m in range(2, max_sep+1): |
|
|
|
print(u'正在生成%s字词...'%m) |
|
|
|
t.append([]) |
|
|
@@ -44,7 +51,7 @@ for m in range(2, max_sep+1): |
|
|
|
qq = np.array(tt.index) |
|
|
|
qqfilter = [] |
|
|
|
for word in qq: |
|
|
|
qqfilter.append(isChinese(word)) |
|
|
|
qqfilter.append(isChinese(word) and notStartEnd(word)) |
|
|
|
qq = qq[qqfilter] #非汉字过滤 |
|
|
|
tt = tt[qq] |
|
|
|
#非汉字过滤 end |
|
|
|