Browse Source

苏剑林分词,去掉》开头或《结尾的候选词

master
wangsheng 3 years ago
parent
commit
21c4f16be5
1 changed file with 8 additions and 1 deletion
  1. +8
    -1
      自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py

+ 8
- 1
自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py View File

@@ -30,6 +30,13 @@ def isChinese(word):
return True
return False

# Characters a candidate word must not begin with: closing brackets/quotes
# and middle-dot style separators.
notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
# Characters a candidate word must not end with: opening brackets/quotes
# and middle-dot style separators.
notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']


def notStartEnd(word):
    """Return True if *word* has an acceptable first and last character.

    A word is rejected (False) when its first character is listed in
    ``notStartChar`` or its last character is listed in ``notEndChar``.
    An empty string has no boundary characters and is therefore accepted.

    Args:
        word: The candidate string to check.

    Returns:
        bool: True when neither boundary character is forbidden.
    """
    # Slices instead of indexes: '' yields '' (which is in neither list)
    # rather than raising IndexError. The previous version also referenced an
    # undefined name ``work`` when computing the last index, which raised
    # NameError for every word whose first character was acceptable.
    return word[:1] not in notStartChar and word[-1:] not in notEndChar

for m in range(2, max_sep+1):
print(u'正在生成%s字词...'%m)
t.append([])
@@ -44,7 +51,7 @@ for m in range(2, max_sep+1):
qq = np.array(tt.index)
qqfilter = []
for word in qq:
qqfilter.append(isChinese(word))
qqfilter.append(isChinese(word) and notStartEnd(word))
qq = qq[qqfilter] #非汉字过滤
tt = tt[qq]
#非汉字过滤 end


Loading…
Cancel
Save