苏剑林分词，去掉》开头或《结尾的候选词

3 years ago · 21c4f16be5
--- a/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
+++ b/自然语言处理/短语挖掘与新词发现/苏剑林/main_sujianlin.py
@@ -30,6 +30,13 @@ def isChinese(word):
            return True
    return False

 # Characters a candidate word must not begin with: closing brackets and
 # middle-dot style separators.
 notStartChar = ['》', '」', '】', ')', ']', '·', '・', '•']
 # Characters a candidate word must not end with: opening brackets and
 # middle-dot style separators.
 notEndChar = ['《', '「', '【', '(', '[', '·', '・', '•']
 def notStartEnd(word):
    """Return True when *word* has an acceptable first and last character.

    A word is rejected (False) when its first character is listed in
    ``notStartChar`` or its last character is listed in ``notEndChar``.
    An empty word has no first or last character and is rejected as well.

    :param word: candidate word (any indexable string).
    :return: True if the word may be kept, False otherwise.
    """
    if not word:
        # Nothing to index; an empty string is not a usable candidate.
        return False
    # word[-1] is the last character (the original indexed with an
    # undefined name, which raised NameError at call time).
    return word[0] not in notStartChar and word[-1] not in notEndChar

 for m in range(2, max_sep+1):
    print(u'正在生成%s字词...'%m)
    t.append([])
@@ -44,7 +51,7 @@ for m in range(2, max_sep+1):
    qq = np.array(tt.index)
    qqfilter = []
    for word in qq:
        qqfilter.append(isChinese(word))
        qqfilter.append(isChinese(word) and notStartEnd(word))
    qq = qq[qqfilter] #非汉字过滤
    tt = tt[qq]
    #非汉字过滤 end