From 83b319e6eb876a94496e2c2da7212d0f50ecac1d Mon Sep 17 00:00:00 2001
From: abtion
Date: Wed, 24 Jun 2020 10:04:45 +0800
Subject: [PATCH] Fix two minor issues in text summarization and add test
 cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jiagu/textrank.py     |   5 +-
 jiagu/utils.py        | 431 ++++++++++++++++++++++++-------------------------
 test/test_textrank.py |  24 ++-
 3 files changed, 242 insertions(+), 218 deletions(-)

diff --git a/jiagu/textrank.py b/jiagu/textrank.py
index ec8fe1e..dcab08f 100644
--- a/jiagu/textrank.py
+++ b/jiagu/textrank.py
@@ -90,8 +90,9 @@ class Summarize(object):
         if stop_words_file:
             self.__stop_words_file = stop_words_file
         if use_stopword:
-            for word in open(self.__stop_words_file, 'r', encoding='utf-8'):
-                self.__stop_words.add(word.strip())
+            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
+                for word in f:
+                    self.__stop_words.add(word.strip())
 
     def filter_dictword(self, sents):
         _sents = []
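Review note on the textrank.py hunk: iterating a bare open() leaves the file handle to be closed whenever the garbage collector gets around to it (CPython emits a ResourceWarning for this when warnings are enabled), while the with block closes it deterministically. A minimal sketch of the fixed pattern, using a hypothetical file path:

    # Sketch only — 'stopwords.txt' is a hypothetical path, one stopword per line.
    stop_words = set()
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        for word in f:                    # iterate the file line by line
            stop_words.add(word.strip())  # drop the trailing newline
    # The file is guaranteed to be closed here, even if reading raised an exception.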
The raw patch deletes and re-adds every line of jiagu/utils.py (apparently a line-ending normalization, which is why the diffstat above reports 431 changed lines). The only functional change is in cut_sentences; the hunk is condensed here with the rest of the file as context:

diff --git a/jiagu/utils.py b/jiagu/utils.py
index 286715e..73ec11b 100644
--- a/jiagu/utils.py
+++ b/jiagu/utils.py
@@ -1,215 +1,216 @@
 # -*- encoding:utf-8 -*-
 import os
 import jiagu
 import math
 
 
 def default_stopwords_file():
     d = os.path.dirname(os.path.realpath(__file__))
     return os.path.join(d, 'data/stopwords.txt')
 
 
 sentence_delimiters = ['。', '?', '!', '…']
 allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                      'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
 
 
 def as_text(v):
     """Return a unicode string."""
     if v is None:
         return None
     elif isinstance(v, bytes):
         return v.decode('utf-8', errors='ignore')
     elif isinstance(v, str):
         return v
     else:
         raise ValueError('Unknown type %r' % type(v))
 
 
 def is_text(v):
     return isinstance(v, str)
 
 
 def cut_sentences(sentence):
     tmp = []
     for ch in sentence:  # iterate over every character in the string
         tmp.append(ch)
         if ch in sentence_delimiters:
             yield ''.join(tmp)
             tmp = []
-    yield ''.join(tmp)
+    if len(tmp) > 0:  # text ending in a delimiter was already yielded inside the loop
+        yield ''.join(tmp)
 
 
 def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
     sentences = []
     sents = []
     for sent in cutted_sentences:
         sentences.append(sent)
         if use_stopwords:
             sents.append([word for word in jiagu.seg(sent) if word and word not in stopwords])  # segment the sentence into words
         else:
             sents.append([word for word in jiagu.seg(sent) if word])
     return sentences, sents
 
 
 def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
     sents = []
     sentences = []
     for sent in cutted_sentences:
         sentences.append(sent)
 
         word_list = jiagu.seg(sent)
         word_list = [word for word in word_list if len(word) > 0]
         if use_stopwords:
             word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
         sents.append(word_list)
     return sentences, sents
 
 
 def weight_map_rank(weight_graph, max_iter, tol):
     # the initial score is 0.5
     # initialize the current and previous score of every sentence
     scores = [0.5 for _ in range(len(weight_graph))]
     old_scores = [0.0 for _ in range(len(weight_graph))]
     denominator = get_degree(weight_graph)
 
     # start iterating
     count = 0
     while different(scores, old_scores, tol):
         for i in range(len(weight_graph)):
             old_scores[i] = scores[i]
         # compute the score of each sentence
         for i in range(len(weight_graph)):
             scores[i] = get_score(weight_graph, denominator, i)
         count += 1
         if count > max_iter:
             break
     return scores
 
 
 def get_degree(weight_graph):
     length = len(weight_graph)
     denominator = [0.0 for _ in range(len(weight_graph))]
     for j in range(length):
         for k in range(length):
             denominator[j] += weight_graph[j][k]
         if denominator[j] == 0:
             denominator[j] = 1.0
     return denominator
 
 
 def get_score(weight_graph, denominator, i):
     """
 
     :param weight_graph: sentence similarity matrix
     :param denominator: out-degree of every sentence, from get_degree
     :param i: int
         the i-th sentence
     :return: float
     """
     length = len(weight_graph)
     d = 0.85
     added_score = 0.0
 
     for j in range(length):
         # [j, i] means sentence j points to sentence i
         fraction = weight_graph[j][i] * 1.0
         # divide by the out-degree of j
         added_score += fraction / denominator[j]
     weighted_score = (1 - d) + d * added_score
     return weighted_score
 
 
 def different(scores, old_scores, tol=0.0001):
     flag = False
     for i in range(len(scores)):
         if math.fabs(scores[i] - old_scores[i]) >= tol:  # the original default was 0.0001
             flag = True
             break
     return flag
 
 
 def combine(word_list, window=2):
     if window < 2:
         window = 2
     for x in range(1, window):
         if x >= len(word_list):
             break
         word_list2 = word_list[x:]
         res = zip(word_list, word_list2)
         for r in res:
             yield r
 
 
 def sentences_similarity(s1, s2):
     """Compute the similarity of two sentences.
 
     :param s1: list
     :param s2: list
     :return: float
     """
     counter = 0
     for sent in s1:
         if sent in s2:
             counter += 1
     if counter == 0:
         return 0
     return counter / (math.log(len(s1) + len(s2)))
 
 
 # --------------------------------------------------------------------
 
 def is_chinese(uchar):
     """Check whether a character is a Chinese character."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u4e00' <= uchar <= u'\u9fa5':
         return True
     else:
         return False
 
 
 def is_number(uchar):
     """Check whether a character is a digit."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u0030' <= uchar <= u'\u0039':
         return True
     else:
         return False
 
 
 def is_alphabet(uchar):
     """Check whether a character is an English letter."""
     assert len(uchar) == 1, "uchar must be a single character"
     if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
         return True
     else:
         return False
 
 
 def B2Q(uchar):
     """Convert a single half-width character to full-width."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code < 0x0020 or inside_code > 0x7e:
         # not a half-width character: return it unchanged
         return uchar
     if inside_code == 0x0020:
         # apart from the space, the conversion rule is: half-width = full-width - 0xfee0
         inside_code = 0x3000
     else:
         inside_code += 0xfee0
     return chr(inside_code)
 
 
 def Q2B(uchar):
     """Convert a single full-width character to half-width."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code == 0x3000:
         inside_code = 0x0020
     else:
         inside_code -= 0xfee0
     if inside_code < 0x0020 or inside_code > 0x7e:
         # if the result is not a half-width character, return the original character
         return uchar
     return chr(inside_code)
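Review note on the cut_sentences hunk: for text that ends with a delimiter, the old generator yielded a trailing empty string, which the summarizer then treated as an extra "sentence". A standalone re-implementation of the fixed generator to illustrate (sketch only; the real code is the hunk above):

    sentence_delimiters = {'。', '?', '!', '…'}

    def cut_sentences_fixed(text):
        tmp = []
        for ch in text:
            tmp.append(ch)
            if ch in sentence_delimiters:
                yield ''.join(tmp)
                tmp = []
        if tmp:  # the fix: don't yield the empty trailing chunk
            yield ''.join(tmp)

    # before the fix: ['你好。', '再见!', '']  — note the bogus empty sentence
    # after the fix:  ['你好。', '再见!']
    print(list(cut_sentences_fixed('你好。再见!')))

For context on the unchanged scoring code: weight_map_rank is the damped PageRank-style iteration that TextRank uses, score_i = (1 - d) + d * sum_j(w[j][i] / deg(j)) with d = 0.85, where deg(j) is the row sum computed by get_degree. A hypothetical call on a made-up 3-sentence similarity matrix, assuming the package layout above:

    from jiagu.utils import weight_map_rank

    # symmetric similarity matrix for 3 sentences (numbers invented for illustration)
    graph = [[0.0, 0.8, 0.2],
             [0.8, 0.0, 0.5],
             [0.2, 0.5, 0.0]]
    scores = weight_map_rank(graph, max_iter=100, tol=0.0001)
    # the middle sentence, most similar to the other two, ends up with the highest score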
diff --git a/test/test_textrank.py b/test/test_textrank.py
index fd3dbd6..8f321bf 100644
--- a/test/test_textrank.py
+++ b/test/test_textrank.py
@@ -121,7 +121,29 @@ class TestTextRank(unittest.TestCase):
         print(summarize)
         self.assertTrue(len(summarize) == 3)
 
+    def test_cut_sentences(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+        上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+        汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+        安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+        律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+        过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        sentences = list(utils.cut_sentences(text))
+        self.assertEqual(len(sentences), 4)
+
+    def test_short_text_summarize(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+        上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+        汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+        安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+        律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+        过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        summarize = jiagu.summarize(text, 5)  # request more summary sentences than the text contains
+        print(summarize)
+        print(len(summarize))
+
 
 if __name__ == '__main__':
     unittest.main()
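Both new tests rely on re.sub and on a module-level utils reference, so the test module is assumed to already import re and jiagu.utils (the hunk shows only added lines, so the imports cannot be confirmed from the patch alone). To run the new tests locally, something like:

    python -m unittest test.test_textrank -v

One observation: test_short_text_summarize only prints its result. It exercises the path where more summary sentences are requested than the text contains, but asserts nothing, so it guards against crashes rather than against wrong output.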