From 83b319e6eb876a94496e2c2da7212d0f50ecac1d Mon Sep 17 00:00:00 2001
From: abtion
Date: Wed, 24 Jun 2020 10:04:45 +0800
Subject: [PATCH] Fix two minor issues in text summarization and add test
 cases
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 jiagu/textrank.py     |   5 +-
 jiagu/utils.py        | 431 ++++++++++++++++++++++++-------------------------
 test/test_textrank.py |  24 ++-
 3 files changed, 242 insertions(+), 218 deletions(-)

diff --git a/jiagu/textrank.py b/jiagu/textrank.py
index ec8fe1e..dcab08f 100644
--- a/jiagu/textrank.py
+++ b/jiagu/textrank.py
@@ -90,8 +90,9 @@ class Summarize(object):
         if stop_words_file:
             self.__stop_words_file = stop_words_file
         if use_stopword:
-            for word in open(self.__stop_words_file, 'r', encoding='utf-8'):
-                self.__stop_words.add(word.strip())
+            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
+                for word in f:
+                    self.__stop_words.add(word.strip())
 
     def filter_dictword(self, sents):
         _sents = []
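Review note on the textrank.py hunk: iterating a bare open() leaves the file handle to be closed whenever the garbage collector gets around to it (CPython emits a ResourceWarning for this when warnings are enabled), while the with block closes it deterministically. A minimal sketch of the fixed pattern, using a hypothetical file path:

    # Sketch only — 'stopwords.txt' is a hypothetical path, one stopword per line.
    stop_words = set()
    with open('stopwords.txt', 'r', encoding='utf-8') as f:
        for word in f:                    # iterate the file line by line
            stop_words.add(word.strip())  # drop the trailing newline
    # The file is guaranteed to be closed here, even if reading raised an exception.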
The raw patch deletes and re-adds every line of jiagu/utils.py (apparently a line-ending normalization, which is why the diffstat above reports 431 changed lines). The only functional change is in cut_sentences; the hunk is condensed here with the rest of the file as context:

diff --git a/jiagu/utils.py b/jiagu/utils.py
index 286715e..73ec11b 100644
--- a/jiagu/utils.py
+++ b/jiagu/utils.py
@@ -1,215 +1,216 @@
 # -*- encoding:utf-8 -*-
 import os
 import jiagu
 import math
 
 
 def default_stopwords_file():
     d = os.path.dirname(os.path.realpath(__file__))
     return os.path.join(d, 'data/stopwords.txt')
 
 
 sentence_delimiters = ['。', '?', '!', '…']
 allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                      'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
 
 
 def as_text(v):
     """Return a unicode string."""
     if v is None:
         return None
     elif isinstance(v, bytes):
         return v.decode('utf-8', errors='ignore')
     elif isinstance(v, str):
         return v
     else:
         raise ValueError('Unknown type %r' % type(v))
 
 
 def is_text(v):
     return isinstance(v, str)
 
 
 def cut_sentences(sentence):
     tmp = []
     for ch in sentence:  # iterate over every character in the string
         tmp.append(ch)
         if ch in sentence_delimiters:
             yield ''.join(tmp)
             tmp = []
-    yield ''.join(tmp)
+    if len(tmp) > 0:  # text ending in a delimiter was already yielded inside the loop
+        yield ''.join(tmp)
 
 
 def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
     sentences = []
     sents = []
     for sent in cutted_sentences:
         sentences.append(sent)
         if use_stopwords:
             sents.append([word for word in jiagu.seg(sent) if word and word not in stopwords])  # segment the sentence into words
         else:
             sents.append([word for word in jiagu.seg(sent) if word])
     return sentences, sents
 
 
 def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
     sents = []
     sentences = []
     for sent in cutted_sentences:
         sentences.append(sent)
 
         word_list = jiagu.seg(sent)
         word_list = [word for word in word_list if len(word) > 0]
         if use_stopwords:
             word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
         sents.append(word_list)
     return sentences, sents
 
 
 def weight_map_rank(weight_graph, max_iter, tol):
     # the initial score is 0.5
     # initialize the current and previous score of every sentence
     scores = [0.5 for _ in range(len(weight_graph))]
     old_scores = [0.0 for _ in range(len(weight_graph))]
     denominator = get_degree(weight_graph)
 
     # start iterating
     count = 0
     while different(scores, old_scores, tol):
         for i in range(len(weight_graph)):
             old_scores[i] = scores[i]
         # compute the score of each sentence
         for i in range(len(weight_graph)):
             scores[i] = get_score(weight_graph, denominator, i)
         count += 1
         if count > max_iter:
             break
     return scores
 
 
 def get_degree(weight_graph):
     length = len(weight_graph)
     denominator = [0.0 for _ in range(len(weight_graph))]
     for j in range(length):
         for k in range(length):
             denominator[j] += weight_graph[j][k]
         if denominator[j] == 0:
             denominator[j] = 1.0
     return denominator
 
 
 def get_score(weight_graph, denominator, i):
     """
 
     :param weight_graph: sentence similarity matrix
     :param denominator: out-degree of every sentence, from get_degree
     :param i: int
         the i-th sentence
     :return: float
     """
     length = len(weight_graph)
     d = 0.85
     added_score = 0.0
 
     for j in range(length):
         # [j, i] means sentence j points to sentence i
         fraction = weight_graph[j][i] * 1.0
         # divide by the out-degree of j
         added_score += fraction / denominator[j]
     weighted_score = (1 - d) + d * added_score
     return weighted_score
 
 
 def different(scores, old_scores, tol=0.0001):
     flag = False
     for i in range(len(scores)):
         if math.fabs(scores[i] - old_scores[i]) >= tol:  # the original default was 0.0001
             flag = True
             break
     return flag
 
 
 def combine(word_list, window=2):
     if window < 2:
         window = 2
     for x in range(1, window):
         if x >= len(word_list):
             break
         word_list2 = word_list[x:]
         res = zip(word_list, word_list2)
         for r in res:
             yield r
 
 
 def sentences_similarity(s1, s2):
     """Compute the similarity of two sentences.
 
     :param s1: list
     :param s2: list
     :return: float
     """
     counter = 0
     for sent in s1:
         if sent in s2:
             counter += 1
     if counter == 0:
         return 0
     return counter / (math.log(len(s1) + len(s2)))
 
 
 # --------------------------------------------------------------------
 
 def is_chinese(uchar):
     """Check whether a character is a Chinese character."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u4e00' <= uchar <= u'\u9fa5':
         return True
     else:
         return False
 
 
 def is_number(uchar):
     """Check whether a character is a digit."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u0030' <= uchar <= u'\u0039':
         return True
     else:
         return False
 
 
 def is_alphabet(uchar):
     """Check whether a character is an English letter."""
     assert len(uchar) == 1, "uchar must be a single character"
     if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
         return True
     else:
         return False
 
 
 def B2Q(uchar):
     """Convert a single half-width character to full-width."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code < 0x0020 or inside_code > 0x7e:
         # not a half-width character: return it unchanged
         return uchar
     if inside_code == 0x0020:
         # apart from the space, the conversion rule is: half-width = full-width - 0xfee0
         inside_code = 0x3000
     else:
         inside_code += 0xfee0
     return chr(inside_code)
 
 
 def Q2B(uchar):
     """Convert a single full-width character to half-width."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code == 0x3000:
         inside_code = 0x0020
     else:
         inside_code -= 0xfee0
     if inside_code < 0x0020 or inside_code > 0x7e:
         # if the result is not a half-width character, return the original character
         return uchar
     return chr(inside_code)
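Review note on the cut_sentences hunk: for text that ends with a delimiter, the old generator yielded a trailing empty string, which the summarizer then treated as an extra "sentence". A standalone re-implementation of the fixed generator to illustrate (sketch only; the real code is the hunk above):

    sentence_delimiters = {'。', '?', '!', '…'}

    def cut_sentences_fixed(text):
        tmp = []
        for ch in text:
            tmp.append(ch)
            if ch in sentence_delimiters:
                yield ''.join(tmp)
                tmp = []
        if tmp:  # the fix: don't yield the empty trailing chunk
            yield ''.join(tmp)

    # before the fix: ['你好。', '再见!', '']  — note the bogus empty sentence
    # after the fix:  ['你好。', '再见!']
    print(list(cut_sentences_fixed('你好。再见!')))

For context on the unchanged scoring code: weight_map_rank is the damped PageRank-style iteration that TextRank uses, score_i = (1 - d) + d * sum_j(w[j][i] / deg(j)) with d = 0.85, where deg(j) is the row sum computed by get_degree. A hypothetical call on a made-up 3-sentence similarity matrix, assuming the package layout above:

    from jiagu.utils import weight_map_rank

    # symmetric similarity matrix for 3 sentences (numbers invented for illustration)
    graph = [[0.0, 0.8, 0.2],
             [0.8, 0.0, 0.5],
             [0.2, 0.5, 0.0]]
    scores = weight_map_rank(graph, max_iter=100, tol=0.0001)
    # the middle sentence, most similar to the other two, ends up with the highest score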
diff --git a/test/test_textrank.py b/test/test_textrank.py
index fd3dbd6..8f321bf 100644
--- a/test/test_textrank.py
+++ b/test/test_textrank.py
@@ -121,7 +121,29 @@ class TestTextRank(unittest.TestCase):
         print(summarize)
         self.assertTrue(len(summarize) == 3)
 
+    def test_cut_sentences(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+        上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+        汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+        安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+        律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+        过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        sentences = list(utils.cut_sentences(text))
+        self.assertEqual(len(sentences), 4)
+
+    def test_short_text_summarize(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+        上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+        汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+        安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+        律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+        过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        summarize = jiagu.summarize(text, 5)  # request more summary sentences than the text contains
+        print(summarize)
+        print(len(summarize))
+
 
 if __name__ == '__main__':
     unittest.main()
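Both new tests rely on re.sub and on a module-level utils reference, so the test module is assumed to already import re and jiagu.utils (the hunk shows only added lines, so the imports cannot be confirmed from the patch alone). To run the new tests locally, something like:

    python -m unittest test.test_textrank -v

One observation: test_short_text_summarize only prints its result. It exercises the path where more summary sentences are requested than the text contains, but asserts nothing, so it guards against crashes rather than against wrong output.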