
Merge pull request #53 from gitabtion/master

Fixed two minor issues in text summarization and added test cases
Yener (GitHub) committed 5 years ago · commit 114928b0ef
3 changed files with 242 additions and 218 deletions:
  1. jiagu/textrank.py (+3, -2)
  2. jiagu/utils.py (+216, -215)
  3. test/test_textrank.py (+23, -1)

jiagu/textrank.py (+3, -2)

@@ -90,8 +90,9 @@ class Summarize(object):
         if stop_words_file:
             self.__stop_words_file = stop_words_file
         if use_stopword:
-            for word in open(self.__stop_words_file, 'r', encoding='utf-8'):
-                self.__stop_words.add(word.strip())
+            with open(self.__stop_words_file, 'r', encoding='utf-8') as f:
+                for word in f:
+                    self.__stop_words.add(word.strip())
 
     def filter_dictword(self, sents):
         _sents = []
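
The change above swaps a bare open() loop for a context manager, so the stop-words file handle is closed even if an exception is raised while reading. A minimal standalone sketch of the same pattern (load_stopwords is a hypothetical helper name, not part of jiagu):

def load_stopwords(path):
    # hypothetical helper mirroring the fixed Summarize initialization;
    # the with block guarantees the file is closed on any exit path
    stop_words = set()
    with open(path, 'r', encoding='utf-8') as f:
        for word in f:
            stop_words.add(word.strip())
    return stop_words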


jiagu/utils.py (+216, -215)

@@ -1,215 +1,216 @@
 # -*- encoding:utf-8 -*-
 import os
 import jiagu
 import math
 
 
 def default_stopwords_file():
     d = os.path.dirname(os.path.realpath(__file__))
     return os.path.join(d, 'data/stopwords.txt')
 
 
 sentence_delimiters = ['。', '?', '!', '…']
 allow_speech_tags = ['an', 'i', 'j', 'l', 'n', 'nr', 'nrfg', 'ns',
                      'nt', 'nz', 't', 'v', 'vd', 'vn', 'eng']
 
 
 def as_text(v):
     """Return a unicode string."""
     if v is None:
         return None
     elif isinstance(v, bytes):
         return v.decode('utf-8', errors='ignore')
     elif isinstance(v, str):
         return v
     else:
         raise ValueError('Unknown type %r' % type(v))
 
 
 def is_text(v):
     return isinstance(v, str)
 
 
 def cut_sentences(sentence):
     tmp = []
     for ch in sentence:  # iterate over every character in the string
         tmp.append(ch)
         if ch in sentence_delimiters:
             yield ''.join(tmp)
             tmp = []
-    yield ''.join(tmp)
+    if len(tmp) > 0:  # text ending with a delimiter was already yielded inside the loop; no need to yield it again
+        yield ''.join(tmp)
 
 
 def cut_filter_words(cutted_sentences, stopwords, use_stopwords=False):
     sentences = []
     sents = []
     for sent in cutted_sentences:
         sentences.append(sent)
         if use_stopwords:
             sents.append([word for word in jiagu.seg(sent) if word and word not in stopwords])  # segment the sentence into words
         else:
             sents.append([word for word in jiagu.seg(sent) if word])
     return sentences, sents
 
 
 def psegcut_filter_words(cutted_sentences, stopwords, use_stopwords=True):
     sents = []
     sentences = []
     for sent in cutted_sentences:
         sentences.append(sent)
 
         word_list = jiagu.seg(sent)
         word_list = [word for word in word_list if len(word) > 0]
         if use_stopwords:
             word_list = [word.strip() for word in word_list if word.strip() not in stopwords]
         sents.append(word_list)
     return sentences, sents
 
 
 def weight_map_rank(weight_graph, max_iter, tol):
     # the initial score is set to 0.5
     # initialize each sentence's score and previous score
     scores = [0.5 for _ in range(len(weight_graph))]
     old_scores = [0.0 for _ in range(len(weight_graph))]
     denominator = get_degree(weight_graph)
 
     # start iterating
     count = 0
     while different(scores, old_scores, tol):
         for i in range(len(weight_graph)):
             old_scores[i] = scores[i]
         # compute the score of each sentence
         for i in range(len(weight_graph)):
             scores[i] = get_score(weight_graph, denominator, i)
         count += 1
         if count > max_iter:
             break
     return scores
 
 
 def get_degree(weight_graph):
     length = len(weight_graph)
     denominator = [0.0 for _ in range(len(weight_graph))]
     for j in range(length):
         for k in range(length):
             denominator[j] += weight_graph[j][k]
         if denominator[j] == 0:
             denominator[j] = 1.0
     return denominator
 
 
 def get_score(weight_graph, denominator, i):
     """
 
     :param weight_graph:
     :param denominator:
     :param i: int
         index of the i-th sentence
     :return: float
     """
     length = len(weight_graph)
     d = 0.85
     added_score = 0.0
 
     for j in range(length):
         # [j, i] means sentence j points to sentence i
         fraction = weight_graph[j][i] * 1.0
         # divide by the out-degree of j
         added_score += fraction / denominator[j]
     weighted_score = (1 - d) + d * added_score
     return weighted_score
 
 
 def different(scores, old_scores, tol=0.0001):
     flag = False
     for i in range(len(scores)):
         if math.fabs(scores[i] - old_scores[i]) >= tol:  # the original value was 0.0001
             flag = True
             break
     return flag
 
 
 def combine(word_list, window=2):
     if window < 2:
         window = 2
     for x in range(1, window):
         if x >= len(word_list):
             break
         word_list2 = word_list[x:]
         res = zip(word_list, word_list2)
         for r in res:
             yield r
 
 
 def sentences_similarity(s1, s2):
     """Compute the similarity between two sentences.
 
     :param s1: list
     :param s2: list
     :return: float
     """
     counter = 0
     for sent in s1:
         if sent in s2:
             counter += 1
     if counter == 0:
         return 0
     return counter / (math.log(len(s1) + len(s2)))
 
 
 # --------------------------------------------------------------------
 
 def is_chinese(uchar):
     """Check whether a character is a Chinese character."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u4e00' <= uchar <= u'\u9fa5':
         return True
     else:
         return False
 
 
 def is_number(uchar):
     """Check whether a character is a digit."""
     assert len(uchar) == 1, "uchar must be a single character"
     if u'\u0030' <= uchar <= u'\u0039':
         return True
     else:
         return False
 
 
 def is_alphabet(uchar):
     """Check whether a character is an English letter."""
     assert len(uchar) == 1, "uchar must be a single character"
     if (u'\u0041' <= uchar <= u'\u005a') or (u'\u0061' <= uchar <= u'\u007a'):
         return True
     else:
         return False
 
 
 def B2Q(uchar):
     """Convert a single character from halfwidth to fullwidth."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code < 0x0020 or inside_code > 0x7e:
         # not a halfwidth character, return it unchanged
         return uchar
     if inside_code == 0x0020:
         # for everything except the space, the conversion is: halfwidth = fullwidth - 0xfee0
         inside_code = 0x3000
     else:
         inside_code += 0xfee0
     return chr(inside_code)
 
 
 def Q2B(uchar):
     """Convert a single character from fullwidth to halfwidth."""
     assert len(uchar) == 1, "uchar must be a single character"
     inside_code = ord(uchar)
     if inside_code == 0x3000:
         inside_code = 0x0020
     else:
         inside_code -= 0xfee0
     if inside_code < 0x0020 or inside_code > 0x7e:
         # if the result is not a halfwidth character, return the original
         return uchar
     return chr(inside_code)
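
The guarded final yield above means input that ends with a delimiter no longer produces a trailing empty string. A quick check of the new behavior, assuming jiagu is installed:

from jiagu import utils

text = '第一句。第二句!'  # input ends with a sentence delimiter
print(list(utils.cut_sentences(text)))
# new behavior: ['第一句。', '第二句!']
# before the fix, the unconditional final yield also produced a trailing ''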

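For context, weight_map_rank is the PageRank-style iteration the summarizer runs over a sentence-similarity graph, and it can be exercised directly on a small hand-built matrix. An illustrative sketch, assuming jiagu is installed (the 3x3 graph below is made up for the example):

from jiagu import utils

# illustrative symmetric similarity graph over three sentences
graph = [[0.0, 1.0, 0.5],
         [1.0, 0.0, 0.2],
         [0.5, 0.2, 0.0]]
scores = utils.weight_map_rank(graph, max_iter=100, tol=0.0001)
print(scores)  # one score per sentence; higher means more central
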
test/test_textrank.py (+23, -1)

@@ -121,7 +121,29 @@ class TestTextRank(unittest.TestCase):
         print(summarize)
         self.assertTrue(len(summarize) == 3)
 
-
+    def test_cut_sentences(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        sentences = list(utils.cut_sentences(text))
+        self.assertEqual(len(sentences), 4)
+
+    def test_short_text_summarize(self):
+        text = '''江西省上饶市信州区人民法院 刑事判决书 (2016)赣1102刑初274号 公诉机关
+上饶市信州区人民检察院。 被告人曾榴仙,女,1954年11月22日出生于江西省上饶市信州区,
+汉族,文盲,无业,家住上饶市信州区,因涉嫌过失致人死亡罪,2016年4月27日被上饶市公
+安局信州区分局刑事拘留,2016年6月1日被执行逮捕。辩护人毛巧云,江西盛义律师事务所
+律师。 上饶市信州区人民检察院以饶信检公诉刑诉[2016]260号起诉书指控被告人曾榴仙犯
+过失致人死亡罪,于2016年8月22日向本院提起公诉。'''
+        text = re.sub('\\n| ', '', text)
+        summarize = jiagu.summarize(text, 5)  # requested summary length exceeds the number of sentences in the text
+        print(summarize)
+        print(len(summarize))
+
 
 if __name__ == '__main__':
     unittest.main()
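
The new cases run under the standard unittest runner; a minimal sketch, assuming it is executed from the repository root:

import unittest

# discover and run the TextRank tests added in this pull request
suite = unittest.defaultTestLoader.discover('test', pattern='test_textrank.py')
unittest.TextTestRunner(verbosity=2).run(suite)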

