import re
import os
from math import log

from jiagu.perceptron import Perceptron

# re_han matches the CJK/alphanumeric blocks handed to the segmenter;
# re_skip matches whitespace, which is passed through unchanged.
re_eng = re.compile(r'[a-zA-Z0-9]', re.U)
re_han = re.compile(r"([\u4E00-\u9FD5a-zA-Z0-9+#&._%\-]+)", re.U)
re_skip = re.compile(r"(\r\n|\s)", re.U)


class Segment:
    """Chinese word segmentation combining a frequency dictionary with a perceptron CWS model."""

    def __init__(self):
        self.vocab = {}
        self.max_word_len = 0
        self.max_freq = 0
        self.total_freq = 0
        self.initialized = False
        self.model = None

    def init(self, vocab_path='dict/jiagu.dict', user_vocab='dict/user.dict',
             model_path='model/cws.model'):
        self.load_vocab(os.path.join(os.path.dirname(__file__), vocab_path))
        self.load_vocab(os.path.join(os.path.dirname(__file__), user_vocab))
        self.model = Perceptron(os.path.join(os.path.dirname(__file__), model_path))
        self.initialized = True

    def load_vocab(self, vocab_path):
        # Each non-empty line is either "word" or "word<TAB>freq".
        with open(vocab_path, 'r', encoding='utf8') as fin:
            for line in fin:
                line = line.strip()
                if line == '':
                    continue
                word_freq_tag = line.split('\t')
                if len(word_freq_tag) == 1:
                    self.add_vocab(word_freq_tag[0])
                elif len(word_freq_tag) == 2:
                    word, freq = word_freq_tag
                    self.add_vocab(word, int(freq))

    def add_vocab(self, word=None, freq=None, tag=None):
        if freq is None:
            freq = self.max_freq

        if word not in self.vocab:
            self.vocab[word] = 0

        self.vocab[word] += freq
        self.total_freq += freq

        if freq > self.max_freq:
            self.max_freq = freq

        if len(word) > self.max_word_len:
            self.max_word_len = len(word)
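
    # Note: a call without freq (e.g. from load_userdict) falls back to the current
    # max_freq, so user-added words default to the highest frequency seen so far.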

    def del_vocab(self, word=None, freq=None, tag=None):
        if word not in self.vocab:
            return None

        vocab_freq = self.vocab[word]
        if freq is None or vocab_freq <= freq:
            del self.vocab[word]
            self.total_freq -= vocab_freq
        else:
            self.vocab[word] -= freq
            self.total_freq -= freq
        # Note: self.max_freq and self.max_word_len are not recomputed after deletion.

    def load_userdict(self, userdict):
        if not self.initialized:
            self.init()

        # A string is treated as a path to a vocab file; otherwise iterate the items.
        if isinstance(userdict, str):
            self.load_vocab(userdict)
            return

        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    self.add_vocab(item[0])
                elif len(item) == 2:
                    word, freq = item
                    self.add_vocab(word, freq)
            elif isinstance(item, str):
                self.add_vocab(word=item)
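
    # Illustrative calls (the word and path mirror the __main__ block below):
    #   seg.load_userdict('dict/user.dict')       # a word<TAB>freq file
    #   seg.load_userdict(['知识图谱'])            # plain words
    #   seg.load_userdict([['知识图谱', 10]])      # [word, freq] pairs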

    def del_userdict(self, userdict):
        if not self.initialized:
            self.init()

        for item in userdict:
            if isinstance(item, list):
                if len(item) == 1:
                    self.del_vocab(item[0])
                elif len(item) == 2:
                    word, freq = item
                    self.del_vocab(word, freq)
            elif isinstance(item, str):
                self.del_vocab(word=item)

    def calc_route(self, sentence, DAG, route):
        vocab = self.vocab
        N = len(sentence)
        route[N] = (0, 0)
        logtotal = log(self.total_freq)
        for idx in range(N - 1, -1, -1):
            route[idx] = max((log(vocab.get(sentence[idx:x + 1]) or 1) - logtotal + route[x + 1][0], x)
                             for x in DAG[idx])
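
    # calc_route fills route[idx] with (best log-probability, end index of the chosen
    # word) for the suffix starting at idx, scoring each candidate word w as
    # log(freq(w)) - log(total_freq) plus the best score of the remaining suffix;
    # unseen candidates fall back to a frequency of 1 via the `or 1` default.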

    def create_DAG(self, sentence):
        # DAG[idx] lists every end index i such that sentence[idx:i + 1] is in the
        # vocabulary; idx itself is always included, so single characters stay reachable.
        vocab = self.vocab
        max_word_len = self.max_word_len
        DAG = {}
        N = len(sentence)
        for idx in range(N):
            cand_idx = [idx]
            for i in range(idx + 1, idx + min(max_word_len, N - idx)):
                cand = sentence[idx:i + 1]
                if cand in vocab:
                    cand_idx.append(i)
            DAG[idx] = cand_idx
        return DAG
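
    # Illustrative DAG (assuming '北京' and '北京市' are in the vocabulary but '京市' is not):
    #   create_DAG('北京市') -> {0: [0, 1, 2], 1: [1], 2: [2]}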

    def cut_search(self, sentence):
        # Yield every in-vocabulary word found in the DAG (overlapping words included).
        DAG = self.create_DAG(sentence)
        old_j = -1
        for k, L in DAG.items():
            if len(L) == 1 and k > old_j:
                yield sentence[k:L[0] + 1]
                old_j = L[0]
            else:
                for j in L:
                    if j > k:
                        yield sentence[k:j + 1]
                        old_j = j

    def cut_vocab(self, sentence):
        # Pure dictionary cut along the best route, without merging ASCII runs.
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)

        x = 0
        N = len(sentence)
        while x < N:
            y = route[x][1] + 1
            yield sentence[x:y]
            x = y

    def cut_words(self, sentence):
        DAG = self.create_DAG(sentence)
        route = {}
        self.calc_route(sentence, DAG, route)
        x = 0
        N = len(sentence)
        buf = ''
        while x < N:
            y = route[x][1] + 1
            l_word = sentence[x:y]
            if re_eng.match(l_word) and len(l_word) == 1:
                # Buffer consecutive single ASCII letters/digits into one token.
                buf += l_word
                x = y
            else:
                if buf:
                    yield buf
                    buf = ''
                yield l_word
                x = y
        if buf:
            yield buf
            buf = ''
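
    # e.g. for '紫宣路158号' the digits come back from the DAG one character at a time
    # (assuming no multi-digit entry is in the vocabulary), but they are buffered and
    # yielded as the single token '158' before '号' is emitted.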

    def model_cut(self, sentence):
        if sentence == '':
            return ['']

        sentence = list(sentence)
        labels = self.model.predict(sentence)
        return self.__lab2word(sentence, labels)

    def __lab2word(self, sentence, labels):
        # Rebuild words from per-character sequence labels (B: begin, M: middle, E: end).
        sen_len = len(sentence)
        tmp_word = ""
        words = []
        for i in range(sen_len):
            label = labels[i]
            w = sentence[i]
            if label == "B" or label == "M":
                tmp_word += w
            elif label == "E":
                tmp_word += w
                words.append(tmp_word)
                tmp_word = ""
            else:
                # Any other label (e.g. S) stands for a single-character word.
                if tmp_word != '':
                    words.append(tmp_word)
                    tmp_word = ""
                words.append(w)
        if tmp_word:
            words.append(tmp_word)
        return words
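
    # e.g. the characters ['北', '京', '市'] labelled ['B', 'M', 'E'] are rebuilt into
    # the single word '北京市', while an 'S' label yields a one-character word
    # (assuming the CWS model emits BMES-style labels).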

    def seg_default(self, sentence):
        # Segment the CJK/alphanumeric blocks; pass whitespace through and split
        # anything else into single characters.
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                for word in cut_block(block):
                    yield word
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    else:
                        for xx in x:
                            yield xx

    def seg_new_word(self, sentence):
        blocks = re_han.split(sentence)
        cut_block = self.cut_words
        for block in blocks:
            if not block:
                continue
            if re_han.match(block):
                words1 = list(cut_block(block))   # dictionary-based segmentation
                words2 = self.model_cut(block)    # perceptron-based segmentation

                # Candidate new words: n-grams of adjacent dictionary tokens that the
                # model produced as single words. Conflicting candidates and candidates
                # longer than 4 characters are skipped; all are removed again afterwards.
                # (An alternative would be to take any 2-3 character word from words2
                # that is absent from words1.)
                new_word = []
                length = len(words1)
                for n in range(3):
                    can_limit = length - n + 1
                    for i in range(0, can_limit):
                        ngram = ''.join(words1[i:i + n])
                        word_len = len(ngram)
                        if word_len > 4 or word_len == 1:
                            continue
                        if ngram in words2 and ngram not in words1:
                            new_word.append([ngram, 1])

                self.load_userdict(new_word)
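
                # Note: with n in range(3), only the concatenation of two adjacent
                # dictionary tokens can become a candidate; each one is registered
                # with a frequency of 1 for the duration of this block only.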

                for word in cut_block(block):
                    yield word

                # Remove the temporary entries again.
                self.del_userdict(new_word)
            else:
                tmp = re_skip.split(block)
                for x in tmp:
                    if re_skip.match(x):
                        yield x
                    else:
                        for xx in x:
                            yield xx

    def seg(self, sentence, mode="default"):
        if not self.initialized:
            self.init()

        if mode == 'probe':
            return list(self.seg_new_word(sentence))
        else:
            return list(self.seg_default(sentence))
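
    # Typical usage (mirrors the __main__ block below):
    #   s = Segment()
    #   print(s.seg(text))             # default dictionary-based segmentation
    #   print(s.seg(text, 'probe'))    # probe mode: also detect new words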


if __name__ == '__main__':
    s = Segment()

    # s.load_userdict('dict/user.dict')
    # s.load_userdict(['知识图谱'])

    # text = '辽宁省铁岭市西丰县房木镇潭清村东屯'  # bug
    # text = '黑龙江省双鸭山市宝清县宝清镇通达街341号'
    # text = '浙江省杭州市西湖区三墩镇紫宣路158号1幢801室'
    # text = '北京市西城区茶马街8号院1号楼15层1502'
    # text = '西藏自治区林芝市米林县羌纳乡羌渡岗村'
    # text = '深圳市南山区西丽街道松坪山社区宝深路科陆大厦B座13层B05'
    # text = '深圳市福田区福强路中港城裙楼6E部分602-A'  # bug
    # text = '深圳市福田区福保街道石厦北二街89号新港商城C座3305室'
    # text = '五常市向阳镇致富村庆丰营屯'
    # text = '中牟县中兴路与益民巷交叉口路南'
    # text = '黄山市屯溪区华馨路38号二楼'
    text = '银川市金凤区北京中路福宁城11-1-号'

    # Probe mode adds newly discovered words to the dictionary on the fly,
    # skips conflicting candidates, and removes the temporary entries afterwards.

    # words = s.seg(text)
    # print(words)

    words = s.seg(text, 'probe')
    print('----------------')
    print(words)