You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

perceptron.py 6.9 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229
  1. # -*- coding:utf-8 -*-
  2. import os
  3. import gzip
  4. import pickle
  5. import random
  6. from collections import defaultdict
  7. '''
  8. http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
  9. '''
  10. class AveragedPerceptron(object):
  11. def __init__(self):
  12. # Each feature gets its own weight vector, so weights is a dict-of-dicts
  13. self.weights = {}
  14. self.classes = set()
  15. # The accumulated values, for the averaging. These will be keyed by
  16. # feature/clas tuples
  17. self._totals = defaultdict(int)
  18. # The last time the feature was changed, for the averaging. Also
  19. # keyed by feature/clas tuples
  20. # (tstamps is short for timestamps)
  21. self._tstamps = defaultdict(int)
  22. # Number of instances seen
  23. self.i = 0
  24. def predict(self, features):
  25. '''Dot-product the features and current weights and return the best label.'''
  26. scores = defaultdict(float)
  27. for feat, value in features.items():
  28. if feat not in self.weights or value == 0:
  29. continue
  30. weights = self.weights[feat]
  31. for label, weight in weights.items():
  32. scores[label] += value * weight
  33. # Do a secondary alphabetic sort, for stability
  34. return max(self.classes, key=lambda label: (scores[label], label))
  35. def update(self, truth, guess, features):
  36. '''Update the feature weights.'''
  37. def upd_feat(c, f, w, v):
  38. param = (f, c)
  39. self._totals[param] += (self.i - self._tstamps[param]) * w
  40. self._tstamps[param] = self.i
  41. self.weights[f][c] = w + v
  42. self.i += 1
  43. if truth == guess:
  44. return None
  45. for f in features:
  46. weights = self.weights.setdefault(f, {})
  47. upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
  48. upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
  49. return None
  50. def average_weights(self):
  51. '''Average weights from all iterations.'''
  52. for feat, weights in self.weights.items():
  53. new_feat_weights = {}
  54. for clas, weight in weights.items():
  55. param = (feat, clas)
  56. total = self._totals[param]
  57. total += (self.i - self._tstamps[param]) * weight
  58. averaged = round(total / float(self.i), 3)
  59. if averaged:
  60. new_feat_weights[clas] = averaged
  61. self.weights[feat] = new_feat_weights
  62. return None
  63. class Perceptron:
  64. def __init__(self, loc=None):
  65. self.START = ['-START-', '-START2-']
  66. self.END = ['-END-', '-END2-']
  67. self.model = AveragedPerceptron()
  68. if loc != None:
  69. self.load(loc)
  70. def predict(self, words):
  71. prev, prev2 = self.START
  72. labels = []
  73. context = self.START + words + self.END
  74. for i, word in enumerate(words):
  75. features = self._get_features(i, word, context, prev, prev2)
  76. tag = self.model.predict(features)
  77. labels.append(tag)
  78. prev2 = prev
  79. prev = tag
  80. return labels
  81. def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
  82. self._make_tagdict(sentences)
  83. for iter_ in range(nr_iter):
  84. c = 0
  85. n = 0
  86. for words, tags in sentences:
  87. prev, prev2 = self.START
  88. context = self.START + words + self.END
  89. for i, word in enumerate(words):
  90. feats = self._get_features(i, word, context, prev, prev2)
  91. guess = self.model.predict(feats)
  92. self.model.update(tags[i], guess, feats)
  93. prev2 = prev
  94. prev = guess
  95. c += guess == tags[i]
  96. n += 1
  97. if shuf == True:
  98. random.shuffle(sentences)
  99. print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
  100. self.save(save_loc)
  101. self.model.average_weights()
  102. self.save(save_loc)
  103. def save(self, loc='model/ap.model', zip=True):
  104. if zip == False:
  105. pickle.dump((self.model.weights, self.model.classes), open(loc, 'wb'))
  106. else:
  107. pickle.dump((self.model.weights, self.model.classes), gzip.open(loc, 'wb'))
  108. def load(self, loc='model/ap.model', zip=True):
  109. if zip == False:
  110. self.model.weights, self.model.classes = pickle.load(open(loc, 'rb'))
  111. else:
  112. self.model.weights, self.model.classes = pickle.load(gzip.open(loc,'rb'))
  113. def _get_features(self, i, word, context, prev, prev2):
  114. '''Map tokens into a feature representation, implemented as a
  115. {hashable: float} dict. If the features change, a new model must be
  116. trained.
  117. '''
  118. def add(name, *args):
  119. features[' '.join((name,) + tuple(args))] += 1
  120. i += len(self.START)
  121. features = defaultdict(int)
  122. # It's useful to have a constant feature, which acts sort of like a prior
  123. add('bias')
  124. add('i suffix', word[-3:])
  125. add('i pref1', word[0])
  126. add('i-1 tag', prev)
  127. add('i-2 tag', prev2)
  128. add('i tag+i-2 tag', prev, prev2)
  129. add('i word', context[i])
  130. add('i-1 tag+i word', prev, context[i])
  131. add('i-1 word', context[i - 1])
  132. add('i-1 suffix', context[i - 1][-3:])
  133. add('i-2 word', context[i - 2])
  134. add('i+1 word', context[i + 1])
  135. add('i+1 suffix', context[i + 1][-3:])
  136. add('i+2 word', context[i + 2])
  137. return features
  138. def _make_tagdict(self, sentences):
  139. '''Make a tag dictionary for single-tag words.'''
  140. for words, tags in sentences:
  141. for word, tag in zip(words, tags):
  142. self.model.classes.add(tag)
  143. def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
  144. tagger = Perceptron()
  145. print('Reading corpus...')
  146. training_data = []
  147. sentence = ([], [])
  148. fin = open(filepath, 'r', encoding='utf8')
  149. for index, line in enumerate(fin):
  150. line = line.strip()
  151. if line == '':
  152. training_data.append(sentence)
  153. sentence = ([], [])
  154. else:
  155. params = line.split()
  156. if len(params) != 2: continue
  157. sentence[0].append(params[0])
  158. sentence[1].append(params[1])
  159. fin.close()
  160. print('training corpus size : %d', len(training_data))
  161. print('Start training...')
  162. tagger.train(training_data, save_loc=model, nr_iter=nr_iter)
  163. def eval(filepath='data/test.txt', model='model/ap.model'):
  164. tagger = Perceptron(model)
  165. print('Start testing...')
  166. right = 0.0
  167. total = 0.0
  168. sentence = ([], [])
  169. fin = open(filepath, 'r', encoding='utf8')
  170. for index, line in enumerate(fin):
  171. line = line.strip()
  172. if line == '':
  173. words = sentence[0]
  174. tags = sentence[1]
  175. outputs = tagger.predict(words)
  176. assert len(tags) == len(outputs)
  177. total += len(tags)
  178. for o, t in zip(outputs, tags):
  179. if o == t: right += 1
  180. sentence = ([], [])
  181. else:
  182. params = line.split()
  183. if len(params) != 2: continue
  184. sentence[0].append(params[0])
  185. sentence[1].append(params[1])
  186. fin.close()
  187. print("Precision : %f", right / total)
  188. def predict(model='model/ap.model'):
  189. tagger = Perceptron(model)
  190. while True:
  191. text = input('>')
  192. words = list(text)
  193. labels = tagger.predict(words)
  194. for word, label in zip(words, labels):
  195. print(word, label)
  196. if __name__ == '__main__':
  197. train()
  198. eval()
  199. # predict()

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。