
perceptron.py

# -*- coding:utf-8 -*-
import os
import gzip
import pickle
import random
from collections import defaultdict


class AveragedPerceptron(object):

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # (feature, class) tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by (feature, class) tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            # The weight has held its current value for (self.i - tstamp)
            # updates; account for that whole span in one step before changing it
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                # Flush the span since this weight last changed
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
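
# Note on the averaging trick above (explanatory comment, not part of the
# original file): _totals is kept up to date lazily. A weight that held the
# value w from update t1 until update t2 contributes (t2 - t1) * w, added in
# a single step either when the weight next changes (upd_feat) or at the end
# (average_weights). Dividing by self.i, the total number of update() calls,
# yields the averaged weight, the standard averaged-perceptron device for
# reducing sensitivity to the last few training examples.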


class Perceptron:

    def __init__(self, loc=None):
        self.START = ['-START-', '-START2-']
        self.END = ['-END-', '-END2-']
        self.model = AveragedPerceptron()
        if loc is not None:
            self.load(loc)

    def predict(self, words):
        prev, prev2 = self.START
        labels = []
        context = self.START + words + self.END
        for i, word in enumerate(words):
            features = self._get_features(i, word, context, prev, prev2)
            tag = self.model.predict(features)
            labels.append(tag)
            prev2 = prev
            prev = tag
        return labels

    def train(self, sentences, save_loc=None, nr_iter=5, shuf=False):
        self._make_tagdict(sentences)
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + words + self.END
                for i, word in enumerate(words):
                    feats = self._get_features(i, word, context, prev, prev2)
                    guess = self.model.predict(feats)
                    self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            if shuf:
                random.shuffle(sentences)
            print("Iter {0}: {1}/{2}={3}".format(iter_, c, n, (float(c) / n) * 100))
            if save_loc is not None:
                # Checkpoint the raw weights after each iteration
                self.save(save_loc)
        self.model.average_weights()
        if save_loc is not None:
            self.save(save_loc)

    def save(self, loc='model/ap.model', compress=True):
        # Create the target directory if it does not exist yet
        folder = os.path.dirname(loc)
        if folder:
            os.makedirs(folder, exist_ok=True)
        opener = gzip.open if compress else open
        with opener(loc, 'wb') as f:
            pickle.dump((self.model.weights, self.model.classes), f)

    def load(self, loc='model/ap.model', compress=True):
        opener = gzip.open if compress else open
        with opener(loc, 'rb') as f:
            self.model.weights, self.model.classes = pickle.load(f)

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: int} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        # Offset i so it indexes into context, which is padded with START
        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
        '''Collect the set of output classes seen in the training data.'''
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                self.model.classes.add(tag)


def train(filepath='data/train.txt', model='model/ap.model', nr_iter=1):
    tagger = Perceptron()
    print('Reading corpus...')
    training_data = []
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            if sentence[0]:
                training_data.append(sentence)
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    if sentence[0]:
        # Keep the last sentence even if the file lacks a trailing blank line
        training_data.append(sentence)
    print('training corpus size : %d' % len(training_data))
    print('Start training...')
    tagger.train(training_data, save_loc=model, nr_iter=nr_iter)


def eval(filepath='data/test.txt', model='model/ap.model'):
    tagger = Perceptron(model)
    print('Start testing...')
    right = 0.0
    total = 0.0
    sentence = ([], [])
    fin = open(filepath, 'r', encoding='utf8')
    for line in fin:
        line = line.strip()
        if line == '':
            words = sentence[0]
            tags = sentence[1]
            outputs = tagger.predict(words)
            assert len(tags) == len(outputs)
            total += len(tags)
            for o, t in zip(outputs, tags):
                if o == t:
                    right += 1
            sentence = ([], [])
        else:
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
    fin.close()
    # Per-token tagging accuracy over the test corpus
    print("Accuracy : %f" % (right / total))


def predict(model='model/ap.model'):
    tagger = Perceptron(model)
    while True:
        text = input('>')
        words = list(text)
        labels = tagger.predict(words)
        for word, label in zip(words, labels):
            print(word, label)


if __name__ == '__main__':
    train()
    eval()
    # predict()
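
For reference, train() and eval() expect a plain-text corpus with one token and its tag per line, separated by whitespace, and a blank line between sentences; that is what the parsing loops above implement. Below is a minimal sketch of preparing such a file. The character-level B/E/S-style tags are hypothetical illustrations, not Jiagu's actual tag set, and the paths simply match the defaults above.

# -*- coding:utf-8 -*-
import os

# One "token tag" pair per line, blank line terminates the sentence.
# The segmentation-style tags below are hypothetical examples.
sample = '我 S\n喜 B\n欢 E\n北 B\n京 E\n\n'

os.makedirs('data', exist_ok=True)
with open('data/train.txt', 'w', encoding='utf8') as f:
    f.write(sample)

# train() will then read data/train.txt and save to model/ap.model:
# train(filepath='data/train.txt', model='model/ap.model', nr_iter=5)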

Jiagu is trained on large-scale corpora. It provides common natural language processing capabilities such as Chinese word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new-word discovery, and text clustering. It was built with reference to the strengths and weaknesses of the major existing tools, and Jiagu is offered back to the community.
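
A minimal sketch of programmatic use, assuming a model has already been trained and saved to model/ap.model by train() above; the interactive predict() loop in the script does the same thing at a prompt:

tagger = Perceptron('model/ap.model')   # __init__ calls load() when given a path
chars = list('我喜欢北京')              # the tagger labels one character at a time
for ch, tag in zip(chars, tagger.predict(chars)):
    print(ch, tag)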