You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

bayes.py 2.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import sys
  4. import gzip
  5. import marshal
  6. from math import log, exp
  7. class BaseProb(object):
  8. def __init__(self):
  9. self.d = {}
  10. self.total = 0.0
  11. self.none = 0
  12. def exists(self, key):
  13. return key in self.d
  14. def getsum(self):
  15. return self.total
  16. def get(self, key):
  17. if not self.exists(key):
  18. return False, self.none
  19. return True, self.d[key]
  20. def freq(self, key):
  21. return float(self.get(key)[1])/self.total
  22. class AddOneProb(BaseProb):
  23. def __init__(self):
  24. self.d = {}
  25. self.total = 0.0
  26. self.none = 1
  27. def add(self, key, value):
  28. self.total += value
  29. if not self.exists(key):
  30. self.d[key] = 1
  31. self.total += 1
  32. self.d[key] += value
  33. class Bayes(object):
  34. def __init__(self):
  35. self.d = {}
  36. self.total = 0
  37. def save(self, fname, iszip=True):
  38. d = {}
  39. d['total'] = self.total
  40. d['d'] = {}
  41. for k, v in self.d.items():
  42. d['d'][k] = v.__dict__
  43. if not iszip:
  44. marshal.dump(d, open(fname, 'wb'))
  45. else:
  46. f = gzip.open(fname, 'wb')
  47. f.write(marshal.dumps(d))
  48. f.close()
  49. def load(self, fname, iszip=True):
  50. if not iszip:
  51. d = marshal.load(open(fname, 'rb'))
  52. else:
  53. try:
  54. f = gzip.open(fname, 'rb')
  55. d = marshal.loads(f.read())
  56. except IOError:
  57. f = open(fname, 'rb')
  58. d = marshal.loads(f.read())
  59. f.close()
  60. self.total = d['total']
  61. self.d = {}
  62. for k, v in d['d'].items():
  63. self.d[k] = AddOneProb()
  64. self.d[k].__dict__ = v
  65. def train(self, data):
  66. for d in data:
  67. c = d[0]
  68. if c not in self.d:
  69. self.d[c] = AddOneProb()
  70. for word in d[1]:
  71. self.d[c].add(word, 1)
  72. self.total = sum(map(lambda x: self.d[x].getsum(), self.d.keys()))
  73. def classify(self, x):
  74. if self.d == {}:
  75. self.load(os.path.join(os.path.dirname(__file__), 'model/sentiment.model'))
  76. tmp = {}
  77. for k in self.d:
  78. tmp[k] = log(self.d[k].getsum()) - log(self.total)
  79. for word in x:
  80. tmp[k] += log(self.d[k].freq(word))
  81. ret, prob = 0, 0
  82. for k in self.d:
  83. now = 0
  84. try:
  85. for otherk in self.d:
  86. now += exp(tmp[otherk]-tmp[k])
  87. now = 1/now
  88. except OverflowError:
  89. now = 0
  90. if now > prob:
  91. ret, prob = k, now
  92. return (ret, prob)
  93. if __name__=='__main__':
  94. classifier = Bayes()
  95. # 预测
  96. classifier.load('model/1.model')
  97. import jiagu
  98. words = jiagu.seg('今天真的开心')
  99. ret, prob = classifier.classify(words)
  100. print(ret, prob)

Jiagu使用大规模语料训练而成。将提供中文分词、词性标注、命名实体识别、情感分析、知识图谱关系抽取、关键词抽取、文本摘要、新词发现、文本聚类等常用自然语言处理功能。参考了各大工具优缺点制作,将Jiagu回馈给大家。