# -*- coding: utf-8 -*-
# Reconstructed from a whitespace-collapsed git diff that added three files:
#   * jiagu/normal/README.md        - short note for the text-normalisation
#                                     package (Chinese-to-pinyin conversion,
#                                     full-width/half-width handling, etc.)
#   * jiagu/sentiment/bayes.py      - this module
#   * jiagu/sentiment/model/1.model - binary model file (contents not shown)
"""Small multi-class Bayes text classifier with add-one style smoothing.

A model is a mapping from class label to an ``AddOneProb`` word-count table.
Models are persisted with ``marshal`` (optionally gzip-compressed).
"""
import sys
import gzip
import marshal
from math import log, exp


class BaseProb(object):
    """Count table mapping keys to counts, plus a running total.

    Attributes are stored in the instance ``__dict__`` on purpose:
    ``Bayes.save``/``Bayes.load`` serialise and restore that dict directly.
    """

    def __init__(self):
        self.d = {}        # key -> accumulated count
        self.total = 0.0   # sum used as the denominator in ``freq``
        self.none = 0      # count reported for keys that were never added

    def exists(self, key):
        """Return True if ``key`` has been recorded."""
        return key in self.d

    def getsum(self):
        """Return the running total of all counts."""
        return self.total

    def get(self, key):
        """Return ``(found, count)``; ``count`` is ``self.none`` when missing."""
        if not self.exists(key):
            return False, self.none
        return True, self.d[key]

    def freq(self, key):
        """Return the relative frequency of ``key``.

        Raises:
            ZeroDivisionError: if nothing has been added yet (total is 0).
        """
        return float(self.get(key)[1]) / self.total


class AddOneProb(BaseProb):
    """Count table where every key, seen or unseen, carries one extra count."""

    def __init__(self):
        super(AddOneProb, self).__init__()
        # Unseen keys are reported with a count of 1 instead of 0 so that
        # ``freq`` never returns 0 (whose log would be undefined).
        self.none = 1

    def add(self, key, value):
        """Add ``value`` occurrences of ``key``.

        The first time a key is seen it is seeded with a count of 1 and the
        total is bumped by the same amount, matching the pseudo-count that
        ``get`` reports for unseen keys.
        """
        self.total += value
        if not self.exists(key):
            self.d[key] = 1
            self.total += 1
        self.d[key] += value


class Bayes(object):
    """Classifier holding one ``AddOneProb`` table per class label."""

    def __init__(self):
        self.d = {}      # class label -> AddOneProb
        self.total = 0   # sum of every class table's total

    def save(self, fname, iszip=True):
        """Write the model to ``fname``.

        Args:
            fname: destination path.
            iszip: when true (default) the marshal payload is gzip-compressed.
        """
        payload = {
            'total': self.total,
            'd': dict((label, table.__dict__)
                      for label, table in self.d.items()),
        }
        # Context managers guarantee the handle is flushed and closed even if
        # serialisation fails part-way through.
        if not iszip:
            with open(fname, 'wb') as f:
                marshal.dump(payload, f)
        else:
            with gzip.open(fname, 'wb') as f:
                f.write(marshal.dumps(payload))

    def load(self, fname, iszip=True):
        """Replace the current model with the one stored in ``fname``.

        Args:
            fname: path of a file written by ``save``.
            iszip: when true (default) the file is read as gzip first and, if
                that fails with an I/O error, re-read as a plain file.

        NOTE(security): ``marshal`` is not safe against malformed or malicious
        data; only load model files from a trusted source.
        """
        if not iszip:
            with open(fname, 'rb') as f:
                raw = f.read()
        else:
            try:
                with gzip.open(fname, 'rb') as f:
                    raw = f.read()
            except IOError:
                # Not a gzip stream: fall back to reading it uncompressed.
                with open(fname, 'rb') as f:
                    raw = f.read()
        payload = marshal.loads(raw)

        self.total = payload['total']
        self.d = {}
        for label, state in payload['d'].items():
            table = AddOneProb()
            table.__dict__ = state
            self.d[label] = table

    def train(self, data):
        """Accumulate word counts from labelled samples.

        Args:
            data: iterable of ``(label, words)`` pairs (anything indexable
                with ``[0]`` and ``[1]``); each word is counted once per
                occurrence under its label.
        """
        for sample in data:
            label = sample[0]
            if label not in self.d:
                self.d[label] = AddOneProb()
            table = self.d[label]
            for word in sample[1]:
                table.add(word, 1)
        self.total = sum(table.getsum() for table in self.d.values())

    def classify(self, x):
        """Return ``(label, probability)`` of the most likely class for ``x``.

        Args:
            x: iterable of words.

        Returns:
            The best label and its normalised probability, or ``(0, 0)`` when
            the model has no classes.
        """
        # Materialise once: the words are walked once per class, so a
        # generator would otherwise be empty for every class after the first.
        words = list(x)

        # Log-score per class: log prior plus the log frequency of each word.
        scores = {}
        for label, table in self.d.items():
            score = log(table.getsum()) - log(self.total)
            for word in words:
                score += log(table.freq(word))
            scores[label] = score

        best_label, best_prob = 0, 0
        for label in scores:
            # P(label) = 1 / sum_k exp(score_k - score_label); working with
            # differences keeps the exponentials in range for the winner.
            try:
                denom = 0
                for other in scores:
                    denom += exp(scores[other] - scores[label])
                prob = 1 / denom
            except OverflowError:
                # Some other class is overwhelmingly more likely.
                prob = 0
            if prob > best_prob:
                best_label, best_prob = label, prob
        return (best_label, best_prob)


if __name__ == '__main__':
    classifier = Bayes()

    # Predict
    classifier.load('model/1.model')

    import jiagu

    words = jiagu.seg('今天真的开心')

    ret, prob = classifier.classify(words)
    print(ret, prob)