add tfidf features

master
zengbin93 5 years ago
commit 5bc3a1f5eb
5 changed files with 57 additions and 10 deletions
  1. +0  -1  jiagu/__init__.py
  2. +3  -1  jiagu/cluster/__init__.py
  3. +42 -3  jiagu/cluster/base.py
  4. +10 -3  jiagu/cluster/text.py
  5. +2  -2  test/test_cluster.py

+0 -1  jiagu/__init__.py

@@ -9,7 +9,6 @@
 * Description :
 """
 from jiagu import analyze
-from jiagu.cluster.text import text_cluster
 
 any = analyze.Analyze()


+3 -1  jiagu/cluster/__init__.py

@@ -2,4 +2,6 @@
 
 from .kmeans import KMeans
 from .dbscan import DBSCAN
-from .base import count_features
+from .base import count_features, tfidf_features
+from .text import text_cluster
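With these re-exports, the new featurizer and the clustering entry point can be imported straight from the subpackage; a one-line sketch, assuming jiagu is installed:

    from jiagu.cluster import count_features, tfidf_features, text_cluster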


+42 -3  jiagu/cluster/base.py

@@ -25,12 +25,51 @@ def count_features(corpus, tokenizer=jiagu.cut):
     >>> X, names = count_features(corpus)
     """
     tokens = [tokenizer(x) for x in corpus]
-    feature_names = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
 
     features = []
     for sent in tokens:
         counter = Counter(sent)
-        feature = [counter.get(x, 0) for x in feature_names]
+        feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)
 
-    return np.array(features), feature_names
+    return np.array(features), vocab
 
 
+def tfidf_features(corpus, tokenizer=jiagu.cut):
+    """TF-IDF features of a text corpus.
+
+    :param corpus: list of str
+    :param tokenizer: tokenization function, defaults to `jiagu.cut`
+    :return:
+        features: np.array
+        names: list of str
+
+    example:
+    >>> import jiagu
+    >>> from jiagu.cluster.base import tfidf_features
+    >>> corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。", "一些基于自然语言处理的预处理过程也会在本文中出现。"]
+    >>> X, names = tfidf_features(corpus, tokenizer=jiagu.cut)
+    """
+    tokens = [tokenizer(x) for x in corpus]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+
+    idf_dict = dict()
+    total_doc = len(corpus)
+    for word in vocab:
+        num = sum([1 if (word in s) else 0 for s in corpus])
+        if num == total_doc:
+            idf = np.log(total_doc / num)
+        else:
+            idf = np.log(total_doc / (num + 1))
+        idf_dict[word] = idf
+
+    features = []
+    for sent in tokens:
+        counter = Counter(sent)
+        feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
+        features.append(feature)
+
+    return np.array(features), vocab
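For reference, a minimal self-contained sketch of the TF-IDF computation added above, with str.split standing in for jiagu.cut so it runs without jiagu installed; the toy English corpus is illustrative only, not from the commit:

    from collections import Counter

    import numpy as np


    def tfidf_sketch(corpus, tokenizer=str.split):
        # Vocabulary ordered by global term frequency, as in count_features.
        tokens = [tokenizer(doc) for doc in corpus]
        vocab = [w for w, _ in Counter(w for s in tokens for w in s).most_common()]

        total_doc = len(corpus)
        idf = {}
        for word in vocab:
            # Document frequency. Note the commit tests substring membership on
            # the raw strings, not membership in the token lists; mirrored here.
            num = sum(1 for doc in corpus if word in doc)
            # A word present in every document gives log(1) = 0; the +1 smoothing
            # applies only otherwise, so every idf value stays >= 0.
            if num == total_doc:
                idf[word] = np.log(total_doc / num)
            else:
                idf[word] = np.log(total_doc / (num + 1))

        features = []
        for sent in tokens:
            counter = Counter(sent)
            # Term frequency normalized by sentence length, scaled by idf.
            features.append([counter.get(w, 0) / len(sent) * idf[w] for w in vocab])
        return np.array(features), vocab


    X, names = tfidf_sketch(["a b c", "a b", "a d"])
    print(names)    # ['a', 'b', 'c', 'd']
    print(X.shape)  # (3, 4)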



+10 -3  jiagu/cluster/text.py

@@ -1,14 +1,16 @@
 # coding: utf-8
 from collections import OrderedDict
 
-from .base import count_features
+from .base import count_features, tfidf_features
 from .dbscan import DBSCAN
 from .kmeans import KMeans
 
 
-def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
+def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
     """Text clustering; K-Means and DBSCAN are currently supported.
 
+    :param features_method: str
+        feature-extraction method; 'tfidf' and 'count' are currently supported.
     :param docs: list of str
         the input list of texts, e.g. ['k-means', 'dbscan']
     :param method: str
@@ -24,7 +26,12 @@ def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts
     :return: OrderedDict
         clustering result
     """
-    features, names = count_features(docs)
+    if features_method == 'tfidf':
+        features, names = tfidf_features(docs)
+    elif features_method == 'count':
+        features, names = count_features(docs)
+    else:
+        raise ValueError('features_method error')
 
     # feature to doc
     f2d = {k: v.tolist() for k, v in zip(docs, features)}
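Callers now choose the featurization explicitly. A usage sketch, assuming jiagu is installed and reusing the corpus from the tfidf_features docstring above:

    from jiagu.cluster.text import text_cluster

    docs = ["判断unicode是否是汉字。", "全角符号转半角符号。", "一些基于自然语言处理的预处理过程也会在本文中出现。"]

    # TF-IDF features (the new default) with K-Means.
    clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=2, max_iter=100)

    # Raw count features with DBSCAN.
    clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)

    # Any other features_method value raises ValueError('features_method error').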


+2 -2  test/test_cluster.py

@@ -102,14 +102,14 @@ class TestCluster(unittest.TestCase):
         print("=" * 68, '\n')
         print("text_cluster_by_kmeans ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='k-means', k=3, max_iter=100)
+        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
         self.assertTrue(len(clusters) == 3)
 
     def test_c_text_cluster_by_dbscan(self):
         print("=" * 68, '\n')
         print("text_cluster_by_dbscan ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='dbscan', eps=5, min_pts=1)
+        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
         self.assertTrue(len(clusters) == 3)



