diff --git a/jiagu/__init__.py b/jiagu/__init__.py
index 84e336e..826a4d2 100644
--- a/jiagu/__init__.py
+++ b/jiagu/__init__.py
@@ -9,7 +9,6 @@
 * Description :
 """
 
 from jiagu import analyze
-from jiagu.cluster.text import text_cluster
 
 any = analyze.Analyze()
diff --git a/jiagu/cluster/__init__.py b/jiagu/cluster/__init__.py
index 94e7efc..f81b35e 100644
--- a/jiagu/cluster/__init__.py
+++ b/jiagu/cluster/__init__.py
@@ -2,4 +2,6 @@
 from .kmeans import KMeans
 from .dbscan import DBSCAN
-from .base import count_features
+from .base import count_features, tfidf_features
+from .text import text_cluster
+
diff --git a/jiagu/cluster/base.py b/jiagu/cluster/base.py
index 3763026..f668518 100644
--- a/jiagu/cluster/base.py
+++ b/jiagu/cluster/base.py
@@ -25,12 +25,51 @@ def count_features(corpus, tokenizer=jiagu.cut):
     >>> X, names = count_features(corpus)
     """
     tokens = [tokenizer(x) for x in corpus]
-    feature_names = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
 
     features = []
     for sent in tokens:
         counter = Counter(sent)
-        feature = [counter.get(x, 0) for x in feature_names]
+        feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)
 
-    return np.array(features), feature_names
+    return np.array(features), vocab
+
+
+def tfidf_features(corpus, tokenizer=jiagu.cut):
+    """TF-IDF features for a list of texts.
+
+    :param corpus: list of str
+    :param tokenizer: tokenizer function, defaults to `jiagu.cut`
+    :return:
+        features: np.array
+        names: list of str
+
+    example:
+    >>> import jiagu
+    >>> from jiagu.cluster.base import tfidf_features
+    >>> corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。", "一些基于自然语言处理的预处理过程也会在本文中出现。"]
+    >>> X, names = tfidf_features(corpus, tokenizer=jiagu.cut)
+    """
+    tokens = [tokenizer(x) for x in corpus]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+
+    idf_dict = dict()
+    total_doc = len(corpus)
+    for word in vocab:
+        # Document frequency: count token matches rather than raw substring hits,
+        # so e.g. "处理" does not also count documents that only contain "自然语言处理".
+        num = sum([1 if (word in s) else 0 for s in tokens])
+        if num == total_doc:
+            idf = np.log(total_doc / num)
+        else:
+            idf = np.log(total_doc / (num + 1))
+        idf_dict[word] = idf
+
+    features = []
+    for sent in tokens:
+        counter = Counter(sent)
+        feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
+        features.append(feature)
+
+    return np.array(features), vocab
diff --git a/jiagu/cluster/text.py b/jiagu/cluster/text.py
index 60ea935..1725b85 100644
--- a/jiagu/cluster/text.py
+++ b/jiagu/cluster/text.py
@@ -1,14 +1,16 @@
 # coding: utf-8
 from collections import OrderedDict
 
-from .base import count_features
+from .base import count_features, tfidf_features
 from .dbscan import DBSCAN
 from .kmeans import KMeans
 
 
-def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
+def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
     """Text clustering; currently supports K-Means and DBSCAN.
 
+    :param features_method: str
+        Feature extraction method; 'tfidf' and 'count' are supported.
     :param docs: list of str
         List of input texts, e.g. ['k-means', 'dbscan']
     :param method: str
@@ -24,7 +26,12 @@
     :return: OrderedDict
         Clustering result.
     """
-    features, names = count_features(docs)
+    if features_method == 'tfidf':
+        features, names = tfidf_features(docs)
+    elif features_method == 'count':
+        features, names = count_features(docs)
+    else:
+        raise ValueError("features_method must be 'tfidf' or 'count'")
 
     # feature to doc
     f2d = {k: v.tolist() for k, v in zip(docs, features)}
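For reviewers, a minimal usage sketch of the new features_method switch (not part of the patch; the three-sentence corpus is made up, and it assumes the patched jiagu package above is importable):

    from jiagu.cluster import count_features, text_cluster, tfidf_features

    # Hypothetical corpus; any list of Chinese sentences works.
    docs = [
        "判断unicode是否是汉字。",
        "全角符号转半角符号。",
        "一些基于自然语言处理的预处理过程也会在本文中出现。",
    ]

    # Both featurizers return an (n_docs, n_vocab) matrix plus the vocabulary.
    X_tfidf, vocab = tfidf_features(docs)
    X_count, _ = count_features(docs)
    assert X_tfidf.shape == X_count.shape

    # The new keyword selects the featurizer before clustering runs.
    clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=2)
    print(clusters)  # an OrderedDict of clusters, per the docstring
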
diff --git a/test/test_cluster.py b/test/test_cluster.py
index 3e8b5bf..6c237b9 100644
--- a/test/test_cluster.py
+++ b/test/test_cluster.py
@@ -102,14 +102,14 @@ class TestCluster(unittest.TestCase):
         print("=" * 68, '\n')
         print("text_cluster_by_kmeans ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='k-means', k=3, max_iter=100)
+        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
         self.assertTrue(len(clusters) == 3)
 
     def test_c_text_cluster_by_dbscan(self):
         print("=" * 68, '\n')
         print("text_cluster_by_dbscan ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='dbscan', eps=5, min_pts=1)
+        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
         self.assertTrue(len(clusters) == 3)
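As a sanity check on the weighting added in jiagu/cluster/base.py, a sketch that recomputes one cell of the TF-IDF matrix by hand (also not part of the patch; the word and document choices are arbitrary, and it assumes the smoothed IDF stays exactly as written above):

    from collections import Counter

    import numpy as np

    import jiagu
    from jiagu.cluster import tfidf_features

    docs = [
        "判断unicode是否是汉字。",
        "全角符号转半角符号。",
        "一些基于自然语言处理的预处理过程也会在本文中出现。",
    ]
    X, vocab = tfidf_features(docs)

    # Recompute the entry for the first token of the first document.
    tokens = [jiagu.cut(d) for d in docs]
    word = tokens[0][0]
    tf = Counter(tokens[0])[word] / len(tokens[0])  # term frequency within the doc
    df = sum(1 for sent in tokens if word in sent)  # number of docs containing the token
    idf = 0.0 if df == len(docs) else np.log(len(docs) / (df + 1))
    assert np.isclose(X[0][vocab.index(word)], tf * idf)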