@@ -9,7 +9,6 @@
 * Description :
 """

 from jiagu import analyze
-from jiagu.cluster.text import text_cluster

 any = analyze.Analyze()
@@ -2,4 +2,6 @@
 from .kmeans import KMeans
 from .dbscan import DBSCAN
-from .base import count_features
+from .base import count_features, tfidf_features
+from .text import text_cluster
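
With this hunk, `tfidf_features` and `text_cluster` become importable straight from the `jiagu.cluster` package. A minimal import-level sanity check (a sketch, assuming `jiagu` is installed; the two-document corpus is made up):

    from jiagu.cluster import count_features, tfidf_features

    corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。"]
    X_count, vocab = count_features(corpus)   # raw term counts
    X_tfidf, _ = tfidf_features(corpus)       # tf-idf weights
    assert X_count.shape == X_tfidf.shape     # same (n_docs, n_terms) layout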
@@ -25,12 +25,51 @@ def count_features(corpus, tokenizer=jiagu.cut):
         >>> X, names = count_features(corpus)
     """
     tokens = [tokenizer(x) for x in corpus]
-    feature_names = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
     features = []
     for sent in tokens:
         counter = Counter(sent)
-        feature = [counter.get(x, 0) for x in feature_names]
+        feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)
-    return np.array(features), feature_names
+    return np.array(features), vocab
+
+
+def tfidf_features(corpus, tokenizer=jiagu.cut):
+    """TF-IDF features of a corpus.
+
+    :param corpus: list of str
+    :param tokenizer: tokenization function; defaults to `jiagu.cut`
+    :return:
+        features: np.array
+        names: list of str
+
+    example:
+        >>> import jiagu
+        >>> from jiagu.cluster.base import tfidf_features
+        >>> corpus = ["判断unicode是否是汉字。", "全角符号转半角符号。", "一些基于自然语言处理的预处理过程也会在本文中出现。"]
+        >>> X, names = tfidf_features(corpus, tokenizer=jiagu.cut)
+    """
+    tokens = [tokenizer(x) for x in corpus]
+    vocab = [x[0] for x in Counter([x for s in tokens for x in s]).most_common()]
+
+    idf_dict = dict()
+    total_doc = len(corpus)
+    for word in vocab:
+        # document frequency over token lists (not raw strings), so that
+        # substring hits cannot inflate the count
+        num = sum(1 for s in tokens if word in s)
+        if num == total_doc:
+            idf = np.log(total_doc / num)  # log(1) = 0 for ubiquitous terms
+        else:
+            idf = np.log(total_doc / (num + 1))  # +1 smoothing
+        idf_dict[word] = idf
+
+    features = []
+    for sent in tokens:
+        counter = Counter(sent)
+        feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
+        features.append(feature)
+
+    return np.array(features), vocab
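
To make the weighting concrete: tf is the within-document frequency `count / len(sent)`, and idf is `log(total_doc / num)` (i.e. 0) for terms found in every document, else the smoothed `log(total_doc / (num + 1))`. The following self-contained sketch reproduces one feature row without the `jiagu` dependency; the whitespace tokenizer and toy corpus are hypothetical stand-ins for `jiagu.cut` and real documents:

    import numpy as np
    from collections import Counter

    corpus = ["a b a", "b c", "c c d"]    # toy stand-in documents
    tokens = [s.split() for s in corpus]  # whitespace split stands in for jiagu.cut
    vocab = [w for w, _ in Counter(w for s in tokens for w in s).most_common()]

    N = len(corpus)
    df = {w: sum(1 for s in tokens if w in s) for w in vocab}
    idf = {w: np.log(N / d) if d == N else np.log(N / (d + 1)) for w, d in df.items()}

    # first row of the feature matrix, mirroring tfidf_features above
    counter = Counter(tokens[0])
    row = [counter.get(w, 0) / len(tokens[0]) * idf[w] for w in vocab]

One side effect of the `+ 1` smoothing: a term occurring in N - 1 of N documents also gets idf `log(N / N) = 0`, exactly like a term occurring in all of them.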
@@ -1,14 +1,16 @@
 # coding: utf-8
 from collections import OrderedDict
-from .base import count_features
+from .base import count_features, tfidf_features
 from .dbscan import DBSCAN
 from .kmeans import KMeans


-def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
+def text_cluster(docs, features_method='tfidf', method="k-means", k=None, max_iter=100, eps=None, min_pts=None):
     """Text clustering; K-Means and DBSCAN are currently supported.

+    :param features_method: str
+        feature extraction method; 'tfidf' and 'count' are supported
     :param docs: list of str
         list of input texts, e.g. ['k-means', 'dbscan']
     :param method: str
@@ -24,7 +26,12 @@ def text_cluster(docs, method="k-means", k=None, max_iter=100, eps=None, min_pts
     :return: OrderedDict
         the clustering result
     """
-    features, names = count_features(docs)
+    if features_method == 'tfidf':
+        features, names = tfidf_features(docs)
+    elif features_method == 'count':
+        features, names = count_features(docs)
+    else:
+        raise ValueError("features_method must be 'tfidf' or 'count'")

     # feature to doc
     f2d = {k: v.tolist() for k, v in zip(docs, features)}
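
A call site exercising the new parameter might look like this (a sketch; the documents are invented and cluster contents vary from run to run):

    from jiagu.cluster.text import text_cluster

    docs = ["K-Means 是一种聚类算法", "DBSCAN 基于密度聚类", "Jiagu 提供中文分词"]
    clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=2)
    # per the docstring, `clusters` is an OrderedDict holding the clustering result

Because `features_method` is inserted ahead of `method` in the signature, any caller that passed `method` positionally needs updating; the revised tests below pass both by keyword.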
@@ -102,14 +102,14 @@ class TestCluster(unittest.TestCase):
         print("=" * 68, '\n')
         print("text_cluster_by_kmeans ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='k-means', k=3, max_iter=100)
+        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
         self.assertTrue(len(clusters) == 3)

     def test_c_text_cluster_by_dbscan(self):
         print("=" * 68, '\n')
         print("text_cluster_by_dbscan ... ")
         docs = load_docs()
-        clusters = text_cluster(docs, method='dbscan', eps=5, min_pts=1)
+        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
         self.assertTrue(len(clusters) == 3)
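
Both tests exercise the new keyword, one per feature extractor; assuming the test module sits on a discoverable path, they should run under the standard runner, e.g. `python -m unittest discover -v`.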