@@ -1,12 +1,18 @@
 # -*-coding:utf-8-*-
 from collections import Counter
-import numpy as np
+import math


 def elu_distance(a, b):
-    """Compute and return the Euclidean distance between two points."""
-    dist = np.sqrt(np.sum(np.square(np.array(a) - np.array(b))))
-    return dist
+    """Compute and return the Euclidean distance between two points.
+
+    :param a: list of float
+    :param b: list of float
+    :return: float
+    """
+    x = sum([pow((a_ - b_), 2) for a_, b_ in zip(a, b)])
+    return math.sqrt(x)


 def count_features(corpus, tokenizer=list):
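A quick way to sanity-check the pure-Python replacement against known values (the sample points and the `math.dist` comparison are mine, not part of the patch):

```python
import math

def elu_distance(a, b):
    """Euclidean distance between two equal-length vectors (as in the patch)."""
    x = sum([pow((a_ - b_), 2) for a_, b_ in zip(a, b)])
    return math.sqrt(x)

# 3-4-5 triangle: the distance from (0, 0) to (3, 4) is exactly 5.
assert elu_distance([0, 0], [3, 4]) == 5.0
# Agrees with the stdlib equivalent (math.dist, Python 3.8+).
assert math.isclose(elu_distance([0.36, 0.37], [0.483, 0.312]),
                    math.dist([0.36, 0.37], [0.483, 0.312]))
```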
@@ -15,7 +21,7 @@ def count_features(corpus, tokenizer=list):
     :param corpus: list of str
     :param tokenizer: tokenization function, default is `list`
     :return:
-        features: np.array
+        features: list of list of float
         names: list of str

     example:
@@ -32,7 +38,7 @@ def count_features(corpus, tokenizer=list):
         feature = [counter.get(x, 0) for x in vocab]
         features.append(feature)

-    return np.array(features), vocab
+    return features, vocab


 def tfidf_features(corpus, tokenizer=list):
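Since the return type changes from `np.array` to plain lists, downstream callers no longer get `.tolist()` or vectorized ops for free. A minimal re-implementation sketch of the shape `count_features` now hands back (the sample corpus and the sorted vocab order are mine; the real vocabulary order may differ):

```python
from collections import Counter

def count_features(corpus, tokenizer=list):
    """Bag-of-token counts; returns (list of list of float, vocab)."""
    vocab = sorted(set(t for doc in corpus for t in tokenizer(doc)))
    features = []
    for doc in corpus:
        counter = Counter(tokenizer(doc))
        features.append([counter.get(x, 0) for x in vocab])
    return features, vocab

features, names = count_features(["abab", "abc"])
print(names)     # ['a', 'b', 'c']
print(features)  # [[2, 2, 0], [1, 1, 1]] -- plain nested lists, no np.array
```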
@@ -41,7 +47,7 @@ def tfidf_features(corpus, tokenizer=list):
     :param corpus: list of str
     :param tokenizer: tokenization function, default is `list`
     :return:
-        features: np.array
+        features: list of list of float
         names: list of str

     example:
@@ -58,9 +64,9 @@ def tfidf_features(corpus, tokenizer=list):
     for word in vocab:
         num = sum([1 if (word in s) else 0 for s in corpus])
         if num == total_doc:
-            idf = np.log(total_doc / num)
+            idf = math.log(total_doc / num)
         else:
-            idf = np.log(total_doc / (num + 1))
+            idf = math.log(total_doc / (num + 1))
         idf_dict[word] = idf

     features = []
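The branch exists so a word appearing in every document gets idf = log(1) = 0 rather than the negative log(N/(N+1)) the smoothed formula would produce; swapping `np.log` for `math.log` changes nothing numerically. A worked check (the counts are mine):

```python
import math

total_doc = 4
for num in (1, 2, 4):  # document frequency of a word
    if num == total_doc:
        idf = math.log(total_doc / num)        # in every doc -> idf = 0.0
    else:
        idf = math.log(total_doc / (num + 1))  # smoothed document frequency
    print(num, round(idf, 4))
# 1 0.6931   (log 2)
# 2 0.2877   (log 4/3)
# 4 0.0      (log 1, rather than the negative log 4/5)
```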
@@ -69,6 +75,6 @@ def tfidf_features(corpus, tokenizer=list):
         feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
         features.append(feature)

-    return np.array(features), vocab
+    return features, vocab
@@ -9,7 +9,6 @@
 """
 import random
-import numpy as np
 from collections import OrderedDict

 from .base import elu_distance
@@ -31,14 +30,13 @@ class DBSCAN(object):
     def train(self, X):
         """Run DBSCAN clustering on the input data.

-        :param X: list of tuple / np.array
+        :param X: list of tuple
             Input feature matrix, [n_samples, n_features], e.g. [[0.36, 0.37], [0.483, 0.312]]
         :return: OrderedDict
         """
-        if isinstance(X, np.ndarray):
-            X = [tuple(x) for x in X.tolist()]
         # Find the set of all core objects in the dataset
+        X = [tuple(x) for x in X]
         cores = self._find_cores(X)
         not_visit = set(X)
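The unconditional `tuple` conversion is what makes the points hashable for the `set` bookkeeping below it; a small illustration (data is mine):

```python
X = [[0.36, 0.37], [0.483, 0.312], [0.36, 0.37]]
X = [tuple(x) for x in X]  # rows become hashable tuples
not_visit = set(X)         # duplicate points collapse to one entry
print(sorted(not_visit))   # [(0.36, 0.37), (0.483, 0.312)]
```

Note that an `np.ndarray` passed here would still iterate row-wise through the same comprehension, so dropping the `isinstance` special case does not break array inputs.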
@@ -8,7 +8,6 @@
 * Description : KMeans algorithm implementation
 """
-import numpy as np
 import random
 from collections import OrderedDict
@@ -57,11 +56,18 @@ class KMeans(object):
         self.clusters = clusters

+    def _mean(self, features):
+        res = []
+        for i in range(len(features[0])):
+            col = [x[i] for x in features]
+            res.append(sum(col) / len(col))
+        return res
+
     def _update_centroids(self):
         """Recompute each cluster's center from the current assignments and update centroids."""
         centroids = []
         for key in self.clusters.keys():
-            centroid = np.mean(self.clusters[key], axis=0)
+            centroid = self._mean(self.clusters[key])
             centroids.append(centroid)
         self.centroids = centroids
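A quick check that the new pure-Python helper agrees with the `np.mean(..., axis=0)` call it replaces (standalone copy of `_mean` for the check; the sample cluster is mine):

```python
def _mean(features):
    """Column-wise mean of a list of equal-length feature vectors."""
    res = []
    for i in range(len(features[0])):
        col = [x[i] for x in features]   # gather column i across all rows
        res.append(sum(col) / len(col))  # average it
    return res

cluster = [[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]
print(_mean(cluster))  # [3.0, 4.0], same as np.mean(cluster, axis=0).tolist()
```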
@@ -88,13 +94,10 @@ class KMeans(object):
     def train(self, X):
         """Run KMeans clustering on the input data.

-        :param X: list of list / np.array
+        :param X: list of list
             Input feature matrix, [n_samples, n_features], e.g. [[0.36, 0.37], [0.483, 0.312]]
         :return: OrderedDict
         """
-        if isinstance(X, np.ndarray):
-            X = X.tolist()
-
         # Randomly pick k examples as the initial cluster mean vectors
         self.centroids = random.sample(X, self.k)
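Initialization stays a plain `random.sample` over the rows; with list input, each initial centroid is itself a list, which is exactly what `_mean` above produces on the update step. A minimal sketch (data and seed are mine):

```python
import random

X = [[0.36, 0.37], [0.483, 0.312], [0.1, 0.9], [0.8, 0.2]]
random.seed(0)                   # seeded only to make the sketch reproducible
centroids = random.sample(X, 2)  # 2 distinct rows as initial cluster centers
print(centroids)
```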
@@ -5,7 +5,7 @@ from .kmeans import KMeans

 def text_cluster(docs, features_method='tfidf', method="dbscan",
-                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
+                 k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
     """Text clustering; currently supports two methods, K-Means and DBSCAN.

     :param features_method: str
@@ -33,7 +33,7 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
         raise ValueError('features_method error')

     # feature to doc
-    f2d = {k: v.tolist() for k, v in zip(docs, features)}
+    f2d = {k: v for k, v in zip(docs, features)}

     if method == 'k-means':
         km = KMeans(k=k, max_iter=max_iter)
@@ -57,7 +57,3 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
         clusters_out[label] = list(set(c_docs))

     return clusters_out
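For reviewers, a hypothetical end-to-end call of the patched function; the sample docs are mine, and `eps`/`min_pts` would likely need tuning per corpus, so treat this as a shape sketch rather than a recommended configuration:

```python
# Assumes text_cluster is importable from the patched module.
docs = [
    "百度深度学习中文情感分析工具",
    "百度深度学习中文分词工具",
    "英超足球比赛直播",
    "欧冠足球比赛集锦",
]
clusters = text_cluster(docs, features_method='tfidf', method='dbscan',
                        eps=0.5, min_pts=1, tokenizer=list)
for label, members in clusters.items():  # OrderedDict: cluster label -> docs
    print(label, members)
```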