diff --git a/jiagu/cluster/base.py b/jiagu/cluster/base.py index 9ffb4ed..a6e09db 100644 --- a/jiagu/cluster/base.py +++ b/jiagu/cluster/base.py @@ -1,12 +1,18 @@ # -*-coding:utf-8-*- from collections import Counter -import numpy as np +import math def elu_distance(a, b): - """计算两点之间的欧氏距离并返回""" - dist = np.sqrt(np.sum(np.square(np.array(a) - np.array(b)))) - return dist + """计算两点之间的欧氏距离并返回 + + :param a: list of float + :param b: list of float + :return: float + """ + + x = sum([pow((a_-b_), 2) for a_, b_ in zip(a, b)]) + return math.sqrt(x) def count_features(corpus, tokenizer=list): @@ -15,7 +21,7 @@ def count_features(corpus, tokenizer=list): :param corpus: list of str :param tokenizer: function for tokenize, default is `jiagu.cut` :return: - features: np.array + features: list of list of float names: list of str example: @@ -32,7 +38,7 @@ def count_features(corpus, tokenizer=list): feature = [counter.get(x, 0) for x in vocab] features.append(feature) - return np.array(features), vocab + return features, vocab def tfidf_features(corpus, tokenizer=list): @@ -41,7 +47,7 @@ def tfidf_features(corpus, tokenizer=list): :param corpus: list of str :param tokenizer: function for tokenize, default is `jiagu.cut` :return: - features: np.array + features: list of list of float names: list of str example: @@ -58,9 +64,9 @@ def tfidf_features(corpus, tokenizer=list): for word in vocab: num = sum([1 if (word in s) else 0 for s in corpus]) if num == total_doc: - idf = np.log(total_doc / num) + idf = math.log(total_doc / num) else: - idf = np.log(total_doc / (num + 1)) + idf = math.log(total_doc / (num + 1)) idf_dict[word] = idf features = [] @@ -69,6 +75,6 @@ def tfidf_features(corpus, tokenizer=list): feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab] features.append(feature) - return np.array(features), vocab + return features, vocab diff --git a/jiagu/cluster/dbscan.py b/jiagu/cluster/dbscan.py index 2f1bc96..f0df485 100644 --- a/jiagu/cluster/dbscan.py +++ b/jiagu/cluster/dbscan.py @@ -9,7 +9,6 @@ """ import random -import numpy as np from collections import OrderedDict from .base import elu_distance @@ -31,14 +30,13 @@ class DBSCAN(object): def train(self, X): """输入数据,完成 KMeans 聚类 - :param X: list of tuple / np.array + :param X: list of tuple 输入数据特征,[n_samples, n_features],如:[[0.36, 0.37], [0.483, 0.312]] :return: OrderedDict """ - if isinstance(X, np.ndarray): - X = [tuple(x) for x in X.tolist()] # 确定数据集中的全部核心对象集合 + X = [tuple(x) for x in X] cores = self._find_cores(X) not_visit = set(X) diff --git a/jiagu/cluster/kmeans.py b/jiagu/cluster/kmeans.py index 258c774..fcb0590 100644 --- a/jiagu/cluster/kmeans.py +++ b/jiagu/cluster/kmeans.py @@ -8,7 +8,6 @@ * Description : KMeans 算法实现 """ -import numpy as np import random from collections import OrderedDict @@ -57,11 +56,18 @@ class KMeans(object): self.clusters = clusters + def _mean(self, features): + res = [] + for i in range(len(features[0])): + col = [x[i] for x in features] + res.append(sum(col) / len(col)) + return res + def _update_centroids(self): """根据簇类结果重新计算每个簇的中心,更新 centroids""" centroids = [] for key in self.clusters.keys(): - centroid = np.mean(self.clusters[key], axis=0) + centroid = self._mean(self.clusters[key]) centroids.append(centroid) self.centroids = centroids @@ -88,13 +94,10 @@ class KMeans(object): def train(self, X): """输入数据,完成 KMeans 聚类 - :param X: list of list / np.array + :param X: list of list 输入数据特征,[n_samples, n_features],如:[[0.36, 0.37], [0.483, 0.312]] :return: OrderedDict """ - if isinstance(X, np.ndarray): - X = X.tolist() - # 随机选择 k 个 example 作为初始类簇均值向量 self.centroids = random.sample(X, self.k) diff --git a/jiagu/cluster/text.py b/jiagu/cluster/text.py index a7f5b76..3245ec5 100644 --- a/jiagu/cluster/text.py +++ b/jiagu/cluster/text.py @@ -5,7 +5,7 @@ from .kmeans import KMeans def text_cluster(docs, features_method='tfidf', method="dbscan", - k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list): + k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list): """文本聚类,目前支持 K-Means 和 DBSCAN 两种方法 :param features_method: str @@ -33,7 +33,7 @@ def text_cluster(docs, features_method='tfidf', method="dbscan", raise ValueError('features_method error') # feature to doc - f2d = {k: v.tolist() for k, v in zip(docs, features)} + f2d = {k: v for k, v in zip(docs, features)} if method == 'k-means': km = KMeans(k=k, max_iter=max_iter) @@ -57,7 +57,3 @@ def text_cluster(docs, features_method='tfidf', method="dbscan", clusters_out[label] = list(set(c_docs)) return clusters_out - - - -