Browse Source

remove numpy

master
zengbin93 5 years ago
parent
commit
16dc071ce5
4 changed files with 29 additions and 26 deletions
  1. +16
    -10
      jiagu/cluster/base.py
  2. +2
    -4
      jiagu/cluster/dbscan.py
  3. +9
    -6
      jiagu/cluster/kmeans.py
  4. +2
    -6
      jiagu/cluster/text.py

+ 16
- 10
jiagu/cluster/base.py View File

@@ -1,12 +1,18 @@
# -*-coding:utf-8-*-
from collections import Counter
import numpy as np
import math


def elu_distance(a, b):
    """Compute and return the Euclidean distance between two points.

    Coordinates are paired with ``zip``, so when the two inputs differ in
    length the surplus coordinates of the longer one are ignored.

    :param a: list of float
    :param b: list of float
    :return: float
    """
    # Pure standard-library computation: sum of squared differences, then root.
    squared_sum = sum((a_ - b_) ** 2 for a_, b_ in zip(a, b))
    return math.sqrt(squared_sum)


def count_features(corpus, tokenizer=list):
@@ -15,7 +21,7 @@ def count_features(corpus, tokenizer=list):
:param corpus: list of str
:param tokenizer: function for tokenize, default is `jiagu.cut`
:return:
features: np.array
features: list of list of float
names: list of str

example:
@@ -32,7 +38,7 @@ def count_features(corpus, tokenizer=list):
feature = [counter.get(x, 0) for x in vocab]
features.append(feature)

return np.array(features), vocab
return features, vocab


def tfidf_features(corpus, tokenizer=list):
@@ -41,7 +47,7 @@ def tfidf_features(corpus, tokenizer=list):
:param corpus: list of str
:param tokenizer: function for tokenize, default is `jiagu.cut`
:return:
features: np.array
features: list of list of float
names: list of str

example:
@@ -58,9 +64,9 @@ def tfidf_features(corpus, tokenizer=list):
for word in vocab:
num = sum([1 if (word in s) else 0 for s in corpus])
if num == total_doc:
idf = np.log(total_doc / num)
idf = math.log(total_doc / num)
else:
idf = np.log(total_doc / (num + 1))
idf = math.log(total_doc / (num + 1))
idf_dict[word] = idf

features = []
@@ -69,6 +75,6 @@ def tfidf_features(corpus, tokenizer=list):
feature = [counter.get(x, 0) / len(sent) * idf_dict.get(x, 0) for x in vocab]
features.append(feature)

return np.array(features), vocab
return features, vocab



+ 2
- 4
jiagu/cluster/dbscan.py View File

@@ -9,7 +9,6 @@
"""

import random
import numpy as np
from collections import OrderedDict

from .base import elu_distance
@@ -31,14 +30,13 @@ class DBSCAN(object):
def train(self, X):
"""输入数据,完成 KMeans 聚类

:param X: list of tuple / np.array
:param X: list of tuple
输入数据特征,[n_samples, n_features],如:[[0.36, 0.37], [0.483, 0.312]]
:return: OrderedDict
"""
if isinstance(X, np.ndarray):
X = [tuple(x) for x in X.tolist()]

# 确定数据集中的全部核心对象集合
X = [tuple(x) for x in X]
cores = self._find_cores(X)
not_visit = set(X)



+ 9
- 6
jiagu/cluster/kmeans.py View File

@@ -8,7 +8,6 @@
* Description : KMeans 算法实现
"""

import numpy as np
import random
from collections import OrderedDict

@@ -57,11 +56,18 @@ class KMeans(object):

self.clusters = clusters

def _mean(self, features):
res = []
for i in range(len(features[0])):
col = [x[i] for x in features]
res.append(sum(col) / len(col))
return res

def _update_centroids(self):
"""根据簇类结果重新计算每个簇的中心,更新 centroids"""
centroids = []
for key in self.clusters.keys():
centroid = np.mean(self.clusters[key], axis=0)
centroid = self._mean(self.clusters[key])
centroids.append(centroid)
self.centroids = centroids

@@ -88,13 +94,10 @@ class KMeans(object):
def train(self, X):
"""输入数据,完成 KMeans 聚类

:param X: list of list / np.array
:param X: list of list
输入数据特征,[n_samples, n_features],如:[[0.36, 0.37], [0.483, 0.312]]
:return: OrderedDict
"""
if isinstance(X, np.ndarray):
X = X.tolist()

# 随机选择 k 个 example 作为初始类簇均值向量
self.centroids = random.sample(X, self.k)



+ 2
- 6
jiagu/cluster/text.py View File

@@ -5,7 +5,7 @@ from .kmeans import KMeans


def text_cluster(docs, features_method='tfidf', method="dbscan",
k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
k=3, max_iter=100, eps=0.5, min_pts=2, tokenizer=list):
"""文本聚类,目前支持 K-Means 和 DBSCAN 两种方法

:param features_method: str
@@ -33,7 +33,7 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
raise ValueError('features_method error')

# feature to doc
f2d = {k: v.tolist() for k, v in zip(docs, features)}
f2d = {k: v for k, v in zip(docs, features)}

if method == 'k-means':
km = KMeans(k=k, max_iter=max_iter)
@@ -57,7 +57,3 @@ def text_cluster(docs, features_method='tfidf', method="dbscan",
clusters_out[label] = list(set(c_docs))

return clusters_out





Loading…
Cancel
Save