# -*- coding: utf-8 -*-
import unittest

import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint

from jiagu.cluster.kmeans import KMeans
from jiagu.cluster.dbscan import DBSCAN
from jiagu.cluster.text import text_cluster


def load_dataset():
    # Watermelon dataset 4.0: id, density, sugar content.
    # Source: "Machine Learning", Chapter 9, Prof. Zhou Zhihua.
    data = '''
        1,0.697,0.460,
        2,0.774,0.376,
        3,0.634,0.264,
        4,0.608,0.318,
        5,0.556,0.215,
        6,0.403,0.237,
        7,0.481,0.149,
        8,0.437,0.211,
        9,0.666,0.091,
        10,0.243,0.267,
        11,0.245,0.057,
        12,0.343,0.099,
        13,0.639,0.161,
        14,0.657,0.198,
        15,0.360,0.370,
        16,0.593,0.042,
        17,0.719,0.103,
        18,0.359,0.188,
        19,0.339,0.241,
        20,0.282,0.257,
        21,0.748,0.232,
        22,0.714,0.346,
        23,0.483,0.312,
        24,0.478,0.437,
        25,0.525,0.369,
        26,0.751,0.489,
        27,0.532,0.472,
        28,0.473,0.376,
        29,0.725,0.445,
        30,0.446,0459'''

    data_ = data.strip().split(',')
    # Each record is "id,density,sugar"; keep the two float features of every row.
    dataset = [(float(data_[i]), float(data_[i + 1])) for i in range(1, len(data_) - 1, 3)]
    return np.array(dataset)


def show_dataset():
    # Scatter plot of the watermelon dataset; not exercised by the tests below.
    dataset = load_dataset()

    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataset[:, 0], dataset[:, 1])
    plt.title("Dataset")
    plt.show()


def load_docs():
    # Short Chinese NLP-related titles used as documents for text clustering.
    docs = [
        "百度深度学习中文情感分析工具Senta试用及在线测试",
        "情感分析是自然语言处理里面一个热门话题",
        "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
        "深度学习实践:从零开始做电影评论文本情感分析",
        "BERT相关论文、文章和代码资源汇总",
        "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
        "自然语言处理工具包spaCy介绍",
        "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
    ]
    return docs


class TestCluster(unittest.TestCase):
    def test_a_kmeans(self):
        print("=" * 68, '\n')
        print("test k-means ... ")
        X = load_dataset()
        print("shape of X: ", X.shape)

        k = 4
        km = KMeans(k=k, max_iter=100)
        clusters = km.train(X)
        pprint(clusters)

        # All points must be assigned to exactly k clusters.
        self.assertEqual(len(clusters), k)
        pprint({k: len(v) for k, v in clusters.items()})
        print("\n\n")

    def test_b_dbscan(self):
        print("=" * 68, '\n')
        print("test dbscan ... ")
        X = load_dataset()

        ds = DBSCAN(eps=0.11, min_pts=5)
        clusters = ds.train(X)
        pprint(clusters)

        # DBSCAN should merge the samples into fewer clusters than there are points.
        self.assertTrue(len(clusters) < len(X))
        # self.assertEqual(len(clusters), 6)
        pprint({k: len(v) for k, v in clusters.items()})

    def test_c_text_cluster_by_kmeans(self):
        print("=" * 68, '\n')
        print("text_cluster_by_kmeans ... ")
        docs = load_docs()
        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
        self.assertEqual(len(clusters), 3)

    def test_c_text_cluster_by_dbscan(self):
        print("=" * 68, '\n')
        print("text_cluster_by_dbscan ... ")
        docs = load_docs()
        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
        self.assertEqual(len(clusters), 3)


if __name__ == '__main__':
    unittest.main()
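

# ---------------------------------------------------------------------------
# Optional standalone demo (a minimal sketch, not part of the test suite).
# It only reuses the jiagu calls already exercised by the tests above; the
# helper name `_demo` is illustrative and not part of the jiagu API. Call it
# manually from a Python shell after importing this module.
# ---------------------------------------------------------------------------
def _demo():
    # Cluster the watermelon feature matrix with k-means.
    X = load_dataset()
    km = KMeans(k=4, max_iter=100)
    pprint(km.train(X))

    # Cluster the Chinese document titles with tf-idf features + k-means.
    docs = load_docs()
    pprint(text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100))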