
test_cluster.py

# -*- coding: utf-8 -*-
import unittest
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from jiagu.cluster.kmeans import KMeans
from jiagu.cluster.dbscan import DBSCAN
from jiagu.cluster.text import text_cluster


def load_dataset():
    # Watermelon dataset 4.0: id, density, sugar content.
    # Source: "Machine Learning", Chapter 9, by Prof. Zhou Zhihua.
    data = '''
    1,0.697,0.460,
    2,0.774,0.376,
    3,0.634,0.264,
    4,0.608,0.318,
    5,0.556,0.215,
    6,0.403,0.237,
    7,0.481,0.149,
    8,0.437,0.211,
    9,0.666,0.091,
    10,0.243,0.267,
    11,0.245,0.057,
    12,0.343,0.099,
    13,0.639,0.161,
    14,0.657,0.198,
    15,0.360,0.370,
    16,0.593,0.042,
    17,0.719,0.103,
    18,0.359,0.188,
    19,0.339,0.241,
    20,0.282,0.257,
    21,0.748,0.232,
    22,0.714,0.346,
    23,0.483,0.312,
    24,0.478,0.437,
    25,0.525,0.369,
    26,0.751,0.489,
    27,0.532,0.472,
    28,0.473,0.376,
    29,0.725,0.445,
    30,0.446,0.459'''
    # Split the CSV-like block on commas and keep (density, sugar) pairs,
    # skipping the id column (every third field).
    data_ = data.strip().split(',')
    dataset = [(float(data_[i]), float(data_[i + 1])) for i in range(1, len(data_) - 1, 3)]
    return np.array(dataset)


def show_dataset():
    # Scatter plot of the raw dataset (helper only, not used by the tests).
    dataset = load_dataset()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(dataset[:, 0], dataset[:, 1])
    plt.title("Dataset")
    plt.show()


def load_docs():
    # Short Chinese documents used as input for the text-clustering tests.
    docs = [
        "百度深度学习中文情感分析工具Senta试用及在线测试",
        "情感分析是自然语言处理里面一个热门话题",
        "AI Challenger 2018 文本挖掘类竞赛相关解决方案及代码汇总",
        "深度学习实践:从零开始做电影评论文本情感分析",
        "BERT相关论文、文章和代码资源汇总",
        "将不同长度的句子用BERT预训练模型编码,映射到一个固定长度的向量上",
        "自然语言处理工具包spaCy介绍",
        "现在可以快速测试一下spaCy的相关功能,我们以英文数据为例,spaCy目前主要支持英文和德文"
    ]
    return docs


class TestCluster(unittest.TestCase):
    def test_a_kmeans(self):
        print("=" * 68, '\n')
        print("test k-means ... ")
        X = load_dataset()
        print("shape of X: ", X.shape)
        k = 4
        km = KMeans(k=k, max_iter=100)
        clusters = km.train(X)
        pprint(clusters)
        self.assertEqual(len(clusters), k)
        pprint({k: len(v) for k, v in clusters.items()})
        print("\n\n")

    def test_b_dbscan(self):
        print("=" * 68, '\n')
        print("test dbscan ... ")
        X = load_dataset()
        ds = DBSCAN(eps=0.11, min_pts=5)
        clusters = ds.train(X)
        pprint(clusters)
        self.assertTrue(len(clusters) < len(X))
        # self.assertEqual(len(clusters), 6)
        pprint({k: len(v) for k, v in clusters.items()})

    def test_c_text_cluster_by_kmeans(self):
        print("=" * 68, '\n')
        print("text_cluster_by_kmeans ... ")
        docs = load_docs()
        clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=3, max_iter=100)
        self.assertEqual(len(clusters), 3)

    def test_c_text_cluster_by_dbscan(self):
        print("=" * 68, '\n')
        print("text_cluster_by_dbscan ... ")
        docs = load_docs()
        clusters = text_cluster(docs, features_method='count', method='dbscan', eps=5, min_pts=1)
        self.assertEqual(len(clusters), 3)


if __name__ == '__main__':
    unittest.main()
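The tests can be run with the standard library test runner, for example `python test_cluster.py` (via the `unittest.main()` guard above) or `python -m unittest test_cluster`, assuming numpy, matplotlib, and the jiagu package are installed.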

Jiagu is trained on large-scale corpora. It provides common Chinese NLP features such as word segmentation, part-of-speech tagging, named entity recognition, sentiment analysis, knowledge-graph relation extraction, keyword extraction, text summarization, new word discovery, and text clustering. It was built with the strengths and weaknesses of the major existing toolkits in mind, and Jiagu is offered back to the community.
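As a minimal usage sketch of the text-clustering piece exercised by the tests above (assuming the jiagu package from this repository is importable; the text_cluster call mirrors test_c_text_cluster_by_kmeans):

    # Minimal sketch: cluster a few short Chinese documents with TF-IDF features + k-means.
    # Assumes the jiagu package from this repository is installed; the signature
    # follows the calls in test_cluster.py.
    from jiagu.cluster.text import text_cluster

    docs = [
        "情感分析是自然语言处理里面一个热门话题",
        "深度学习实践:从零开始做电影评论文本情感分析",
        "BERT相关论文、文章和代码资源汇总",
        "自然语言处理工具包spaCy介绍",
    ]

    clusters = text_cluster(docs, features_method='tfidf', method='k-means', k=2, max_iter=100)
    print(len(clusters))  # the tests above only assert on the number of clusters returned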