You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Mon May 11 11:03:01 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. from sklearn.model_selection import ShuffleSplit
  9. from sklearn.neighbors import KNeighborsClassifier
  10. from sklearn.metrics import accuracy_score
  11. from gklearn.utils.utils import get_graph_kernel_by_name
  12. # from gklearn.preimage.utils import get_same_item_indices
  13. def sum_squares(a, b):
  14. """
  15. Return the sum of squares of the difference between a and b, aka MSE
  16. """
  17. return np.sum([(a[i] - b[i])**2 for i in range(len(a))])
  18. def euclid_d(x, y):
  19. """
  20. 1D euclidean distance
  21. """
  22. return np.sqrt((x-y)**2)
  23. def man_d(x, y):
  24. """
  25. 1D manhattan distance
  26. """
  27. return np.abs((x-y))
  28. def knn_regression(D_app, D_test, y_app, y_test, n_neighbors, verbose=True, text=None):
  29. from sklearn.neighbors import KNeighborsRegressor
  30. knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric='precomputed')
  31. knn.fit(D_app, y_app)
  32. y_pred = knn.predict(D_app)
  33. y_pred_test = knn.predict(D_test.T)
  34. perf_app = np.sqrt(sum_squares(y_pred, y_app)/len(y_app))
  35. perf_test = np.sqrt(sum_squares(y_pred_test, y_test)/len(y_test))
  36. if (verbose):
  37. print("Learning error with {} train examples : {}".format(text, perf_app))
  38. print("Test error with {} train examples : {}".format(text, perf_test))
  39. return perf_app, perf_test
  40. def knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=True, text=None):
  41. knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='precomputed')
  42. knn.fit(d_app, y_app)
  43. y_pred = knn.predict(d_app)
  44. y_pred_test = knn.predict(d_test.T)
  45. perf_app = accuracy_score(y_app, y_pred)
  46. perf_test = accuracy_score(y_test, y_pred_test)
  47. if (verbose):
  48. print("Learning accuracy with {} costs : {}".format(text, perf_app))
  49. print("Test accuracy with {} costs : {}".format(text, perf_test))
  50. return perf_app, perf_test
  51. def knn_cv(dataset, kernel_options, trainset=None, n_neighbors=1, n_splits=50, test_size=0.9, verbose=True):
  52. '''
  53. Perform a knn classification cross-validation on given dataset.
  54. '''
  55. # Gn = dataset.graphs
  56. y_all = dataset.targets
  57. # compute kernel distances.
  58. dis_mat = _compute_kernel_distances(dataset, kernel_options, trainset=trainset)
  59. rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
  60. # train_indices = [[] for _ in range(n_splits)]
  61. # test_indices = [[] for _ in range(n_splits)]
  62. # idx_targets = get_same_item_indices(y_all)
  63. # for key, item in idx_targets.items():
  64. # i = 0
  65. # for train_i, test_i in rs.split(item): # @todo: careful when parallel.
  66. # train_indices[i] += [item[idx] for idx in train_i]
  67. # test_indices[i] += [item[idx] for idx in test_i]
  68. # i += 1
  69. accuracies = []
  70. # for trial in range(len(train_indices)):
  71. # train_index = train_indices[trial]
  72. # test_index = test_indices[trial]
  73. for train_index, test_index in rs.split(y_all):
  74. # print(train_index, test_index)
  75. # G_app = [Gn[i] for i in train_index]
  76. # G_test = [Gn[i] for i in test_index]
  77. y_app = [y_all[i] for i in train_index]
  78. y_test = [y_all[i] for i in test_index]
  79. N = len(train_index)
  80. d_app = dis_mat.copy()
  81. d_app = d_app[train_index,:]
  82. d_app = d_app[:,train_index]
  83. d_test = np.zeros((N, len(test_index)))
  84. for i in range(N):
  85. for j in range(len(test_index)):
  86. d_test[i, j] = dis_mat[train_index[i], test_index[j]]
  87. accuracies.append(knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=verbose, text=''))
  88. results = {}
  89. results['ave_perf_train'] = np.mean([i[0] for i in accuracies], axis=0)
  90. results['std_perf_train'] = np.std([i[0] for i in accuracies], axis=0, ddof=1)
  91. results['ave_perf_test'] = np.mean([i[1] for i in accuracies], axis=0)
  92. results['std_perf_test'] = np.std([i[1] for i in accuracies], axis=0, ddof=1)
  93. return results
  94. def _compute_kernel_distances(dataset, kernel_options, trainset=None):
  95. graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
  96. node_labels=dataset.node_labels,
  97. edge_labels=dataset.edge_labels,
  98. node_attrs=dataset.node_attrs,
  99. edge_attrs=dataset.edge_attrs,
  100. ds_infos=dataset.get_dataset_infos(keys=['directed']),
  101. kernel_options=kernel_options)
  102. gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
  103. dis_mat, _, _, _ = graph_kernel.compute_distance_matrix()
  104. if trainset is not None:
  105. gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
  106. return dis_mat

A Python package for graph kernels, graph edit distances and the graph pre-image problem.