You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

random_preimage_generator.py 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Fri May 29 14:29:52 2020
  5. @author: ljia
  6. """
  7. import numpy as np
  8. import time
  9. import random
  10. import sys
  11. from tqdm import tqdm
  12. import multiprocessing
  13. import networkx as nx
  14. from multiprocessing import Pool
  15. from functools import partial
  16. from gklearn.preimage import PreimageGenerator
  17. from gklearn.preimage.utils import compute_k_dis
  18. from gklearn.utils import Timer
  19. from gklearn.utils.utils import get_graph_kernel_by_name
  20. # from gklearn.utils.dataset import Dataset
  21. class RandomPreimageGenerator(PreimageGenerator):
  22. def __init__(self, dataset=None):
  23. PreimageGenerator.__init__(self, dataset=dataset)
  24. # arguments to set.
  25. self.__k = 5 # number of nearest neighbors of phi in D_N.
  26. self.__r_max = 10 # maximum number of iterations.
  27. self.__l = 500 # numbers of graphs generated for each graph in D_k U {g_i_hat}.
  28. self.__alphas = None # weights of linear combinations of points in kernel space.
  29. self.__parallel = True
  30. self.__n_jobs = multiprocessing.cpu_count()
  31. self.__time_limit_in_sec = 0 # @todo
  32. self.__max_itrs = 100 # @todo
  33. # values to compute.
  34. self.__runtime_generate_preimage = None
  35. self.__runtime_total = None
  36. self.__preimage = None
  37. self.__best_from_dataset = None
  38. self.__k_dis_preimage = None
  39. self.__k_dis_dataset = None
  40. self.__itrs = 0
  41. self.__converged = False # @todo
  42. self.__num_updates = 0
  43. # values that can be set or to be computed.
  44. self.__gram_matrix_unnorm = None
  45. self.__runtime_precompute_gm = None
  46. def set_options(self, **kwargs):
  47. self._kernel_options = kwargs.get('kernel_options', {})
  48. self._graph_kernel = kwargs.get('graph_kernel', None)
  49. self._verbose = kwargs.get('verbose', 2)
  50. self.__k = kwargs.get('k', 5)
  51. self.__r_max = kwargs.get('r_max', 10)
  52. self.__l = kwargs.get('l', 500)
  53. self.__alphas = kwargs.get('alphas', None)
  54. self.__parallel = kwargs.get('parallel', True)
  55. self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count())
  56. self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0)
  57. self.__max_itrs = kwargs.get('max_itrs', 100)
  58. self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
  59. self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
  60. def run(self):
  61. self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'],
  62. node_labels=self._dataset.node_labels,
  63. edge_labels=self._dataset.edge_labels,
  64. node_attrs=self._dataset.node_attrs,
  65. edge_attrs=self._dataset.edge_attrs,
  66. ds_infos=self._dataset.get_dataset_infos(keys=['directed']),
  67. kernel_options=self._kernel_options)
  68. # record start time.
  69. start = time.time()
  70. # 1. precompute gram matrix.
  71. if self.__gram_matrix_unnorm is None:
  72. gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options)
  73. self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm
  74. end_precompute_gm = time.time()
  75. self.__runtime_precompute_gm = end_precompute_gm - start
  76. else:
  77. if self.__runtime_precompute_gm is None:
  78. raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
  79. self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
  80. if self._kernel_options['normalize']:
  81. self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
  82. else:
  83. self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
  84. end_precompute_gm = time.time()
  85. start -= self.__runtime_precompute_gm
  86. # 2. compute k nearest neighbors of phi in D_N.
  87. if self._verbose >= 2:
  88. print('\nstart computing k nearest neighbors of phi in D_N...\n')
  89. D_N = self._dataset.graphs
  90. if self.__alphas is None:
  91. self.__alphas = [1 / len(D_N)] * len(D_N)
  92. k_dis_list = [] # distance between g_star and each graph.
  93. term3 = 0
  94. for i1, a1 in enumerate(self.__alphas):
  95. for i2, a2 in enumerate(self.__alphas):
  96. term3 += a1 * a2 * self._graph_kernel.gram_matrix[i1, i2]
  97. for idx in range(len(D_N)):
  98. k_dis_list.append(compute_k_dis(idx, range(0, len(D_N)), self.__alphas, self._graph_kernel.gram_matrix, term3=term3, withterm3=True))
  99. # sort.
  100. sort_idx = np.argsort(k_dis_list)
  101. dis_gs = [k_dis_list[idis] for idis in sort_idx[0:self.__k]] # the k shortest distances.
  102. nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist())
  103. g0hat_list = [D_N[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in D_N
  104. self.__best_from_dataset = g0hat_list[0] # get the first best graph if there are muitlple.
  105. self.__k_dis_dataset = dis_gs[0]
  106. if self.__k_dis_dataset == 0: # get the exact pre-image.
  107. end_generate_preimage = time.time()
  108. self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
  109. self.__runtime_total = end_generate_preimage - start
  110. self.__preimage = self.__best_from_dataset.copy()
  111. self.__k_dis_preimage = self.__k_dis_dataset
  112. if self._verbose:
  113. print()
  114. print('=============================================================================')
  115. print('The exact pre-image is found from the input dataset.')
  116. print('-----------------------------------------------------------------------------')
  117. print('Distance in kernel space for the best graph from dataset and for preimage:', self.__k_dis_dataset)
  118. print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
  119. print('Time to generate pre-images:', self.__runtime_generate_preimage)
  120. print('Total time:', self.__runtime_total)
  121. print('=============================================================================')
  122. print()
  123. return
  124. dhat = dis_gs[0] # the nearest distance
  125. Gk = [D_N[ig].copy() for ig in sort_idx[0:self.__k]] # the k nearest neighbors
  126. Gs_nearest = [nx.convert_node_labels_to_integers(g) for g in Gk] # [g.copy() for g in Gk]
  127. # 3. start iterations.
  128. if self._verbose >= 2:
  129. print('starting iterations...')
  130. gihat_list = []
  131. dihat_list = []
  132. r = 0
  133. dis_of_each_itr = [dhat]
  134. if self.__parallel:
  135. self._kernel_options['parallel'] = None
  136. while r < self.__r_max:
  137. print('\n- r =', r)
  138. found = False
  139. dis_bests = dis_gs + dihat_list
  140. # compute numbers of edges to be inserted/deleted.
  141. # @todo what if the log is negetive? how to choose alpha (scalar)?
  142. fdgs_list = np.array(dis_bests)
  143. if np.min(fdgs_list) < 1:
  144. fdgs_list /= np.min(dis_bests)
  145. fdgs_list = [int(item) for item in np.ceil(np.log(fdgs_list))]
  146. if np.min(fdgs_list) < 1:
  147. fdgs_list = np.array(fdgs_list) + 1
  148. for ig, gs in enumerate(Gs_nearest + gihat_list):
  149. if self._verbose >= 2:
  150. print('-- computing', ig + 1, 'graphs out of', len(Gs_nearest) + len(gihat_list))
  151. gnew, dhat, found = self.__generate_l_graphs(gs, fdgs_list[ig], dhat, ig, found, term3)
  152. if found:
  153. r = 0
  154. gihat_list = [gnew]
  155. dihat_list = [dhat]
  156. else:
  157. r += 1
  158. dis_of_each_itr.append(dhat)
  159. self.__itrs += 1
  160. if self._verbose >= 2:
  161. print('Total number of iterations is', self.__itrs, '.')
  162. print('The preimage is updated', self.__num_updates, 'times.')
  163. print('The shortest distances for previous iterations are', dis_of_each_itr, '.')
  164. # get results and print.
  165. end_generate_preimage = time.time()
  166. self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm
  167. self.__runtime_total = end_generate_preimage - start
  168. self.__preimage = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0])
  169. self.__k_dis_preimage = dhat
  170. if self._verbose:
  171. print()
  172. print('=============================================================================')
  173. print('Finished generalization of preimages.')
  174. print('-----------------------------------------------------------------------------')
  175. print('Distance in kernel space for the best graph from dataset:', self.__k_dis_dataset)
  176. print('Distance in kernel space for the preimage:', self.__k_dis_preimage)
  177. print('Total number of iterations for optimizing:', self.__itrs)
  178. print('Total number of updating preimage:', self.__num_updates)
  179. print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm)
  180. print('Time to generate pre-images:', self.__runtime_generate_preimage)
  181. print('Total time:', self.__runtime_total)
  182. print('=============================================================================')
  183. print()
  184. def __generate_l_graphs(self, g_init, fdgs, dhat, ig, found, term3):
  185. if self.__parallel:
  186. gnew, dhat, found = self.__generate_l_graphs_parallel(g_init, fdgs, dhat, ig, found, term3)
  187. else:
  188. gnew, dhat, found = self.__generate_l_graphs_series(g_init, fdgs, dhat, ig, found, term3)
  189. return gnew, dhat, found
  190. def __generate_l_graphs_series(self, g_init, fdgs, dhat, ig, found, term3):
  191. gnew = None
  192. for trail in range(0, self.__l):
  193. if self._verbose >= 2:
  194. print('---', trail + 1, 'trail out of', self.__l)
  195. # add and delete edges.
  196. gtemp = g_init.copy()
  197. np.random.seed() # @todo: may not work for possible parallel.
  198. # which edges to change.
  199. # @todo: should we use just half of the adjacency matrix for undirected graphs?
  200. nb_vpairs = nx.number_of_nodes(g_init) * (nx.number_of_nodes(g_init) - 1)
  201. # @todo: what if fdgs is bigger than nb_vpairs?
  202. idx_change = random.sample(range(nb_vpairs), fdgs if
  203. fdgs < nb_vpairs else nb_vpairs)
  204. for item in idx_change:
  205. node1 = int(item / (nx.number_of_nodes(g_init) - 1))
  206. node2 = (item - node1 * (nx.number_of_nodes(g_init) - 1))
  207. if node2 >= node1: # skip the self pair.
  208. node2 += 1
  209. # @todo: is the randomness correct?
  210. if not gtemp.has_edge(node1, node2):
  211. gtemp.add_edge(node1, node2)
  212. else:
  213. gtemp.remove_edge(node1, node2)
  214. # compute new distances.
  215. kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, self._dataset.graphs, **self._kernel_options)
  216. kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options)
  217. kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))] # normalize
  218. # @todo: not correct kernel value
  219. gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
  220. gram_with_gtmp = np.concatenate((np.array([[1] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1)
  221. dnew = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True)
  222. # get the better graph preimage.
  223. if dnew <= dhat: # @todo: the new distance is smaller or also equal?
  224. if dnew < dhat:
  225. if self._verbose >= 2:
  226. print('trail =', str(trail))
  227. print('\nI am smaller!')
  228. print('index (as in D_k U {gihat} =', str(ig))
  229. print('distance:', dhat, '->', dnew)
  230. self.__num_updates += 1
  231. elif dnew == dhat:
  232. if self._verbose >= 2:
  233. print('I am equal!')
  234. dhat = dnew
  235. gnew = gtemp.copy()
  236. found = True # found better graph.
  237. return gnew, dhat, found
  238. def __generate_l_graphs_parallel(self, g_init, fdgs, dhat, ig, found, term3):
  239. gnew = None
  240. len_itr = self.__l
  241. gnew_list = [None] * len_itr
  242. dnew_list = [None] * len_itr
  243. itr = range(0, len_itr)
  244. n_jobs = multiprocessing.cpu_count()
  245. if len_itr < 100 * n_jobs:
  246. chunksize = int(len_itr / n_jobs) + 1
  247. else:
  248. chunksize = 100
  249. do_fun = partial(self._generate_graph_parallel, g_init, fdgs, term3)
  250. pool = Pool(processes=n_jobs)
  251. if self._verbose >= 2:
  252. iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize),
  253. desc='Generating l graphs', file=sys.stdout)
  254. else:
  255. iterator = pool.imap_unordered(do_fun, itr, chunksize)
  256. for idx, gnew, dnew in iterator:
  257. gnew_list[idx] = gnew
  258. dnew_list[idx] = dnew
  259. pool.close()
  260. pool.join()
  261. # check if get the better graph preimage.
  262. idx_min = np.argmin(dnew_list)
  263. dnew = dnew_list[idx_min]
  264. if dnew <= dhat: # @todo: the new distance is smaller or also equal?
  265. if dnew < dhat:
  266. if self._verbose >= 2:
  267. print('\nI am smaller!')
  268. print('index (as in D_k U {gihat} =', str(ig))
  269. print('distance:', dhat, '->', dnew)
  270. self.__num_updates += 1
  271. elif dnew == dhat:
  272. if self._verbose >= 2:
  273. print('I am equal!')
  274. dhat = dnew
  275. gnew = gnew_list[idx_min]
  276. found = True # found better graph.
  277. return gnew, dhat, found
  278. def _generate_graph_parallel(self, g_init, fdgs, term3, itr):
  279. trail = itr
  280. # add and delete edges.
  281. gtemp = g_init.copy()
  282. np.random.seed() # @todo: may not work for possible parallel.
  283. # which edges to change.
  284. # @todo: should we use just half of the adjacency matrix for undirected graphs?
  285. nb_vpairs = nx.number_of_nodes(g_init) * (nx.number_of_nodes(g_init) - 1)
  286. # @todo: what if fdgs is bigger than nb_vpairs?
  287. idx_change = random.sample(range(nb_vpairs), fdgs if
  288. fdgs < nb_vpairs else nb_vpairs)
  289. for item in idx_change:
  290. node1 = int(item / (nx.number_of_nodes(g_init) - 1))
  291. node2 = (item - node1 * (nx.number_of_nodes(g_init) - 1))
  292. if node2 >= node1: # skip the self pair.
  293. node2 += 1
  294. # @todo: is the randomness correct?
  295. if not gtemp.has_edge(node1, node2):
  296. gtemp.add_edge(node1, node2)
  297. else:
  298. gtemp.remove_edge(node1, node2)
  299. # compute new distances.
  300. kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, self._dataset.graphs, **self._kernel_options)
  301. kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options)
  302. kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))] # normalize
  303. # @todo: not correct kernel value
  304. gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
  305. gram_with_gtmp = np.concatenate((np.array([[1] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1)
  306. dnew = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True)
  307. return trail, gtemp, dnew
  308. def get_results(self):
  309. results = {}
  310. results['runtime_precompute_gm'] = self.__runtime_precompute_gm
  311. results['runtime_generate_preimage'] = self.__runtime_generate_preimage
  312. results['runtime_total'] = self.__runtime_total
  313. results['k_dis_dataset'] = self.__k_dis_dataset
  314. results['k_dis_preimage'] = self.__k_dis_preimage
  315. results['itrs'] = self.__itrs
  316. results['num_updates'] = self.__num_updates
  317. return results
  318. def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
  319. if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
  320. # if self.__state == AlgorithmState.TERMINATED:
  321. # self.__state = AlgorithmState.INITIALIZED
  322. return True
  323. return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False)
  324. @property
  325. def preimage(self):
  326. return self.__preimage
  327. @property
  328. def best_from_dataset(self):
  329. return self.__best_from_dataset
  330. @property
  331. def gram_matrix_unnorm(self):
  332. return self.__gram_matrix_unnorm
  333. @gram_matrix_unnorm.setter
  334. def gram_matrix_unnorm(self, value):
  335. self.__gram_matrix_unnorm = value

A Python package for graph kernels, graph edit distances and graph pre-image problem.