You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

rwalk_sym.py 38 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """
  4. Created on Sun Dec 23 16:53:57 2018
  5. @author: ljia
  6. @references: S Vichy N Vishwanathan, Nicol N Schraudolph, Risi Kondor, and
  7. Karsten M Borgwardt. Graph kernels. Journal of Machine Learning Research,
  8. 11(Apr):1201–1242, 2010.
  9. """
  10. import sys
  11. sys.path.insert(0, "../")
  12. import time
  13. from functools import partial
  14. from tqdm import tqdm
  15. import networkx as nx
  16. import numpy as np
  17. from scipy.sparse import identity, kron
  18. from scipy.sparse.linalg import cg
  19. from scipy.optimize import fixed_point
  20. from gklearn.utils.graphdataset import get_dataset_attributes
  21. from gklearn.utils.parallel import parallel_gm
  22. def randomwalkkernel(*args,
  23. # params for all method.
  24. compute_method=None,
  25. weight=1,
  26. p=None,
  27. q=None,
  28. edge_weight=None,
  29. # params for conjugate and fp method.
  30. node_kernels=None,
  31. edge_kernels=None,
  32. node_label='atom',
  33. edge_label='bond_type',
  34. # params for spectral method.
  35. sub_kernel=None,
  36. n_jobs=None):
  37. """Calculate random walk graph kernels.
  38. Parameters
  39. ----------
  40. Gn : List of NetworkX graph
  41. List of graphs between which the kernels are calculated.
  42. /
  43. G1, G2 : NetworkX graphs
  44. 2 graphs between which the kernel is calculated.
  45. node_label : string
  46. node attribute used as label. The default node label is atom.
  47. edge_label : string
  48. edge attribute used as label. The default edge label is bond_type.
  49. h : integer
  50. Longest length of walks.
  51. method : string
  52. Method used to compute the random walk kernel. Available methods are 'sylvester', 'conjugate', 'fp', 'spectral' and 'kron'.
  53. Return
  54. ------
  55. Kmatrix : Numpy matrix
  56. Kernel matrix, each element of which is the path kernel up to d between 2 praphs.
  57. """
  58. compute_method = compute_method.lower()
  59. Gn = args[0] if len(args) == 1 else [args[0], args[1]]
  60. eweight = None
  61. if edge_weight == None:
  62. print('\n None edge weight specified. Set all weight to 1.\n')
  63. else:
  64. try:
  65. some_weight = list(
  66. nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
  67. if isinstance(some_weight, float) or isinstance(some_weight, int):
  68. eweight = edge_weight
  69. else:
  70. print(
  71. '\n Edge weight with name %s is not float or integer. Set all weight to 1.\n'
  72. % edge_weight)
  73. except:
  74. print(
  75. '\n Edge weight with name "%s" is not found in the edge attributes. Set all weight to 1.\n'
  76. % edge_weight)
  77. ds_attrs = get_dataset_attributes(
  78. Gn,
  79. attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
  80. 'edge_attr_dim', 'is_directed'],
  81. node_label=node_label,
  82. edge_label=edge_label)
  83. ds_attrs['node_attr_dim'] = 0
  84. ds_attrs['edge_attr_dim'] = 0
  85. # remove graphs with no edges, as no walk can be found in their structures,
  86. # so the weight matrix between such a graph and itself might be zero.
  87. len_gn = len(Gn)
  88. Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
  89. idx = [G[0] for G in Gn]
  90. Gn = [G[1] for G in Gn]
  91. if len(Gn) != len_gn:
  92. print('\n %d graphs are removed as they don\'t contain edges.\n' %
  93. (len_gn - len(Gn)))
  94. start_time = time.time()
  95. # # get vertex and edge concatenated labels for each graph
  96. # label_list, d = getLabels(Gn, node_label, edge_label, ds_attrs['is_directed'])
  97. # gmf = filterGramMatrix(A_wave_list[0], label_list[0], ('C', '0', 'O'), ds_attrs['is_directed'])
  98. if compute_method == 'sylvester':
  99. import warnings
  100. warnings.warn('All labels are ignored.')
  101. Kmatrix = _sylvester_equation(Gn, weight, p, q, eweight, n_jobs)
  102. elif compute_method == 'conjugate':
  103. Kmatrix = _conjugate_gradient(Gn, weight, p, q, ds_attrs,
  104. node_kernels, edge_kernels,
  105. node_label, edge_label, eweight, n_jobs)
  106. elif compute_method == 'fp':
  107. Kmatrix = _fixed_point(Gn, weight, p, q, ds_attrs, node_kernels,
  108. edge_kernels, node_label, edge_label,
  109. eweight, n_jobs)
  110. elif compute_method == 'spectral':
  111. import warnings
  112. warnings.warn('All labels are ignored. Only works for undirected graphs.')
  113. Kmatrix = _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs)
  114. elif compute_method == 'kron':
  115. for i in range(0, len(Gn)):
  116. for j in range(i, len(Gn)):
  117. Kmatrix[i][j] = _randomwalkkernel_kron(Gn[i], Gn[j],
  118. node_label, edge_label)
  119. Kmatrix[j][i] = Kmatrix[i][j]
  120. else:
  121. raise Exception(
  122. 'compute method name incorrect. Available methods: "sylvester", "conjugate", "fp", "spectral" and "kron".'
  123. )
  124. run_time = time.time() - start_time
  125. print(
  126. "\n --- kernel matrix of random walk kernel of size %d built in %s seconds ---"
  127. % (len(Gn), run_time))
  128. return Kmatrix, run_time, idx
  129. ###############################################################################
def _sylvester_equation(Gn, lmda, p, q, eweight, n_jobs):
    """Calculate walk graph kernels up to n between 2 graphs using Sylvester method.

    Parameters
    ----------
    Gn : list of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Decay factor of longer walks.
    p, q :
        Initial and stopping probability distributions. Only the uniform
        case (both None) is implemented; otherwise the zero matrix is
        returned unchanged.
    eweight : string or None
        Edge attribute used as edge weight.
    n_jobs : int
        Number of parallel jobs.

    Return
    ------
    Kmatrix : numpy array
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    if q == None:
        # don't normalize adjacency matrices if q is a uniform vector. Note
        # A_wave_list accually contains the transposes of the adjacency matrices.
        A_wave_list = [
            nx.adjacency_matrix(G, eweight).todense().transpose() for G in tqdm(
                Gn, desc='compute adjacency matrices', file=sys.stdout)
        ]
        if p == None: # p is uniform distribution as default.
            def init_worker(Awl_toshare):
                # Share the adjacency matrices with worker processes through
                # the module-level global read by wrapper_se_do.
                global G_Awl
                G_Awl = Awl_toshare
            do_partial = partial(wrapper_se_do, lmda)
            # fills Kmatrix in place over all (i, j) pairs, i <= j.
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(A_wave_list,), n_jobs=n_jobs)
    return Kmatrix
  187. def wrapper_se_do(lmda, itr):
  188. i = itr[0]
  189. j = itr[1]
  190. return i, j, _se_do(G_Awl[i], G_Awl[j], lmda)
  191. def _se_do(A_wave1, A_wave2, lmda):
  192. from control import dlyap
  193. S = lmda * A_wave2
  194. T_t = A_wave1
  195. # use uniform distribution if there is no prior knowledge.
  196. nb_pd = len(A_wave1) * len(A_wave2)
  197. p_times_uni = 1 / nb_pd
  198. M0 = np.full((len(A_wave2), len(A_wave1)), p_times_uni)
  199. X = dlyap(S, T_t, M0)
  200. X = np.reshape(X, (-1, 1), order='F')
  201. # use uniform distribution if there is no prior knowledge.
  202. q_times = np.full((1, nb_pd), p_times_uni)
  203. return np.dot(q_times, X)
  204. ###############################################################################
def _conjugate_gradient(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                        node_label, edge_label, eweight, n_jobs):
    """Calculate walk graph kernels up to n between 2 graphs using conjugate method.

    Parameters
    ----------
    Gn : list of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Decay factor of longer walks.
    p, q :
        Initial and stopping probability distributions. Only the uniform
        case (both None) is implemented; otherwise the zero matrix is
        returned unchanged.
    ds_attrs : dict
        Dataset attributes (labeling, directedness) from
        get_dataset_attributes.
    node_kernels, edge_kernels : dict
        Kernel functions between node/edge labels.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    eweight : string or None
        Edge attribute used as edge weight.
    n_jobs : int
        Number of parallel jobs.

    Return
    ------
    Kmatrix : numpy array
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [nx.convert_node_labels_to_integers(
        g, first_label=0, label_attribute='label_orignal') for g in tqdm(
        Gn, desc='reindex vertices', file=sys.stdout)]
    if p == None and q == None: # p and q are uniform distributions as default.
        def init_worker(gn_toshare):
            # Share the reindexed graphs with worker processes through the
            # module-level global read by wrapper_cg_labled_do.
            global G_gn
            G_gn = gn_toshare
        do_partial = partial(wrapper_cg_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        # fills Kmatrix in place over all (i, j) pairs, i <= j.
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn,), n_jobs=n_jobs)
    return Kmatrix
  264. def wrapper_cg_unlabled_do(lmda, itr):
  265. i = itr[0]
  266. j = itr[1]
  267. return i, j, _cg_unlabled_do(G_Awl[i], G_Awl[j], lmda)
  268. def _cg_unlabled_do(A_wave1, A_wave2, lmda):
  269. nb_pd = len(A_wave1) * len(A_wave2)
  270. p_times_uni = 1 / nb_pd
  271. w_times = kron(A_wave1, A_wave2).todense()
  272. A = identity(w_times.shape[0]) - w_times * lmda
  273. b = np.full((nb_pd, 1), p_times_uni)
  274. x, _ = cg(A, b)
  275. # use uniform distribution if there is no prior knowledge.
  276. q_times = np.full((1, nb_pd), p_times_uni)
  277. return np.dot(q_times, x)
  278. def wrapper_cg_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  279. edge_label, lmda, itr):
  280. i = itr[0]
  281. j = itr[1]
  282. return i, j, _cg_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  283. node_label, edge_kernels, edge_label, lmda)
  284. def _cg_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  285. edge_kernels, edge_label, lmda):
  286. # Frist, ompute kernels between all pairs of nodes, method borrowed
  287. # from FCSP. It is faster than directly computing all edge kernels
  288. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  289. # graphs compared, which is the most case we went though. For very
  290. # sparse graphs, this would be slow.
  291. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  292. # Compute weight matrix of the direct product graph.
  293. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  294. edge_kernels, edge_label)
  295. # use uniform distribution if there is no prior knowledge.
  296. p_times_uni = 1 / w_dim
  297. A = identity(w_times.shape[0]) - w_times * lmda
  298. b = np.full((w_dim, 1), p_times_uni)
  299. x, _ = cg(A, b)
  300. # use uniform distribution if there is no prior knowledge.
  301. q_times = np.full((1, w_dim), p_times_uni)
  302. return np.dot(q_times, x)
  303. ###############################################################################
def _fixed_point(Gn, lmda, p, q, ds_attrs, node_kernels, edge_kernels,
                 node_label, edge_label, eweight, n_jobs):
    """Calculate walk graph kernels up to n between 2 graphs using Fixed-Point method.

    Parameters
    ----------
    Gn : list of NetworkX graph
        Graphs between which the kernels are calculated.
    lmda : float
        Decay factor of longer walks.
    p, q :
        Initial and stopping probability distributions. Only the uniform
        case (both None) is implemented; otherwise the zero matrix is
        returned unchanged.
    ds_attrs : dict
        Dataset attributes (labeling, directedness) from
        get_dataset_attributes.
    node_kernels, edge_kernels : dict
        Kernel functions between node/edge labels.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.
    eweight : string or None
        Edge attribute used as edge weight.
    n_jobs : int
        Number of parallel jobs.

    Return
    ------
    Kmatrix : numpy array
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    # reindex nodes using consecutive integers for convenience of kernel calculation.
    Gn = [nx.convert_node_labels_to_integers(
        g, first_label=0, label_attribute='label_orignal') for g in tqdm(
        Gn, desc='reindex vertices', file=sys.stdout)]
    if p == None and q == None: # p and q are uniform distributions as default.
        def init_worker(gn_toshare):
            # Share the reindexed graphs with worker processes through the
            # module-level global read by wrapper_fp_labled_do.
            global G_gn
            G_gn = gn_toshare
        do_partial = partial(wrapper_fp_labled_do, ds_attrs, node_kernels,
                             node_label, edge_kernels, edge_label, lmda)
        # fills Kmatrix in place over all (i, j) pairs, i <= j.
        parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                    glbv=(Gn,), n_jobs=n_jobs)
    return Kmatrix
  363. def wrapper_fp_labled_do(ds_attrs, node_kernels, node_label, edge_kernels,
  364. edge_label, lmda, itr):
  365. i = itr[0]
  366. j = itr[1]
  367. return i, j, _fp_labled_do(G_gn[i], G_gn[j], ds_attrs, node_kernels,
  368. node_label, edge_kernels, edge_label, lmda)
  369. def _fp_labled_do(g1, g2, ds_attrs, node_kernels, node_label,
  370. edge_kernels, edge_label, lmda):
  371. # Frist, ompute kernels between all pairs of nodes, method borrowed
  372. # from FCSP. It is faster than directly computing all edge kernels
  373. # when $d_1d_2>2$, where $d_1$ and $d_2$ are vertex degrees of the
  374. # graphs compared, which is the most case we went though. For very
  375. # sparse graphs, this would be slow.
  376. vk_dict = computeVK(g1, g2, ds_attrs, node_kernels, node_label)
  377. # Compute weight matrix of the direct product graph.
  378. w_times, w_dim = computeW(g1, g2, vk_dict, ds_attrs,
  379. edge_kernels, edge_label)
  380. # use uniform distribution if there is no prior knowledge.
  381. p_times_uni = 1 / w_dim
  382. p_times = np.full((w_dim, 1), p_times_uni)
  383. x = fixed_point(func_fp, p_times, args=(p_times, lmda, w_times),
  384. xtol=1e-06, maxiter=1000)
  385. # use uniform distribution if there is no prior knowledge.
  386. q_times = np.full((1, w_dim), p_times_uni)
  387. return np.dot(q_times, x)
  388. def func_fp(x, p_times, lmda, w_times):
  389. haha = w_times * x
  390. haha = lmda * haha
  391. haha = p_times + haha
  392. return p_times + lmda * np.dot(w_times, x)
  393. ###############################################################################
def _spectral_decomposition(Gn, weight, p, q, sub_kernel, eweight, n_jobs):
    """Calculate walk graph kernels up to n between 2 unlabeled graphs using
    spectral decomposition method. Labels will be ignored.

    Parameters
    ----------
    Gn : list of NetworkX graph
        Graphs between which the kernels are calculated.
    weight : float
        Decay factor of longer walks.
    p, q :
        Initial and stopping probability distributions. Only the uniform
        case (both None) is implemented; otherwise the zero matrix is
        returned unchanged.
    sub_kernel : string
        'exp' or 'geo', passed through to _sd_do.
    eweight : string or None
        Edge attribute used as edge weight.
    n_jobs : int
        Number of parallel jobs.

    Return
    ------
    Kmatrix : numpy array
        Kernel matrix, each element of which is the kernel between 2 graphs.
    """
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    if q == None:
        # precompute the spectral decomposition of each graph.
        P_list = []
        D_list = []
        for G in tqdm(Gn, desc='spectral decompose', file=sys.stdout):
            # don't normalize adjacency matrices if q is a uniform vector. Note
            # A accually is the transpose of the adjacency matrix.
            A = nx.adjacency_matrix(G, eweight).todense().transpose()
            ew, ev = np.linalg.eig(A)
            D_list.append(ew)
            P_list.append(ev)
        if p == None: # p is uniform distribution as default.
            # uniform starting distribution (row vector) per graph.
            q_T_list = [np.full((1, nx.number_of_nodes(G)), 1 / nx.number_of_nodes(G)) for G in Gn]
            def init_worker(q_T_toshare, P_toshare, D_toshare):
                # Share the distributions and decompositions with worker
                # processes through module-level globals read by wrapper_sd_do.
                global G_q_T, G_P, G_D
                G_q_T = q_T_toshare
                G_P = P_toshare
                G_D = D_toshare
            do_partial = partial(wrapper_sd_do, weight, sub_kernel)
            # fills Kmatrix in place over all (i, j) pairs, i <= j.
            parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
                        glbv=(q_T_list, P_list, D_list), n_jobs=n_jobs)
    return Kmatrix
  446. def wrapper_sd_do(weight, sub_kernel, itr):
  447. i = itr[0]
  448. j = itr[1]
  449. return i, j, _sd_do(G_q_T[i], G_q_T[j], G_P[i], G_P[j], G_D[i], G_D[j],
  450. weight, sub_kernel)
  451. def _sd_do(q_T1, q_T2, P1, P2, D1, D2, weight, sub_kernel):
  452. # use uniform distribution if there is no prior knowledge.
  453. kl = kron(np.dot(q_T1, P1), np.dot(q_T2, P2)).todense()
  454. # @todo: this is not be needed when p = q (kr = kl.T) for undirected graphs
  455. # kr = kron(np.dot(P_inv_list[i], q_list[i]), np.dot(P_inv_list[j], q_list[j])).todense()
  456. if sub_kernel == 'exp':
  457. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  458. kmiddle = np.diag(np.exp(weight * D_diag))
  459. elif sub_kernel == 'geo':
  460. D_diag = np.array([d1 * d2 for d1 in D1 for d2 in D2])
  461. kmiddle = np.diag(weight * D_diag)
  462. kmiddle = np.identity(len(kmiddle)) - weight * kmiddle
  463. kmiddle = np.linalg.inv(kmiddle)
  464. return np.dot(np.dot(kl, kmiddle), kl.T)[0, 0]
  465. ###############################################################################
def _randomwalkkernel_kron(G1, G2, node_label, edge_label):
    """Calculate walk graph kernels up to n between 2 graphs using nearest Kronecker product approximation method.

    Parameters
    ----------
    G1, G2 : NetworkX graph
        Graphs between which the kernel is calculated.
    node_label : string
        node attribute used as label.
    edge_label : string
        edge attribute used as label.

    Return
    ------
    kernel : float
        Kernel between 2 graphs.

    Note
    ----
    Not implemented yet — currently returns None, so the 'kron' compute
    method produces an all-zero/None kernel matrix.
    """
    pass
  482. ###############################################################################
  483. def getLabels(Gn, node_label, edge_label, directed):
  484. """Get symbolic labels of a graph dataset, where vertex labels are dealt
  485. with by concatenating them to the edge labels of adjacent edges.
  486. """
  487. label_list = []
  488. label_set = set()
  489. for g in Gn:
  490. label_g = {}
  491. for e in g.edges(data=True):
  492. nl1 = g.node[e[0]][node_label]
  493. nl2 = g.node[e[1]][node_label]
  494. if not directed and nl1 > nl2:
  495. nl1, nl2 = nl2, nl1
  496. label = (nl1, e[2][edge_label], nl2)
  497. label_g[(e[0], e[1])] = label
  498. label_list.append(label_g)
  499. label_set = set([l for lg in label_list for l in lg.values()])
  500. return label_list, len(label_set)
  501. def filterGramMatrix(gmt, label_dict, label, directed):
  502. """Compute (the transpose of) the Gram matrix filtered by a label.
  503. """
  504. gmf = np.zeros(gmt.shape)
  505. for (n1, n2), l in label_dict.items():
  506. if l == label:
  507. gmf[n2, n1] = gmt[n2, n1]
  508. if not directed:
  509. gmf[n1, n2] = gmt[n1, n2]
  510. return gmf
  511. def computeVK(g1, g2, ds_attrs, node_kernels, node_label):
  512. '''Compute vertex kernels between vertices of two graphs.
  513. '''
  514. vk_dict = {} # shortest path matrices dict
  515. if ds_attrs['node_labeled']:
  516. # node symb and non-synb labeled
  517. if ds_attrs['node_attr_dim'] > 0:
  518. kn = node_kernels['mix']
  519. for n1 in g1.nodes(data=True):
  520. for n2 in g2.nodes(data=True):
  521. vk_dict[(n1[0], n2[0])] = kn(
  522. n1[1][node_label], n2[1][node_label],
  523. n1[1]['attributes'], n2[1]['attributes'])
  524. # node symb labeled
  525. else:
  526. kn = node_kernels['symb']
  527. for n1 in g1.nodes(data=True):
  528. for n2 in g2.nodes(data=True):
  529. vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
  530. n2[1][node_label])
  531. else:
  532. # node non-synb labeled
  533. if ds_attrs['node_attr_dim'] > 0:
  534. kn = node_kernels['nsymb']
  535. for n1 in g1.nodes(data=True):
  536. for n2 in g2.nodes(data=True):
  537. vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
  538. n2[1]['attributes'])
  539. # node unlabeled
  540. else:
  541. pass
  542. return vk_dict
  543. def computeW(g1, g2, vk_dict, ds_attrs, edge_kernels, edge_label):
  544. '''Compute weight matrix of the direct product graph.
  545. '''
  546. w_dim = nx.number_of_nodes(g1) * nx.number_of_nodes(g2)
  547. w_times = np.zeros((w_dim, w_dim))
  548. if vk_dict: # node labeled
  549. if ds_attrs['is_directed']:
  550. if ds_attrs['edge_labeled']:
  551. # edge symb and non-synb labeled
  552. if ds_attrs['edge_attr_dim'] > 0:
  553. ke = edge_kernels['mix']
  554. for e1 in g1.edges(data=True):
  555. for e2 in g2.edges(data=True):
  556. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  557. e1[2]['attributes'], e2[2]['attributes'])
  558. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  559. e1[1] * nx.number_of_nodes(g2) + e2[1])
  560. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  561. * ek_temp * vk_dict[(e1[1], e2[1])]
  562. # edge symb labeled
  563. else:
  564. ke = edge_kernels['symb']
  565. for e1 in g1.edges(data=True):
  566. for e2 in g2.edges(data=True):
  567. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  568. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  569. e1[1] * nx.number_of_nodes(g2) + e2[1])
  570. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  571. * ek_temp * vk_dict[(e1[1], e2[1])]
  572. else:
  573. # edge non-synb labeled
  574. if ds_attrs['edge_attr_dim'] > 0:
  575. ke = edge_kernels['nsymb']
  576. for e1 in g1.edges(data=True):
  577. for e2 in g2.edges(data=True):
  578. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  579. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  580. e1[1] * nx.number_of_nodes(g2) + e2[1])
  581. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  582. * ek_temp * vk_dict[(e1[1], e2[1])]
  583. # edge unlabeled
  584. else:
  585. for e1 in g1.edges(data=True):
  586. for e2 in g2.edges(data=True):
  587. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  588. e1[1] * nx.number_of_nodes(g2) + e2[1])
  589. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  590. * vk_dict[(e1[1], e2[1])]
  591. else: # undirected
  592. if ds_attrs['edge_labeled']:
  593. # edge symb and non-synb labeled
  594. if ds_attrs['edge_attr_dim'] > 0:
  595. ke = edge_kernels['mix']
  596. for e1 in g1.edges(data=True):
  597. for e2 in g2.edges(data=True):
  598. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  599. e1[2]['attributes'], e2[2]['attributes'])
  600. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  601. e1[1] * nx.number_of_nodes(g2) + e2[1])
  602. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  603. * ek_temp * vk_dict[(e1[1], e2[1])] \
  604. + vk_dict[(e1[0], e2[1])] \
  605. * ek_temp * vk_dict[(e1[1], e2[0])]
  606. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  607. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  608. e1[1] * nx.number_of_nodes(g2) + e2[0])
  609. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  610. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  611. # edge symb labeled
  612. else:
  613. ke = edge_kernels['symb']
  614. for e1 in g1.edges(data=True):
  615. for e2 in g2.edges(data=True):
  616. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  617. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  618. e1[1] * nx.number_of_nodes(g2) + e2[1])
  619. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  620. * ek_temp * vk_dict[(e1[1], e2[1])] \
  621. + vk_dict[(e1[0], e2[1])] \
  622. * ek_temp * vk_dict[(e1[1], e2[0])]
  623. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  624. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  625. e1[1] * nx.number_of_nodes(g2) + e2[0])
  626. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  627. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  628. else:
  629. # edge non-synb labeled
  630. if ds_attrs['edge_attr_dim'] > 0:
  631. ke = edge_kernels['nsymb']
  632. for e1 in g1.edges(data=True):
  633. for e2 in g2.edges(data=True):
  634. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  635. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  636. e1[1] * nx.number_of_nodes(g2) + e2[1])
  637. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  638. * ek_temp * vk_dict[(e1[1], e2[1])] \
  639. + vk_dict[(e1[0], e2[1])] \
  640. * ek_temp * vk_dict[(e1[1], e2[0])]
  641. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  642. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  643. e1[1] * nx.number_of_nodes(g2) + e2[0])
  644. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  645. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  646. # edge unlabeled
  647. else:
  648. for e1 in g1.edges(data=True):
  649. for e2 in g2.edges(data=True):
  650. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  651. e1[1] * nx.number_of_nodes(g2) + e2[1])
  652. w_times[w_idx] = vk_dict[(e1[0], e2[0])] \
  653. * vk_dict[(e1[1], e2[1])] \
  654. + vk_dict[(e1[0], e2[1])] \
  655. * vk_dict[(e1[1], e2[0])]
  656. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  657. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  658. e1[1] * nx.number_of_nodes(g2) + e2[0])
  659. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  660. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  661. else: # node unlabeled
  662. if ds_attrs['is_directed']:
  663. if ds_attrs['edge_labeled']:
  664. # edge symb and non-synb labeled
  665. if ds_attrs['edge_attr_dim'] > 0:
  666. ke = edge_kernels['mix']
  667. for e1 in g1.edges(data=True):
  668. for e2 in g2.edges(data=True):
  669. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  670. e1[2]['attributes'], e2[2]['attributes'])
  671. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  672. e1[1] * nx.number_of_nodes(g2) + e2[1])
  673. w_times[w_idx] = ek_temp
  674. # edge symb labeled
  675. else:
  676. ke = edge_kernels['symb']
  677. for e1 in g1.edges(data=True):
  678. for e2 in g2.edges(data=True):
  679. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  680. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  681. e1[1] * nx.number_of_nodes(g2) + e2[1])
  682. w_times[w_idx] = ek_temp
  683. else:
  684. # edge non-synb labeled
  685. if ds_attrs['edge_attr_dim'] > 0:
  686. ke = edge_kernels['nsymb']
  687. for e1 in g1.edges(data=True):
  688. for e2 in g2.edges(data=True):
  689. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  690. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  691. e1[1] * nx.number_of_nodes(g2) + e2[1])
  692. w_times[w_idx] = ek_temp
  693. # edge unlabeled
  694. else:
  695. for e1 in g1.edges(data=True):
  696. for e2 in g2.edges(data=True):
  697. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  698. e1[1] * nx.number_of_nodes(g2) + e2[1])
  699. w_times[w_idx] = 1
  700. else: # undirected
  701. if ds_attrs['edge_labeled']:
  702. # edge symb and non-synb labeled
  703. if ds_attrs['edge_attr_dim'] > 0:
  704. ke = edge_kernels['mix']
  705. for e1 in g1.edges(data=True):
  706. for e2 in g2.edges(data=True):
  707. ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
  708. e1[2]['attributes'], e2[2]['attributes'])
  709. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  710. e1[1] * nx.number_of_nodes(g2) + e2[1])
  711. w_times[w_idx] = ek_temp
  712. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  713. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  714. e1[1] * nx.number_of_nodes(g2) + e2[0])
  715. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  716. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  717. # edge symb labeled
  718. else:
  719. ke = edge_kernels['symb']
  720. for e1 in g1.edges(data=True):
  721. for e2 in g2.edges(data=True):
  722. ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
  723. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  724. e1[1] * nx.number_of_nodes(g2) + e2[1])
  725. w_times[w_idx] = ek_temp
  726. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  727. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  728. e1[1] * nx.number_of_nodes(g2) + e2[0])
  729. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  730. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  731. else:
  732. # edge non-synb labeled
  733. if ds_attrs['edge_attr_dim'] > 0:
  734. ke = edge_kernels['nsymb']
  735. for e1 in g1.edges(data=True):
  736. for e2 in g2.edges(data=True):
  737. ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
  738. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  739. e1[1] * nx.number_of_nodes(g2) + e2[1])
  740. w_times[w_idx] = ek_temp
  741. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  742. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  743. e1[1] * nx.number_of_nodes(g2) + e2[0])
  744. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  745. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  746. # edge unlabeled
  747. else:
  748. for e1 in g1.edges(data=True):
  749. for e2 in g2.edges(data=True):
  750. w_idx = (e1[0] * nx.number_of_nodes(g2) + e2[0],
  751. e1[1] * nx.number_of_nodes(g2) + e2[1])
  752. w_times[w_idx] = 1
  753. w_times[w_idx[1], w_idx[0]] = w_times[w_idx[0], w_idx[1]]
  754. w_idx2 = (e1[0] * nx.number_of_nodes(g2) + e2[1],
  755. e1[1] * nx.number_of_nodes(g2) + e2[0])
  756. w_times[w_idx2[0], w_idx2[1]] = w_times[w_idx[0], w_idx[1]]
  757. w_times[w_idx2[1], w_idx2[0]] = w_times[w_idx[0], w_idx[1]]
  758. return w_times, w_dim

A Python package for graph kernels, graph edit distances and the graph pre-image problem.