
test_parallel.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test of parallelization: find the best parallel chunksize and iteration
separation scheme.
Created on Wed Sep 26 12:09:34 2018
@author: ljia
"""
import sys
import time
from itertools import combinations_with_replacement, product, combinations
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import numpy as np
import functools
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../")
from pygraph.utils.utils import getSPGraph, direct_product
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.graphfiles import loadDataset
from pygraph.utils.kernels import deltakernel, kernelproduct


def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.

    Parameters
    ----------
    Gn : List of NetworkX graph
        List of graphs between which the kernels are calculated.
    /
    G1, G2 : NetworkX graphs
        2 graphs between which the kernel is calculated.
    edge_weight : string
        Edge attribute name corresponding to the edge weight.
    node_kernels : dict
        A dictionary of kernel functions for nodes, including 3 items: 'symb'
        for symbolic node labels, 'nsymb' for non-symbolic node labels, and
        'mix' for both. The first 2 functions take two node labels as
        parameters; the 'mix' function takes 4: a symbolic and a non-symbolic
        label for each of the two nodes. Each label is in the form of a 2-D
        array (n_samples, n_features). Each function returns a number as the
        kernel value. Ignored when nodes are unlabeled.

    Return
    ------
    Kmatrix : Numpy matrix
        Kernel matrix, each element of which is the sp kernel between 2 graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        pass
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
        except:
            pass
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself would be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))

    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrap_getSPGraph, Gn, weight)
    for i, g in tqdm(
            pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
            desc='getting sp graphs',
            file=sys.stdout):
        Gn[i] = g

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    do_partial = partial(spkernel_do, Gn, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 100:
#        chunksize, extra = divmod(len_itr, n_jobs * 4)
#        if extra:
#            chunksize += 1
#    else:
#        chunksize = 300
    for i, j, kernel in tqdm(
            pool.imap_unordered(do_partial, itr, chunksize),
            desc='calculating kernels',
            file=sys.stdout):
        Kmatrix[i][j] = kernel
        Kmatrix[j][i] = kernel
    pool.close()
    pool.join()

    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx


def spkernel_do(Gn, ds_attrs, node_label, node_kernels, ij):

    i = ij[0]
    j = ij[1]
    g1 = Gn[i]
    g2 = Gn[j]
    Kmatrix = 0

    try:
        # compute shortest path matrices first, method borrowed from FCSP.
        if ds_attrs['node_labeled']:
            # node symb and non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['mix']
                vk_dict = {}  # shortest path matrices dict
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    vk_dict[(n1[0], n2[0])] = kn(
                        n1[1][node_label], n2[1][node_label],
                        [n1[1]['attributes']], [n2[1]['attributes']])
            # node symb labeled
            else:
                kn = node_kernels['symb']
                vk_dict = {}  # shortest path matrices dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                     n2[1][node_label])
        else:
            # node non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['nsymb']
                vk_dict = {}  # shortest path matrices dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
                                                     [n2[1]['attributes']])
            # node unlabeled
            else:
                for e1, e2 in product(
                        Gn[i].edges(data=True), Gn[j].edges(data=True)):
                    if e1[2]['cost'] == e2[2]['cost']:
                        Kmatrix += 1
                return i, j, Kmatrix

        # compute graph kernels
        if ds_attrs['is_directed']:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    nk11, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[(e1[1],
                                                                   e2[1])]
                    kn1 = nk11 * nk22
                    Kmatrix += kn1
        else:
            for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    # each edge walk is counted twice, starting from both of
                    # its extreme nodes.
                    nk11, nk12, nk21, nk22 = vk_dict[(e1[0], e2[0])], vk_dict[
                        (e1[0], e2[1])], vk_dict[(e1[1], e2[0])], vk_dict[
                        (e1[1], e2[1])]
                    kn1 = nk11 * nk22
                    kn2 = nk12 * nk21
                    Kmatrix += kn1 + kn2
    except KeyError:  # missing labels or attributes
        pass

    return i, j, Kmatrix


def wrap_getSPGraph(Gn, weight, i):
    return i, getSPGraph(Gn[i], edge_weight=weight)
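

# A minimal usage sketch for spkernel (an illustrative addition, not part of
# the original test; the dataset path is an assumption). It builds the same
# 'symb'/'nsymb'/'mix' node-kernel dict that the parameter grid at the bottom
# of this script uses.
#
#     mix = functools.partial(kernelproduct, deltakernel, rbf_kernel)
#     Gn, y = loadDataset('../datasets/MAO/dataset.ds')
#     Kmatrix, run_time, idx = spkernel(
#         Gn,
#         node_label='atom',
#         node_kernels={'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mix},
#         n_jobs=multiprocessing.cpu_count(),
#         chunksize=100)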


def commonwalkkernel(*args,
                     node_label='atom',
                     edge_label='bond_type',
                     n=None,
                     weight=1,
                     compute_method=None,
                     n_jobs=None,
                     chunksize=1):
    """Calculate common walk graph kernels between graphs.
    """
    compute_method = compute_method.lower()
    # arrange all graphs in a list
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    Kmatrix = np.zeros((len(Gn), len(Gn)))
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
        node_label=node_label,
        edge_label=edge_label)
    if not ds_attrs['node_labeled']:
        for G in Gn:
            nx.set_node_attributes(G, '0', 'atom')
    if not ds_attrs['edge_labeled']:
        for G in Gn:
            nx.set_edge_attributes(G, '0', 'bond_type')
    if not ds_attrs['is_directed']:  # convert undirected graphs to directed
        Gn = [G.to_directed() for G in Gn]

    start_time = time.time()

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    pool = Pool(n_jobs)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 100:
#        chunksize, extra = divmod(len_itr, n_jobs * 4)
#        if extra:
#            chunksize += 1
#    else:
#        chunksize = 100

    # direct product graph method - exponential
    if compute_method == 'exp':
        do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
                             weight)
    # direct product graph method - geometric
    elif compute_method == 'geo':
        do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
                             weight)

    for i, j, kernel in tqdm(
            pool.imap_unordered(do_partial, itr, chunksize),
            desc='calculating kernels',
            file=sys.stdout):
        Kmatrix[i][j] = kernel
        Kmatrix[j][i] = kernel
    pool.close()
    pool.join()

    run_time = time.time() - start_time
    print(
        "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time
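

# A minimal usage sketch for commonwalkkernel (an illustrative addition; the
# dataset path is an assumption). 'geo' selects the geometric-series variant
# below, 'exp' the exponential one; weight is passed through as gamma or beta.
#
#     Gn, y = loadDataset('../datasets/MAO/dataset.ds')
#     Kmatrix, run_time = commonwalkkernel(
#         Gn, compute_method='geo', weight=1,
#         n_jobs=multiprocessing.cpu_count(), chunksize=100)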


def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
    """Calculate walk graph kernels up to n between 2 graphs using an
    exponential series.
    """
    i = ij[0]
    j = ij[1]
    g1 = Gn[i]
    g2 = Gn[j]

    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    A = nx.adjacency_matrix(gp).todense()

    ew, ev = np.linalg.eig(A)
    D = np.zeros((len(ew), len(ew)))
    # fixed: the loop variable must not shadow the pair index i, otherwise the
    # wrong matrix entry is reported back to the caller.
    for k in range(len(ew)):
        D[k][k] = np.exp(beta * ew[k])
    exp_D = ev * D * ev.T

    return i, j, exp_D.sum()


def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
    """Calculate common walk graph kernels up to n between 2 graphs using a
    geometric series.
    """
    i = ij[0]
    j = ij[1]
    g1 = Gn[i]
    g2 = Gn[j]

    # get tensor product / direct product
    gp = direct_product(g1, g2, node_label, edge_label)
    A = nx.adjacency_matrix(gp).todense()
    mat = np.identity(len(A)) - gamma * A
    try:
        return i, j, mat.I.sum()
    except np.linalg.LinAlgError:
        return i, j, np.nan
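

# Sanity check (an illustrative addition, not part of the original test): for
# a sufficiently small gamma, the matrix inverse used above equals the walk
# series sum_k (gamma * A)^k, which is what the geometric kernel counts.
#
#     A = np.matrix([[0., 1.], [1., 0.]])   # toy direct-product adjacency
#     gamma = 0.1
#     inv_sum = (np.identity(len(A)) - gamma * A).I.sum()
#     series = sum(np.linalg.matrix_power(gamma * A, k) for k in range(50))
#     assert abs(inv_sum - series.sum()) < 1e-10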


def compute_gram_matrices(datafile,
                          estimator,
                          param_grid_precomputed,
                          datafile_y=None,
                          extra_params=None,
                          ds_name='ds-unknown',
                          n_jobs=1,
                          chunksize=1):
    """
    Parameters
    ----------
    datafile : string
        Path of the dataset file.
    estimator : function
        Kernel function used for the estimation. It must return a Gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with parameter names (strings) as keys and lists of
        parameter settings to try as values, used to calculate the Gram
        matrices. This enables searching over any sequence of parameter
        settings. Parameters with length 1 are omitted.
    datafile_y : string
        Path of the file storing the y data. Whether this parameter is needed
        depends on the given dataset file.
    """
    tqdm.monitor_interval = 0

    # Load the dataset
    dataset, y = loadDataset(
        datafile, filename_y=datafile_y, extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))

    gram_matrix_time = []  # a list to store the time to calculate gram matrices

    # calculate all gram matrices
    for idx, params_out in enumerate(param_list_precomputed):
        params_out['n_jobs'] = n_jobs
        params_out['chunksize'] = chunksize
        rtn_data = estimator(dataset, **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]

        # for some kernels, some graphs in the dataset may not meet the
        # kernel's requirements on graph structure. These graphs are trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the indices of the trimmed graph list
            y = [y[idx] for idx in idx_trim]  # trim y accordingly

        Kmatrix_diag = Kmatrix.diagonal().copy()

        # remove graphs whose kernels with themselves are zero
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1
        # fixed: keep the diagonal in sync with the trimmed matrix so the
        # normalization below indexes the right entries.
        Kmatrix_diag = Kmatrix_diag[Kmatrix_diag != 0]

        # normalization
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        gram_matrix_time.append(current_run_time)

    average_gram_matrix_time = np.mean(gram_matrix_time)

    return average_gram_matrix_time


def structuralspkernel(*args,
                       node_label='atom',
                       edge_weight=None,
                       edge_label='bond_type',
                       node_kernels=None,
                       edge_kernels=None,
                       n_jobs=None,
                       chunksize=1):
    """Calculate mean average structural shortest path kernels between graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]

    weight = None
    if edge_weight is None:
        print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                % edge_weight)

    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
                    'edge_attr_dim', 'is_directed'],
        node_label=node_label, edge_label=edge_label)

    start_time = time.time()

    # get shortest paths of each graph in Gn
    splist = [[] for _ in range(len(Gn))]
    pool = Pool(n_jobs)
    getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
#    if len(Gn) < 100:
#        # use default chunksize as pool.map when iterable is less than 100
#        chunksize, extra = divmod(len(Gn), n_jobs * 4)
#        if extra:
#            chunksize += 1
#    else:
#        chunksize = 100
#    chunksize = 300  # int(len(list(itr)) / n_jobs)
    for i, sp in tqdm(
            pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
            desc='getting shortest paths',
            file=sys.stdout):
        splist[i] = sp

    Kmatrix = np.zeros((len(Gn), len(Gn)))

    # ---- use pool.imap_unordered to parallelize and track progress. ----
    do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
                         node_label, edge_label, node_kernels, edge_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
#    if len_itr < 100:
#        chunksize, extra = divmod(len_itr, n_jobs * 4)
#        if extra:
#            chunksize += 1
#    else:
#        chunksize = 100
    for i, j, kernel in tqdm(
            pool.imap_unordered(do_partial, itr, chunksize),
            desc='calculating kernels',
            file=sys.stdout):
        Kmatrix[i][j] = kernel
        Kmatrix[j][i] = kernel
    pool.close()
    pool.join()

    run_time = time.time() - start_time
    print(
        "\n --- structural shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time


def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
                          node_kernels, edge_kernels, ij):

    iglobal = ij[0]
    jglobal = ij[1]
    g1 = Gn[iglobal]
    g2 = Gn[jglobal]
    spl1 = splist[iglobal]
    spl2 = splist[jglobal]
    kernel = 0

    try:
        # First, compute shortest path matrices, method borrowed from FCSP.
        if ds_attrs['node_labeled']:
            # node symb and non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['mix']
                vk_dict = {}  # shortest path matrices dict
                for n1, n2 in product(
                        g1.nodes(data=True), g2.nodes(data=True)):
                    vk_dict[(n1[0], n2[0])] = kn(
                        n1[1][node_label], n2[1][node_label],
                        [n1[1]['attributes']], [n2[1]['attributes']])
            # node symb labeled
            else:
                kn = node_kernels['symb']
                vk_dict = {}  # shortest path matrices dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                     n2[1][node_label])
        else:
            # node non-symb labeled
            if ds_attrs['node_attr_dim'] > 0:
                kn = node_kernels['nsymb']
                vk_dict = {}  # shortest path matrices dict
                for n1 in g1.nodes(data=True):
                    for n2 in g2.nodes(data=True):
                        vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
                                                     [n2[1]['attributes']])
            # node unlabeled
            else:
                vk_dict = {}

        # Then, compute kernels between all pairs of edges, an idea that
        # extends FCSP. It suits sparse graphs, which cover most of our cases;
        # for dense graphs it would be slow.
        if ds_attrs['edge_labeled']:
            # edge symb and non-symb labeled
            if ds_attrs['edge_attr_dim'] > 0:
                ke = edge_kernels['mix']
                ek_dict = {}  # dict of edge kernels
                for e1, e2 in product(
                        g1.edges(data=True), g2.edges(data=True)):
                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                        e1[2][edge_label], e2[2][edge_label],
                        [e1[2]['attributes']], [e2[2]['attributes']])
            # edge symb labeled
            else:
                ke = edge_kernels['symb']
                ek_dict = {}
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                            e1[2][edge_label], e2[2][edge_label])
        else:
            # edge non-symb labeled
            if ds_attrs['edge_attr_dim'] > 0:
                ke = edge_kernels['nsymb']
                ek_dict = {}
                for e1 in g1.edges(data=True):
                    for e2 in g2.edges(data=True):
                        # fixed: the original called the node kernel kn here;
                        # the edge kernel ke is intended.
                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
                            [e1[2]['attributes']], [e2[2]['attributes']])
            # edge unlabeled
            else:
                ek_dict = {}

        # compute graph kernels
        if vk_dict:
            if ek_dict:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = vk_dict[(p1[0], p2[0])]
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= vk_dict[(p1[idx], p2[idx])] * \
                                    ek_dict[((p1[idx-1], p1[idx]),
                                             (p2[idx-1], p2[idx]))]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kpath = vk_dict[(p1[0], p2[0])]
                        if kpath:
                            for idx in range(1, len(p1)):
                                kpath *= vk_dict[(p1[idx], p2[idx])]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
        else:
            if ek_dict:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        if len(p1) == 0:
                            kernel += 1
                        else:
                            kpath = 1
                            for idx in range(0, len(p1) - 1):
                                kpath *= ek_dict[((p1[idx], p1[idx+1]),
                                                  (p2[idx], p2[idx+1]))]
                                if not kpath:
                                    break
                            kernel += kpath  # add up kernels of all paths
            else:
                for p1, p2 in product(spl1, spl2):
                    if len(p1) == len(p2):
                        kernel += 1

        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
    except KeyError:  # missing labels or attributes
        pass

    return iglobal, jglobal, kernel


def get_shortest_paths(G, weight, directed):
    """Get all shortest paths of a graph.
    """
    sp = []
    for n1, n2 in combinations(G.nodes(), 2):
        try:
            sptemp = nx.shortest_path(G, n1, n2, weight=weight)
            sp.append(sptemp)
            # each path is added twice, traversed from both of its end nodes.
            if not directed:
                sp.append(sptemp[::-1])
        except nx.NetworkXNoPath:  # nodes not connected
            #            sp.append([])
            pass
    # add single nodes as length-0 paths.
    sp += [[n] for n in G.nodes()]
    return sp


def wrap_getSP(Gn, weight, directed, i):
    return i, get_shortest_paths(Gn[i], weight, directed)
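

# A quick illustration (added here, not part of the original script) of what
# get_shortest_paths returns on a 3-node path graph 0-1-2, undirected: every
# node pair contributes its shortest path in both directions, and every node
# is appended as a length-0 path.
#
#     g = nx.path_graph(3)
#     get_shortest_paths(g, None, directed=False)
#     # [[0, 1], [1, 0], [0, 1, 2], [2, 1, 0], [1, 2], [2, 1], [0], [1], [2]]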


dslist = [
    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
     'task': 'regression'},  # node symb
    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds',
     'task': 'regression',
     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt'},
    # contains single-node graphs, node symb
    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds'},  # node/edge symb
    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds'},  # unlabeled
    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
    # node symb/nsymb
    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
    # node/edge symb
    {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
    {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
     'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},  # node symb
]

fig, ax = plt.subplots()
ax.set_xscale('log', nonposx='clip')
ax.set_yscale('log', nonposy='clip')
ax.set_xlabel('parallel chunksize')
ax.set_ylabel('runtime($s$)')
ax.set_title('Runtime of the sp kernel on all datasets vs. parallel chunksize')

estimator = structuralspkernel
if estimator.__name__ == 'spkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
    param_grid_precomputed = {'node_kernels': [
        {'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}
elif estimator.__name__ == 'commonwalkkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
    param_grid_precomputed = {'compute_method': ['geo'],
                              'weight': [1]}
elif estimator.__name__ == 'structuralspkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, rbf_kernel)
    param_grid_precomputed = {
        'node_kernels':
            [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}],
        'edge_kernels':
            [{'symb': deltakernel, 'nsymb': rbf_kernel, 'mix': mixkernel}]}

chunklist = list(range(10, 100, 20)) + list(range(100, 1000, 200)) + \
    list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
# chunklist = list(range(300, 1000, 200)) + list(range(1000, 10000, 2000)) + \
#     list(range(10000, 100000, 20000))
gmtmat = np.zeros((len(dslist), len(chunklist)))

for idx1, ds in enumerate(dslist):
    print()
    print(ds['name'])
    for idx2, cs in enumerate(chunklist):
        print(ds['name'], idx2, cs)
        gmtmat[idx1][idx2] = compute_gram_matrices(
            ds['dataset'],
            estimator,
            param_grid_precomputed,
            datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
            extra_params=(ds['extra_params']
                          if 'extra_params' in ds else None),
            ds_name=ds['name'],
            n_jobs=multiprocessing.cpu_count(),
            chunksize=cs)

    print()
    print(gmtmat[idx1, :])
    np.save('test_parallel/' + estimator.__name__ + '.' + ds['name'],
            gmtmat[idx1, :])

    p = ax.plot(chunklist, gmtmat[idx1, :], '.-', label=ds['name'])
    ax.legend(loc='upper center')
    plt.savefig('test_parallel/' + estimator.__name__ + str(idx1) + '.eps',
                format='eps', dpi=300)
# plt.show()
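
# To inspect the saved timings later without rerunning the benchmarks, the
# per-dataset .npy files written above can be reloaded, e.g. (an illustrative
# snippet, not part of the original script):
#
#     times = np.load('test_parallel/structuralspkernel.MAO.npy')
#     plt.loglog(chunklist, times, '.-')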

A Python package for graph kernels, graph edit distances and graph pre-image problem.