
test_parallel.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Test of parallelization: find the best chunksize and iteration separation
scheme for computing graph kernels in parallel.
Created on Wed Sep 26 12:09:34 2018
@author: ljia
"""
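# For reference, a minimal sketch of the default this sweep is measured
# against: CPython's Pool.map derives its chunksize as roughly
# len(iterable) / (n_jobs * 4), rounded up, while imap_unordered defaults to
# chunksize=1; hence the explicit chunksize argument threaded through below.
# def default_map_chunksize(len_itr, n_jobs):  # hypothetical helper
#     chunksize, extra = divmod(len_itr, n_jobs * 4)
#     return chunksize + 1 if extra else chunksize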
import sys
import time
from itertools import combinations_with_replacement, product, combinations
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm
import networkx as nx
import numpy as np
import functools
from libs import *
#import multiprocessing
from matplotlib import pyplot as plt
from sklearn.model_selection import ParameterGrid

sys.path.insert(0, "../")
from pygraph.utils.utils import getSPGraph, direct_product
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.graphfiles import loadDataset
from pygraph.utils.kernels import deltakernel, gaussiankernel, kernelproduct


def spkernel(*args,
             node_label='atom',
             edge_weight=None,
             node_kernels=None,
             n_jobs=None,
             chunksize=1):
    """Calculate shortest-path kernels between graphs.
    """
    # pre-process
    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
    weight = None
    if edge_weight is None:
        print('\n No edge weight specified. Set all weights to 1.\n')
    else:
        try:
            some_weight = list(
                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
            if isinstance(some_weight, (float, int)):
                weight = edge_weight
            else:
                print(
                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
                    % edge_weight)
        except Exception:
            print(
                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
                % edge_weight)
    ds_attrs = get_dataset_attributes(
        Gn,
        attr_names=['node_labeled', 'node_attr_dim', 'is_directed'],
        node_label=node_label)

    # remove graphs with no edges, as no sp can be found in their structures,
    # so the kernel between such a graph and itself will be zero.
    len_gn = len(Gn)
    Gn = [(idx, G) for idx, G in enumerate(Gn) if nx.number_of_edges(G) != 0]
    idx = [G[0] for G in Gn]
    Gn = [G[1] for G in Gn]
    if len(Gn) != len_gn:
        print('\n %d graphs are removed as they don\'t contain edges.\n' %
              (len_gn - len(Gn)))
    start_time = time.time()

    pool = Pool(n_jobs)
    # get shortest path graphs of Gn
    getsp_partial = partial(wrapper_getSPGraph, weight)
    itr = zip(Gn, range(0, len(Gn)))
    for i, g in tqdm(
            pool.imap_unordered(getsp_partial, itr, chunksize),
            desc='getting sp graphs', file=sys.stdout):
        Gn[i] = g
    pool.close()
    pool.join()

    Kmatrix = np.zeros((len(Gn), len(Gn)))
    # ---- use pool.imap_unordered to parallelize and track progress. ----
    def init_worker(gn_toshare):
        global G_gn
        G_gn = gn_toshare
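    # Passing Gn via initargs hands the graph list to each worker once at
    # start-up (bound to the global G_gn above), instead of pickling the
    # whole list into every (i, j) task pushed through the pool.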
    do_partial = partial(wrapper_sp_do, ds_attrs, node_label, node_kernels)
    itr = combinations_with_replacement(range(0, len(Gn)), 2)
    with Pool(processes=n_jobs, initializer=init_worker, initargs=(Gn,)) as pool:
        for i, j, kernel in tqdm(pool.imap_unordered(do_partial, itr, chunksize),
                                 desc='calculating kernels', file=sys.stdout):
            Kmatrix[i][j] = kernel
            Kmatrix[j][i] = kernel
#    # ---- direct running, normally uses a single CPU core. ----
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
#    for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
#        kernel = spkernel_do(Gn[i], Gn[j], ds_attrs, node_label, node_kernels)
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
    run_time = time.time() - start_time
    print(
        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
        % (len(Gn), run_time))

    return Kmatrix, run_time, idx
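
# A minimal usage sketch of spkernel (hypothetical call: `graphs` would be a
# list of networkx graphs with an 'atom' node label; mixkernel is defined near
# the bottom of this script):
# Kmatrix, run_time, idx = spkernel(graphs, node_label='atom',
#                                   node_kernels={'symb': deltakernel,
#                                                 'nsymb': gaussiankernel,
#                                                 'mix': mixkernel},
#                                   n_jobs=4, chunksize=100)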


def spkernel_do(g1, g2, ds_attrs, node_label, node_kernels):
    kernel = 0

    # compute shortest path matrices first, method borrowed from FCSP:
    # all node-pair kernel values are precomputed once into vk_dict, so the
    # edge comparisons below reduce to dictionary lookups.
    if ds_attrs['node_labeled']:
        # node symb and non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['mix']
            vk_dict = {}  # shortest path matrices dict
            for n1, n2 in product(
                    g1.nodes(data=True), g2.nodes(data=True)):
                vk_dict[(n1[0], n2[0])] = kn(
                    n1[1][node_label], n2[1][node_label],
                    n1[1]['attributes'], n2[1]['attributes'])
        # node symb labeled
        else:
            kn = node_kernels['symb']
            vk_dict = {}  # shortest path matrices dict
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
                                                 n2[1][node_label])
    else:
        # node non-symb labeled
        if ds_attrs['node_attr_dim'] > 0:
            kn = node_kernels['nsymb']
            vk_dict = {}  # shortest path matrices dict
            for n1 in g1.nodes(data=True):
                for n2 in g2.nodes(data=True):
                    vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
                                                 n2[1]['attributes'])
        # node unlabeled
        else:
            for e1, e2 in product(
                    g1.edges(data=True), g2.edges(data=True)):
                if e1[2]['cost'] == e2[2]['cost']:
                    kernel += 1
            return kernel
    # compute graph kernels
    if ds_attrs['is_directed']:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                nk11 = vk_dict[(e1[0], e2[0])]
                nk22 = vk_dict[(e1[1], e2[1])]
                kn1 = nk11 * nk22
                kernel += kn1
    else:
        for e1, e2 in product(g1.edges(data=True), g2.edges(data=True)):
            if e1[2]['cost'] == e2[2]['cost']:
                # each edge walk is counted twice, starting from both its extreme nodes.
                nk11 = vk_dict[(e1[0], e2[0])]
                nk12 = vk_dict[(e1[0], e2[1])]
                nk21 = vk_dict[(e1[1], e2[0])]
                nk22 = vk_dict[(e1[1], e2[1])]
                kn1 = nk11 * nk22
                kn2 = nk12 * nk21
                kernel += kn1 + kn2

    return kernel


def wrapper_sp_do(ds_attrs, node_label, node_kernels, itr):
    i = itr[0]
    j = itr[1]
    return i, j, spkernel_do(G_gn[i], G_gn[j], ds_attrs, node_label, node_kernels)


def wrapper_getSPGraph(weight, itr_item):
    g = itr_item[0]
    i = itr_item[1]
    return i, getSPGraph(g, edge_weight=weight)
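
# The wrappers above are module-level because multiprocessing can only pickle
# top-level functions: the constant arguments are bound with functools.partial,
# so only the varying items ((g, i) tuples or (i, j) pairs) travel through the
# pool's task queue.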
#
#
#def commonwalkkernel(*args,
#                     node_label='atom',
#                     edge_label='bond_type',
#                     n=None,
#                     weight=1,
#                     compute_method=None,
#                     n_jobs=None,
#                     chunksize=1):
#    """Calculate common walk graph kernels between graphs.
#    """
#    compute_method = compute_method.lower()
#    # arrange all graphs in a list
#    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#    Kmatrix = np.zeros((len(Gn), len(Gn)))
#    ds_attrs = get_dataset_attributes(
#        Gn,
#        attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
#        node_label=node_label,
#        edge_label=edge_label)
#    if not ds_attrs['node_labeled']:
#        for G in Gn:
#            nx.set_node_attributes(G, '0', 'atom')
#    if not ds_attrs['edge_labeled']:
#        for G in Gn:
#            nx.set_edge_attributes(G, '0', 'bond_type')
#    if not ds_attrs['is_directed']:  # convert
#        Gn = [G.to_directed() for G in Gn]
#
#    start_time = time.time()
#
#    # ---- use pool.imap_unordered to parallelize and track progress. ----
#    pool = Pool(n_jobs)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
##    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
##    if len_itr < 100:
##        chunksize, extra = divmod(len_itr, n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#
#    # direct product graph method - exponential
#    if compute_method == 'exp':
#        do_partial = partial(_commonwalkkernel_exp, Gn, node_label, edge_label,
#                             weight)
#    # direct product graph method - geometric
#    elif compute_method == 'geo':
#        do_partial = partial(_commonwalkkernel_geo, Gn, node_label, edge_label,
#                             weight)
#
#    for i, j, kernel in tqdm(
#            pool.imap_unordered(do_partial, itr, chunksize),
#            desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()
#
#    run_time = time.time() - start_time
#    print(
#        "\n --- kernel matrix of common walk kernel of size %d built in %s seconds ---"
#        % (len(Gn), run_time))
#
#    return Kmatrix, run_time
#
#
#def _commonwalkkernel_exp(Gn, node_label, edge_label, beta, ij):
#    """Calculate walk graph kernels up to n between 2 graphs using exponential
#    series.
#    """
#    i = ij[0]
#    j = ij[1]
#    g1 = Gn[i]
#    g2 = Gn[j]
#
#    # get tensor product / direct product
#    gp = direct_product(g1, g2, node_label, edge_label)
#    A = nx.adjacency_matrix(gp).todense()
#
#    ew, ev = np.linalg.eig(A)
#    D = np.zeros((len(ew), len(ew)))
#    for k in range(len(ew)):  # use k, not i, to avoid shadowing the index returned below
#        D[k][k] = np.exp(beta * ew[k])
#    exp_D = ev * D * ev.T
#
#    return i, j, exp_D.sum()
#
#
#def _commonwalkkernel_geo(Gn, node_label, edge_label, gamma, ij):
#    """Calculate common walk graph kernels up to n between 2 graphs using
#    geometric series.
#    """
#    i = ij[0]
#    j = ij[1]
#    g1 = Gn[i]
#    g2 = Gn[j]
#
#    # get tensor product / direct product
#    gp = direct_product(g1, g2, node_label, edge_label)
#    A = nx.adjacency_matrix(gp).todense()
#    mat = np.identity(len(A)) - gamma * A
#    try:
#        return i, j, mat.I.sum()
#    except np.linalg.LinAlgError:
#        return i, j, np.nan
#
#
#def structuralspkernel(*args,
#                       node_label='atom',
#                       edge_weight=None,
#                       edge_label='bond_type',
#                       node_kernels=None,
#                       edge_kernels=None,
#                       n_jobs=None,
#                       chunksize=1):
#    """Calculate mean average structural shortest path kernels between graphs.
#    """
#    # pre-process
#    Gn = args[0] if len(args) == 1 else [args[0], args[1]]
#
#    weight = None
#    if edge_weight is None:
#        print('\n No edge weight specified. Set all weights to 1.\n')
#    else:
#        try:
#            some_weight = list(
#                nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
#            if isinstance(some_weight, (float, int)):
#                weight = edge_weight
#            else:
#                print(
#                    '\n Edge weight with name %s is not float or integer. Set all weights to 1.\n'
#                    % edge_weight)
#        except Exception:
#            print(
#                '\n Edge weight with name "%s" is not found in the edge attributes. Set all weights to 1.\n'
#                % edge_weight)
#    ds_attrs = get_dataset_attributes(
#        Gn,
#        attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
#                    'edge_attr_dim', 'is_directed'],
#        node_label=node_label, edge_label=edge_label)
#
#    start_time = time.time()
#
#    # get shortest paths of each graph in Gn
#    splist = [[] for _ in range(len(Gn))]
#    pool = Pool(n_jobs)
#    # get shortest path graphs of Gn
#    getsp_partial = partial(wrap_getSP, Gn, weight, ds_attrs['is_directed'])
##    if len(Gn) < 100:
##        # use the same default chunksize as pool.map when the iterable has
##        # fewer than 100 items
##        chunksize, extra = divmod(len(Gn), n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#    # chunksize = 300  # int(len(list(itr)) / n_jobs)
#    for i, sp in tqdm(
#            pool.imap_unordered(getsp_partial, range(0, len(Gn)), chunksize),
#            desc='getting shortest paths',
#            file=sys.stdout):
#        splist[i] = sp
#
#    Kmatrix = np.zeros((len(Gn), len(Gn)))
#
#    # ---- use pool.imap_unordered to parallelize and track progress. ----
#    do_partial = partial(structuralspkernel_do, Gn, splist, ds_attrs,
#                         node_label, edge_label, node_kernels, edge_kernels)
#    itr = combinations_with_replacement(range(0, len(Gn)), 2)
##    len_itr = int(len(Gn) * (len(Gn) + 1) / 2)
##    if len_itr < 100:
##        chunksize, extra = divmod(len_itr, n_jobs * 4)
##        if extra:
##            chunksize += 1
##    else:
##        chunksize = 100
#    for i, j, kernel in tqdm(
#            pool.imap_unordered(do_partial, itr, chunksize),
#            desc='calculating kernels',
#            file=sys.stdout):
#        Kmatrix[i][j] = kernel
#        Kmatrix[j][i] = kernel
#    pool.close()
#    pool.join()
#
#    run_time = time.time() - start_time
#    print(
#        "\n --- shortest path kernel matrix of size %d built in %s seconds ---"
#        % (len(Gn), run_time))
#
#    return Kmatrix, run_time
#
#
#def structuralspkernel_do(Gn, splist, ds_attrs, node_label, edge_label,
#                          node_kernels, edge_kernels, ij):
#
#    iglobal = ij[0]
#    jglobal = ij[1]
#    g1 = Gn[iglobal]
#    g2 = Gn[jglobal]
#    spl1 = splist[iglobal]
#    spl2 = splist[jglobal]
#    kernel = 0
#
#    try:
#        # First, compute shortest path matrices, method borrowed from FCSP.
#        if ds_attrs['node_labeled']:
#            # node symb and non-symb labeled
#            if ds_attrs['node_attr_dim'] > 0:
#                kn = node_kernels['mix']
#                vk_dict = {}  # shortest path matrices dict
#                for n1, n2 in product(
#                        g1.nodes(data=True), g2.nodes(data=True)):
#                    vk_dict[(n1[0], n2[0])] = kn(
#                        n1[1][node_label], n2[1][node_label],
#                        [n1[1]['attributes']], [n2[1]['attributes']])
#            # node symb labeled
#            else:
#                kn = node_kernels['symb']
#                vk_dict = {}  # shortest path matrices dict
#                for n1 in g1.nodes(data=True):
#                    for n2 in g2.nodes(data=True):
#                        vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
#                                                     n2[1][node_label])
#        else:
#            # node non-symb labeled
#            if ds_attrs['node_attr_dim'] > 0:
#                kn = node_kernels['nsymb']
#                vk_dict = {}  # shortest path matrices dict
#                for n1 in g1.nodes(data=True):
#                    for n2 in g2.nodes(data=True):
#                        vk_dict[(n1[0], n2[0])] = kn([n1[1]['attributes']],
#                                                     [n2[1]['attributes']])
#            # node unlabeled
#            else:
#                vk_dict = {}
#
#        # Then, compute kernels between all pairs of edges; the idea is an
#        # extension of FCSP. It suits sparse graphs, which is the most common
#        # case we deal with. For dense graphs, it would be slow.
#        if ds_attrs['edge_labeled']:
#            # edge symb and non-symb labeled
#            if ds_attrs['edge_attr_dim'] > 0:
#                ke = edge_kernels['mix']
#                ek_dict = {}  # dict of edge kernels
#                for e1, e2 in product(
#                        g1.edges(data=True), g2.edges(data=True)):
#                    ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                        e1[2][edge_label], e2[2][edge_label],
#                        [e1[2]['attributes']], [e2[2]['attributes']])
#            # edge symb labeled
#            else:
#                ke = edge_kernels['symb']
#                ek_dict = {}
#                for e1 in g1.edges(data=True):
#                    for e2 in g2.edges(data=True):
#                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                            e1[2][edge_label], e2[2][edge_label])
#        else:
#            # edge non-symb labeled
#            if ds_attrs['edge_attr_dim'] > 0:
#                ke = edge_kernels['nsymb']
#                ek_dict = {}
#                for e1 in g1.edges(data=True):
#                    for e2 in g2.edges(data=True):
#                        # was `kn`: a bug, the edge kernel `ke` is meant here
#                        ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ke(
#                            [e1[2]['attributes']], [e2[2]['attributes']])
#            # edge unlabeled
#            else:
#                ek_dict = {}
#
#        # compute graph kernels
#        if vk_dict:
#            if ek_dict:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kpath = vk_dict[(p1[0], p2[0])]
#                        if kpath:
#                            for idx in range(1, len(p1)):
#                                kpath *= vk_dict[(p1[idx], p2[idx])] * \
#                                    ek_dict[((p1[idx-1], p1[idx]),
#                                             (p2[idx-1], p2[idx]))]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#            else:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kpath = vk_dict[(p1[0], p2[0])]
#                        if kpath:
#                            for idx in range(1, len(p1)):
#                                kpath *= vk_dict[(p1[idx], p2[idx])]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#        else:
#            if ek_dict:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        if len(p1) == 0:
#                            kernel += 1
#                        else:
#                            kpath = 1
#                            for idx in range(0, len(p1) - 1):
#                                kpath *= ek_dict[((p1[idx], p1[idx+1]),
#                                                  (p2[idx], p2[idx+1]))]
#                                if not kpath:
#                                    break
#                            kernel += kpath  # add up kernels of all paths
#            else:
#                for p1, p2 in product(spl1, spl2):
#                    if len(p1) == len(p2):
#                        kernel += 1
#
#        kernel = kernel / (len(spl1) * len(spl2))  # calculate mean average
#    except KeyError:  # missing labels or attributes
#        pass
#
#    return iglobal, jglobal, kernel
#
#
#def get_shortest_paths(G, weight, directed):
#    """Get all shortest paths of a graph.
#    """
#    sp = []
#    for n1, n2 in combinations(G.nodes(), 2):
#        try:
#            sptemp = nx.shortest_path(G, n1, n2, weight=weight)
#            sp.append(sptemp)
#            # each path is counted twice, starting from both its extreme nodes.
#            if not directed:
#                sp.append(sptemp[::-1])
#        except nx.NetworkXNoPath:  # nodes not connected
#            # sp.append([])
#            pass
#    # add single nodes as length-0 paths.
#    sp += [[n] for n in G.nodes()]
#    return sp
#
#
#def wrap_getSP(Gn, weight, directed, i):
#    return i, get_shortest_paths(Gn[i], weight, directed)


def compute_gram_matrices(datafile,
                          estimator,
                          param_grid_precomputed,
                          datafile_y=None,
                          extra_params=None,
                          ds_name='ds-unknown',
                          n_jobs=1,
                          chunksize=1):
    """
    Parameters
    ----------
    datafile : string
        Path of the dataset file.
    estimator : function
        Kernel function used for the estimation. It has to return a Gram matrix.
    param_grid_precomputed : dictionary
        Dictionary with names (string) of parameters used to calculate Gram
        matrices as keys and lists of parameter settings to try as values.
        This enables searching over any sequence of parameter settings.
        Parameters with length 1 will be omitted.
    datafile_y : string
        Path of the file storing y data. This parameter is optional depending
        on the given dataset file.
    """
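    # Illustrative shape of param_grid_precomputed (hypothetical values,
    # mirroring the grids built at the bottom of this script):
    # {'node_kernels': [{'symb': deltakernel, 'nsymb': gaussiankernel,
    #                    'mix': mixkernel}]}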
    tqdm.monitor_interval = 0

    # Load the dataset
    dataset, y_all = loadDataset(
        datafile, filename_y=datafile_y, extra_params=extra_params)

    # Grid of parameters with a discrete number of values for each.
    param_list_precomputed = list(ParameterGrid(param_grid_precomputed))
    gram_matrix_time = []  # a list to store time to calculate gram matrices

    # calculate all gram matrices
    for idx, params_out in enumerate(param_list_precomputed):
        y = y_all[:]
        params_out['n_jobs'] = n_jobs
        params_out['chunksize'] = chunksize
        rtn_data = estimator(dataset[:], **params_out)
        Kmatrix = rtn_data[0]
        current_run_time = rtn_data[1]
        # for some kernels, some graphs in datasets may not meet the
        # kernels' requirements for graph structure. These graphs are trimmed.
        if len(rtn_data) == 3:
            idx_trim = rtn_data[2]  # the index of trimmed graph list
            y = [y[idx] for idx in idx_trim]  # trim y accordingly

        # remove graphs whose kernels with themselves are zeros
        Kmatrix_diag = Kmatrix.diagonal().copy()
        nb_g_ignore = 0
        for idx, diag in enumerate(Kmatrix_diag):
            if diag == 0:
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=0)
                Kmatrix = np.delete(Kmatrix, (idx - nb_g_ignore), axis=1)
                nb_g_ignore += 1

        # normalization: K[i][j] / sqrt(K[i][i] * K[j][j]). Re-read the
        # diagonal after the deletions above so the indices stay aligned.
        Kmatrix_diag = Kmatrix.diagonal().copy()
        for i in range(len(Kmatrix)):
            for j in range(i, len(Kmatrix)):
                Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
                Kmatrix[j][i] = Kmatrix[i][j]

        gram_matrix_time.append(current_run_time)

    average_gram_matrix_time = np.mean(gram_matrix_time)

    return average_gram_matrix_time


dslist = [
    {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
     'task': 'regression'},  # node symb
    {'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
     'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },  # contains single node graph, node symb
    {'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', },  # node/edge symb
    {'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', },  # unlabeled
    {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
     'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}},  # node/edge symb
    {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
    # node symb/nsymb
    {'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
    # node/edge symb
    # {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
    # {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
    #  'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}},  # node symb
]

fig, ax = plt.subplots()
ax.set_xscale('log', nonposx='clip')
ax.set_yscale('log', nonposy='clip')
ax.set_xlabel('parallel chunksize')
ax.set_ylabel('runtime($s$)')
ax.set_title('28 cpus')
ax.grid(axis='both')

estimator = spkernel
if estimator.__name__ == 'spkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {'node_kernels': [
        {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}
elif estimator.__name__ == 'commonwalkkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {'compute_method': ['geo'],
                              'weight': [1]}
elif estimator.__name__ == 'structuralspkernel':
    mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
    param_grid_precomputed = {'node_kernels':
        [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}],
        'edge_kernels':
        [{'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}]}

#list(range(10, 100, 20)) +
#chunklist = list(range(10, 100, 20)) + list(range(100, 1000, 200)) + \
#    list(range(1000, 10000, 2000)) + list(range(10000, 100000, 20000))
#chunklist = list(range(300, 1000, 200)) + list(range(1000, 10000, 2000)) + \
#    list(range(10000, 100000, 20000))
chunklist = list(range(10, 100, 10)) + list(range(100, 1000, 100)) + \
    list(range(1000, 10000, 1000)) + list(range(10000, 100000, 10000))
#chunklist = list(range(1000, 10000, 1000))
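# The chunksizes above are roughly log-spaced (10..90, 100..900, 1000..9000,
# 10000..90000), matching the log-log axes configured for the plot.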
gmtmat = np.zeros((len(dslist), len(chunklist)))
cpus = 28
for idx1, ds in enumerate(dslist):
    print()
    print(ds['name'])
    for idx2, cs in enumerate(chunklist):
        print(ds['name'], idx2, cs)
        gmtmat[idx1][idx2] = compute_gram_matrices(
            ds['dataset'],
            estimator,
            param_grid_precomputed,
            datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
            extra_params=(ds['extra_params']
                          if 'extra_params' in ds else None),
            ds_name=ds['name'],
            n_jobs=cpus,
            chunksize=cs)

    print()
    print(gmtmat[idx1, :])
    np.save('test_parallel/' + estimator.__name__ + '.' + ds['name'] + '_' +
            str(idx1), gmtmat[idx1, :])

    p = ax.plot(chunklist, gmtmat[idx1, :], '.-', label=ds['name'], zorder=3)
    ax.legend(loc='upper right', ncol=3, labelspacing=0.1, handletextpad=0.4,
              columnspacing=0.6)
    plt.savefig('test_parallel/' + estimator.__name__ + str(idx1) + '_' +
                str(cpus) + '.eps', format='eps', dpi=300)
# plt.show()

A Python package for graph kernels, graph edit distances and the graph pre-image problem.