- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Sun Dec 23 16:42:48 2018
-
- @author: ljia
- """
-
- import sys
- import time
- from itertools import combinations, product
- from functools import partial
- from multiprocessing import Pool, cpu_count
- from tqdm import tqdm
-
- import networkx as nx
- import numpy as np
-
- sys.path.insert(0, "../")
-
- from gklearn.utils.graphdataset import get_dataset_attributes
- from gklearn.utils.parallel import parallel_gm
-
-
- def structuralspkernel(*args,
- node_label='atom',
- edge_weight=None,
- edge_label='bond_type',
- node_kernels=None,
- edge_kernels=None,
- n_jobs=None):
- """Calculate mean average structural shortest path kernels between graphs.
-
- Parameters
- ----------
- Gn : List of NetworkX graphs
- List of graphs between which the kernels are calculated.
- /
- G1, G2 : NetworkX graphs
- Two graphs between which the kernel is calculated.
- node_label : string
- Node attribute used as label. The default node label is 'atom'.
- edge_weight : string
- Edge attribute name corresponding to the edge weight.
- edge_label : string
- Edge attribute used as label. The default edge label is 'bond_type'.
- node_kernels: dict
- A dictionary of kernel functions for nodes, including 3 items: 'symb'
- for symbolic node labels, 'nsymb' for non-symbolic node labels, 'mix'
- for both labels. The first 2 functions take two node labels as
- parameters, and the 'mix' function takes 4 parameters: a symbolic and a
- non-symbolic label for each of the two nodes. Each label is given as a
- 2-D array of shape (n_samples, n_features). Each function returns a
- number as the kernel value. Ignored when nodes are unlabeled.
- edge_kernels: dict
- A dictionary of kernel functions for edges, including 3 items: 'symb'
- for symbolic edge labels, 'nsymb' for non-symbolic edge labels, 'mix'
- for both labels. The first 2 functions take two edge labels as
- parameters, and the 'mix' function takes 4 parameters: a symbolic and a
- non-symbolic label for each of the two edges. Each label is given as a
- 2-D array of shape (n_samples, n_features). Each function returns a
- number as the kernel value. Ignored when edges are unlabeled.
-
- Return
- ------
- Kmatrix : Numpy matrix
- Kernel matrix, each element of which is the mean structural shortest
- path kernel between two graphs.
- run_time : float
- Time in seconds spent building the kernel matrix.
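-
- Examples
- --------
- A minimal usage sketch, assuming two base kernels with the signatures
- described above, e.g. deltakernel (a Kronecker delta for symbolic
- labels) and gaussiankernel (a Gaussian RBF for attribute vectors) from
- gklearn.utils.kernels; any functions with the same signatures work.
- Here graph_list stands for a list of NetworkX graphs.
-
- >>> from gklearn.utils.kernels import deltakernel, gaussiankernel
- >>> mix = lambda s1, s2, a1, a2: deltakernel(s1, s2) * gaussiankernel(a1, a2)
- >>> sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mix}
- >>> Kmatrix, run_time = structuralspkernel(graph_list, node_label='atom',
- ... edge_label='bond_type', node_kernels=sub_kernels,
- ... edge_kernels=sub_kernels, n_jobs=4)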
- """
- # pre-process
- Gn = args[0] if len(args) == 1 else [args[0], args[1]]
- weight = None
- if edge_weight is None:
- print('\n No edge weight specified. All edge weights are set to 1.\n')
- else:
- try:
- some_weight = list(
- nx.get_edge_attributes(Gn[0], edge_weight).values())[0]
- if isinstance(some_weight, (float, int)):
- weight = edge_weight
- else:
- print(
- '\n Edge weight with name "%s" is not float or integer. All edge weights are set to 1.\n'
- % edge_weight)
- except IndexError:
- print(
- '\n Edge weight with name "%s" is not found in the edge attributes. All edge weights are set to 1.\n'
- % edge_weight)
- ds_attrs = get_dataset_attributes(
- Gn,
- attr_names=['node_labeled', 'node_attr_dim', 'edge_labeled',
- 'edge_attr_dim', 'is_directed'],
- node_label=node_label, edge_label=edge_label)
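- # NOTE: the attribute dimensions are forced to 0 below, which disables the
- # non-symbolic ('nsymb'/'mix') kernels and uses symbolic labels only.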
- ds_attrs['node_attr_dim'] = 0
- ds_attrs['edge_attr_dim'] = 0
-
- start_time = time.time()
-
- # get shortest paths of each graph in Gn
- splist = [None] * len(Gn)
- if n_jobs is None:
- n_jobs = cpu_count()
- pool = Pool(n_jobs)
- # get shortest path graphs of Gn
- getsp_partial = partial(wrapper_getSP, weight, ds_attrs['is_directed'])
- itr = zip(Gn, range(0, len(Gn)))
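- # heuristic chunk size: split the tasks evenly over the workers for small
- # datasets, and cap chunks at 100 graphs for large ones.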
- if len(Gn) < 100 * n_jobs:
- chunksize = int(len(Gn) / n_jobs) + 1
- else:
- chunksize = 100
- for i, sp in tqdm(
- pool.imap_unordered(getsp_partial, itr, chunksize),
- desc='getting shortest paths',
- file=sys.stdout):
- splist[i] = sp
- pool.close()
- pool.join()
-
-
- # # ---- direct running, normally use single CPU core. ----
- # splist = []
- # for g in tqdm(Gn, desc='getting sp graphs', file=sys.stdout):
- # splist.append(get_shortest_paths(g, weight, ds_attrs['is_directed']))
-
- Kmatrix = np.zeros((len(Gn), len(Gn)))
-
- # ---- use pool.imap_unordered to parallel and track progress. ----
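- # The graphs and shortest-path lists are shared with the worker processes
- # as module-level globals set by init_worker, so each task pickles only a
- # pair of indices rather than the data itself.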
- def init_worker(spl_toshare, gs_toshare):
- global G_spl, G_gs
- G_spl = spl_toshare
- G_gs = gs_toshare
- do_partial = partial(wrapper_ssp_do, ds_attrs, node_label, edge_label,
- node_kernels, edge_kernels)
- parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
- glbv=(splist, Gn), n_jobs=n_jobs)
-
- # # ---- direct running, normally use single CPU core. ----
- # from itertools import combinations_with_replacement
- # itr = combinations_with_replacement(range(0, len(Gn)), 2)
- # for i, j in tqdm(itr, desc='calculating kernels', file=sys.stdout):
- # kernel = structuralspkernel_do(Gn[i], Gn[j], splist[i], splist[j],
- # ds_attrs, node_label, edge_label, node_kernels, edge_kernels)
- # Kmatrix[i][j] = kernel
- # Kmatrix[j][i] = kernel
-
- run_time = time.time() - start_time
- print(
- "\n --- structural shortest path kernel matrix of size %d built in %s seconds ---"
- % (len(Gn), run_time))
-
- return Kmatrix, run_time
-
-
- def structuralspkernel_do(g1, g2, spl1, spl2, ds_attrs, node_label, edge_label,
- node_kernels, edge_kernels):
-
- kernel = 0
-
- # First, compute kernels between all pairs of nodes, the precomputation
- # idea borrowed from FCSP.
- vk_dict = {} # dict of node kernel values, keyed by node pairs
- if ds_attrs['node_labeled']:
- # node symbolic and non-symbolic labeled
- if ds_attrs['node_attr_dim'] > 0:
- kn = node_kernels['mix']
- for n1, n2 in product(
- g1.nodes(data=True), g2.nodes(data=True)):
- vk_dict[(n1[0], n2[0])] = kn(
- n1[1][node_label], n2[1][node_label],
- n1[1]['attributes'], n2[1]['attributes'])
- # node symb labeled
- else:
- kn = node_kernels['symb']
- for n1 in g1.nodes(data=True):
- for n2 in g2.nodes(data=True):
- vk_dict[(n1[0], n2[0])] = kn(n1[1][node_label],
- n2[1][node_label])
- else:
- # node non-symbolic labeled
- if ds_attrs['node_attr_dim'] > 0:
- kn = node_kernels['nsymb']
- for n1 in g1.nodes(data=True):
- for n2 in g2.nodes(data=True):
- vk_dict[(n1[0], n2[0])] = kn(n1[1]['attributes'],
- n2[1]['attributes'])
- # node unlabeled
- else:
- pass
-
- # Then, compute kernels between all pairs of edges, an idea extended from
- # FCSP. This suits sparse graphs, the most common case we encounter; for
- # dense graphs it would be slow.
- ek_dict = {} # dict of edge kernels
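- # Each edge-pair kernel is stored under all four node orderings so that
- # lookups succeed whichever direction a path traverses an edge.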
- if ds_attrs['edge_labeled']:
- # edge symbolic and non-symbolic labeled
- if ds_attrs['edge_attr_dim'] > 0:
- ke = edge_kernels['mix']
- for e1, e2 in product(
- g1.edges(data=True), g2.edges(data=True)):
- ek_temp = ke(e1[2][edge_label], e2[2][edge_label],
- e1[2]['attributes'], e2[2]['attributes'])
- ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
- # edge symb labeled
- else:
- ke = edge_kernels['symb']
- for e1 in g1.edges(data=True):
- for e2 in g2.edges(data=True):
- ek_temp = ke(e1[2][edge_label], e2[2][edge_label])
- ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
- else:
- # edge non-symbolic labeled
- if ds_attrs['edge_attr_dim'] > 0:
- ke = edge_kernels['nsymb']
- for e1 in g1.edges(data=True):
- for e2 in g2.edges(data=True):
- ek_temp = ke(e1[2]['attributes'], e2[2]['attributes'])
- ek_dict[((e1[0], e1[1]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[0], e2[1]))] = ek_temp
- ek_dict[((e1[0], e1[1]), (e2[1], e2[0]))] = ek_temp
- ek_dict[((e1[1], e1[0]), (e2[1], e2[0]))] = ek_temp
- # edge unlabeled
- else:
- pass
-
- # compute graph kernels
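- # A pair of shortest paths contributes only if both paths have the same
- # length; the contribution is the product of node kernels at each position
- # and edge kernels at each step, summed over all path pairs.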
- if vk_dict:
- if ek_dict:
- for p1, p2 in product(spl1, spl2):
- if len(p1) == len(p2):
- kpath = vk_dict[(p1[0], p2[0])]
- if kpath:
- for idx in range(1, len(p1)):
- kpath *= vk_dict[(p1[idx], p2[idx])] * \
- ek_dict[((p1[idx-1], p1[idx]),
- (p2[idx-1], p2[idx]))]
- if not kpath:
- break
- kernel += kpath # add up kernels of all paths
- else:
- for p1, p2 in product(spl1, spl2):
- if len(p1) == len(p2):
- kpath = vk_dict[(p1[0], p2[0])]
- if kpath:
- for idx in range(1, len(p1)):
- kpath *= vk_dict[(p1[idx], p2[idx])]
- if not kpath:
- break
- kernel += kpath # add up kernels of all paths
- else:
- if ek_dict:
- for p1, p2 in product(spl1, spl2):
- if len(p1) == len(p2):
- if len(p1) == 0:
- kernel += 1
- else:
- kpath = 1
- for idx in range(0, len(p1) - 1):
- kpath *= ek_dict[((p1[idx], p1[idx+1]),
- (p2[idx], p2[idx+1]))]
- if not kpath:
- break
- kernel += kpath # add up kernels of all paths
- else:
- for p1, p2 in product(spl1, spl2):
- if len(p1) == len(p2):
- kernel += 1
-
- kernel = kernel / (len(spl1) * len(spl2)) # normalize by the number of path pairs (mean)
-
- # NOTE: an exact implementation of the Fast Computation of Shortest Path
- # kernel (FCSP, reference [2]) was tried here but is slower than the
- # implementation above.
- return kernel
-
-
- def wrapper_ssp_do(ds_attrs, node_label, edge_label, node_kernels,
- edge_kernels, itr):
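- # G_gs and G_spl are the globals installed in each worker process by
- # init_worker in structuralspkernel: the graph list Gn and splist.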
- i, j = itr
- return i, j, structuralspkernel_do(G_gs[i], G_gs[j], G_spl[i], G_spl[j],
- ds_attrs, node_label, edge_label,
- node_kernels, edge_kernels)
-
-
- def get_shortest_paths(G, weight, directed):
- """Get all shortest paths of a graph.
-
- Parameters
- ----------
- G : NetworkX graph
- The graph whose paths are calculated.
- weight : string/None
- Edge attribute used as the weight for shortest-path computation.
- directed : boolean
- Whether the graph is directed.
-
- Return
- ------
- sp : list of list
- List of shortest paths of the graph, where each path is represented by a list of nodes.
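-
- For example, for an unweighted, undirected path graph with edges 0-1 and
- 1-2, the result contains [0, 1], [1, 0], [1, 2], [2, 1], [0, 1, 2],
- [2, 1, 0] and the single-node paths [0], [1], [2].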
- """
- sp = []
- for n1, n2 in combinations(G.nodes(), 2):
- try:
- spltemp = list(nx.all_shortest_paths(G, n1, n2, weight=weight))
- except nx.NetworkXNoPath: # the two nodes are not connected
- pass
- else:
- sp += spltemp
- # in an undirected graph, also add each path reversed, so that every
- # path is counted once from each of its two end nodes.
- if not directed:
- sp += [sptemp[::-1] for sptemp in spltemp]
-
- # add single nodes as length 0 paths.
- sp += [[n] for n in G.nodes()]
- return sp
-
-
- def wrapper_getSP(weight, directed, itr_item):
- g, i = itr_item
- return i, get_shortest_paths(g, weight, directed)