@@ -8,10 +8,8 @@ Created on Fri Sep 28 17:01:13 2018
from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel
from pygraph.kernels.commonWalkKernel import commonwalkkernel
from pygraph.utils.kernels import deltakernel, kernelproduct
dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
@@ -0,0 +1,83 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Oct 5 19:19:33 2018 | |||||
@author: ljia | |||||
""" | |||||
from libs import * | |||||
import multiprocessing | |||||
from pygraph.kernels.treeletKernel import treeletkernel | |||||
from pygraph.utils.kernels import gaussiankernel, polynomialkernel | |||||
dslist = [ | |||||
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
'task': 'regression'}, # node symb | |||||
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression', | |||||
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', }, | |||||
# contains single node graph, node symb | |||||
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb | |||||
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled | |||||
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb | |||||
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'}, | |||||
# # node nsymb | |||||
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'}, | |||||
# node symb/nsymb | |||||
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'}, | |||||
# # node/edge symb | |||||
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb | |||||
# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb | |||||
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb | |||||
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb | |||||
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'}, | |||||
# | |||||
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb | |||||
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb | |||||
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb | |||||
# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb | |||||
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb | |||||
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb | |||||
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb | |||||
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf', | |||||
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb | |||||
# # not working below | |||||
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',}, | |||||
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',}, | |||||
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',}, | |||||
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',}, | |||||
] | |||||
estimator = treeletkernel | |||||
param_grid_precomputed = {'sub_kernel': [gaussiankernel, polynomialkernel]} | |||||
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)}, | |||||
{'alpha': np.logspace(-10, 10, num=41, base=10)}] | |||||
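# Note: the first grid ('C') is used for SVM classification and the second ('alpha') for
# kernel ridge regression; the loop below selects between them according to each dataset's 'task'.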
for ds in dslist: | |||||
print() | |||||
print(ds['name']) | |||||
model_selection_for_precomputed_kernel( | |||||
ds['dataset'], | |||||
estimator, | |||||
param_grid_precomputed, | |||||
(param_grid[1] if ('task' in ds and ds['task'] | |||||
== 'regression') else param_grid[0]), | |||||
(ds['task'] if 'task' in ds else 'classification'), | |||||
NUM_TRIALS=30, | |||||
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None), | |||||
extra_params=(ds['extra_params'] if 'extra_params' in ds else None), | |||||
ds_name=ds['name'], | |||||
n_jobs=multiprocessing.cpu_count(), | |||||
read_gm_from_file=False, | |||||
verbose=True) | |||||
print() |
@@ -10,7 +10,6 @@ from libs import *
import multiprocessing
from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.utils.kernels import deltakernel, kernelproduct
dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
@@ -11,13 +11,17 @@ and the iterative alternate minimizations (IAM) in reference [2].
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt
from iam import iam
from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel
def gk_iam(Gn, alpha):
@@ -29,58 +33,59 @@ def gk_iam(Gn, alpha):
-----
Every time a better graph is acquired, the older one is replaced by it.
"""
# compute k nearest neighbors of phi in DN. | |||||
dis_list = [] # distance between g_star and each graph. | |||||
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): | |||||
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||||
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
dis_list.append(dtemp) | |||||
# sort | |||||
sort_idx = np.argsort(dis_list) | |||||
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] | |||||
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN | |||||
if dis_gs[0] == 0: # the exact pre-image. | |||||
print('The exact pre-image is found from the input dataset.') | |||||
return 0, g0hat | |||||
dhat = dis_gs[0] # the nearest distance | |||||
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors | |||||
gihat_list = [] | |||||
# i = 1 | |||||
r = 1 | |||||
while r < r_max: | |||||
print('r =', r) | |||||
# found = False | |||||
Gs_nearest = Gk + gihat_list | |||||
g_tmp = iam(Gs_nearest) | |||||
# compute distance between phi and the new generated graph. | |||||
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None, | |||||
p_quit=lmbda, n_iteration=20, remove_totters=False, | |||||
n_jobs=multiprocessing.cpu_count(), verbose=False) | |||||
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) * | |||||
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
if dnew <= dhat: # the new distance is smaller | |||||
print('I am smaller!') | |||||
dhat = dnew | |||||
g_new = g_tmp.copy() # found better graph. | |||||
gihat_list = [g_new] | |||||
dis_gs.append(dhat) | |||||
r = 0 | |||||
else: | |||||
r += 1 | |||||
ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list) | |||||
return dhat, ghat | |||||
pass | |||||
# # compute k nearest neighbors of phi in DN. | |||||
# dis_list = [] # distance between g_star and each graph. | |||||
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout): | |||||
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||||
# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
# dis_list.append(dtemp) | |||||
# | |||||
# # sort | |||||
# sort_idx = np.argsort(dis_list) | |||||
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]] | |||||
# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN | |||||
# if dis_gs[0] == 0: # the exact pre-image. | |||||
# print('The exact pre-image is found from the input dataset.') | |||||
# return 0, g0hat | |||||
# dhat = dis_gs[0] # the nearest distance | |||||
# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors | |||||
# gihat_list = [] | |||||
# | |||||
## i = 1 | |||||
# r = 1 | |||||
# while r < r_max: | |||||
# print('r =', r) | |||||
## found = False | |||||
# Gs_nearest = Gk + gihat_list | |||||
# g_tmp = iam(Gs_nearest) | |||||
# | |||||
# # compute distance between phi and the new generated graph. | |||||
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None, | |||||
# p_quit=lmbda, n_iteration=20, remove_totters=False, | |||||
# n_jobs=multiprocessing.cpu_count(), verbose=False) | |||||
# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) * | |||||
# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
# if dnew <= dhat: # the new distance is smaller | |||||
# print('I am smaller!') | |||||
# dhat = dnew | |||||
# g_new = g_tmp.copy() # found better graph. | |||||
# gihat_list = [g_new] | |||||
# dis_gs.append(dhat) | |||||
# r = 0 | |||||
# else: | |||||
# r += 1 | |||||
# | |||||
# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list) | |||||
# | |||||
# return dhat, ghat | |||||
def gk_iam_nearest(Gn, alpha):
def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -94,10 +99,11 @@ def gk_iam_nearest(Gn, alpha):
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||||
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix) | |||||
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) * | |||||
# k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha * | |||||
# (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha * | |||||
# k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6]) | |||||
dis_list.append(dtemp)
# sort
@@ -108,9 +114,12 @@ def gk_iam_nearest(Gn, alpha):
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
ghat = g0hat | |||||
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors | |||||
Gs_nearest = Gk | |||||
ghat = g0hat.copy() | |||||
Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors | |||||
for gi in Gk: | |||||
nx.draw_networkx(gi) | |||||
plt.show() | |||||
Gs_nearest = Gk.copy() | |||||
# gihat_list = []
# i = 1
@@ -119,18 +128,29 @@ def gk_iam_nearest(Gn, alpha):
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest) | |||||
# g_tmp = iam(Gs_nearest) | |||||
g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1) | |||||
nx.draw_networkx(g_tmp) | |||||
plt.show() | |||||
# compute the distance between phi and the newly generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None, | |||||
p_quit=lmbda, n_iteration=20, remove_totters=False, | |||||
n_jobs=multiprocessing.cpu_count(), verbose=False) | |||||
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) * | |||||
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha * | |||||
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha * | |||||
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2]) | |||||
if dnew <= dhat: # the new distance is smaller | |||||
gi_list = [Gn[i] for i in idx_gi] | |||||
knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False) | |||||
dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew) | |||||
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] * | |||||
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] * | |||||
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] * | |||||
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1]) | |||||
if dnew <= dhat and g_tmp != ghat: # the new distance is smaller | |||||
print('I am smaller!')
print(str(dhat) + '->' + str(dnew)) | |||||
# nx.draw_networkx(ghat) | |||||
# plt.show() | |||||
# print('->') | |||||
# nx.draw_networkx(g_tmp) | |||||
# plt.show() | |||||
dhat = dnew
g_new = g_tmp.copy() # found better graph.
ghat = g_tmp.copy()
@@ -144,48 +164,205 @@ def gk_iam_nearest(Gn, alpha):
r += 1
return dhat, ghat
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix): | |||||
term1 = Kmatrix[idx_g, idx_g] | |||||
term2 = 0 | |||||
for i, a in enumerate(alpha): | |||||
term2 += a * Kmatrix[idx_g, idx_gi[i]] | |||||
term2 *= 2 | |||||
term3 = 0 | |||||
for i1, a1 in enumerate(alpha): | |||||
for i2, a2 in enumerate(alpha): | |||||
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] | |||||
return np.sqrt(term1 - term2 + term3) | |||||
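# dis_gstar reads everything from the precomputed Gram matrix: given weights alpha over the
# graphs indexed by idx_gi, it returns the kernel-space distance between phi(g) and the
# weighted point g_star = sum_i alpha_i * phi(g_i), i.e.
# sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + sum_{i, j} alpha_i * alpha_j * k(g_i, g_j)).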
def compute_kernel(Gn, graph_kernel, verbose): | |||||
if graph_kernel == 'marginalizedkernel': | |||||
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None, | |||||
p_quit=0.3, n_iteration=19, remove_totters=False, | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
elif graph_kernel == 'untilhpathkernel': | |||||
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type', | |||||
depth=2, k_func='MinMax', compute_method='trie', | |||||
n_jobs=multiprocessing.cpu_count(), verbose=verbose) | |||||
# normalization | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | |||||
for i in range(len(Kmatrix)): | |||||
for j in range(i, len(Kmatrix)): | |||||
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j]) | |||||
Kmatrix[j][i] = Kmatrix[i][j] | |||||
return Kmatrix | |||||
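# The loop above is the usual cosine normalization of a Gram matrix,
# K_hat[i, j] = K[i, j] / sqrt(K[i, i] * K[j, j]), so that every graph has unit self-kernel.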
def gram2distances(Kmatrix): | |||||
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix))) | |||||
for i1 in range(len(Kmatrix)): | |||||
for i2 in range(len(Kmatrix)): | |||||
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2] | |||||
dmatrix = np.sqrt(dmatrix) | |||||
return dmatrix | |||||
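# gram2distances converts a Gram matrix into the kernel-induced distance matrix,
# d(i, j) = sqrt(k(i, i) + k(j, j) - 2 * k(i, j)).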
# --------------------------- These are tests --------------------------------# | |||||
def test_who_is_the_closest_in_kernel_space(Gn): | |||||
idx_gi = [0, 6] | |||||
g1 = Gn[idx_gi[0]] | |||||
g2 = Gn[idx_gi[1]] | |||||
# create the "median" graph. | |||||
gnew = g2.copy() | |||||
gnew.remove_node(0) | |||||
nx.draw_networkx(gnew) | |||||
plt.show() | |||||
print(gnew.nodes(data=True)) | |||||
Gn = [gnew] + Gn | |||||
# compute gram matrix | |||||
Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True) | |||||
# the distance matrix | |||||
dmatrix = gram2distances(Kmatrix) | |||||
print(np.sort(dmatrix[idx_gi[0] + 1])) | |||||
print(np.argsort(dmatrix[idx_gi[0] + 1])) | |||||
print(np.sort(dmatrix[idx_gi[1] + 1])) | |||||
print(np.argsort(dmatrix[idx_gi[1] + 1])) | |||||
# for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2 | |||||
dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] | |||||
print(np.sort(dis_median)) | |||||
print(np.argsort(dis_median)) | |||||
return | |||||
def test_who_is_the_closest_in_GED_space(Gn): | |||||
from iam import GED | |||||
idx_gi = [0, 6] | |||||
g1 = Gn[idx_gi[0]] | |||||
g2 = Gn[idx_gi[1]] | |||||
# create the "median" graph. | |||||
gnew = g2.copy() | |||||
gnew.remove_node(0) | |||||
nx.draw_networkx(gnew) | |||||
plt.show() | |||||
print(gnew.nodes(data=True)) | |||||
Gn = [gnew] + Gn | |||||
# compute GEDs | |||||
ged_matrix = np.zeros((len(Gn), len(Gn))) | |||||
for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout): | |||||
for i2 in range(len(Gn)): | |||||
dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib') | |||||
ged_matrix[i1, i2] = dis | |||||
print(np.sort(ged_matrix[idx_gi[0] + 1])) | |||||
print(np.argsort(ged_matrix[idx_gi[0] + 1])) | |||||
print(np.sort(ged_matrix[idx_gi[1] + 1])) | |||||
print(np.argsort(ged_matrix[idx_gi[1] + 1])) | |||||
# for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2 | |||||
dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))] | |||||
print(np.sort(dis_median)) | |||||
print(np.argsort(dis_median)) | |||||
return | |||||
def test_will_IAM_give_the_median_graph_we_wanted(Gn): | |||||
idx_gi = [0, 6] | |||||
g1 = Gn[idx_gi[0]].copy() | |||||
g2 = Gn[idx_gi[1]].copy() | |||||
# del Gn[idx_gi[0]] | |||||
# del Gn[idx_gi[1] - 1] | |||||
g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) | |||||
# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1) | |||||
nx.draw_networkx(g_median) | |||||
plt.show() | |||||
print(g_median.nodes(data=True)) | |||||
print(g_median.edges(data=True)) | |||||
def test_new_IAM_allGraph_deleteNodes(Gn): | |||||
idx_gi = [0, 6] | |||||
# g1 = Gn[idx_gi[0]].copy() | |||||
# g2 = Gn[idx_gi[1]].copy() | |||||
g1 = nx.Graph(name='haha') | |||||
g1.add_nodes_from([(2, {'atom': 'C'}), (3, {'atom': 'O'}), (4, {'atom': 'C'})]) | |||||
g1.add_edges_from([(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) | |||||
g2 = nx.Graph(name='hahaha') | |||||
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}), | |||||
(3, {'atom': 'O'}), (4, {'atom': 'C'})]) | |||||
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}), | |||||
(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})]) | |||||
# g2 = g1.copy() | |||||
# g2.add_nodes_from([(3, {'atom': 'O'})]) | |||||
# g2.add_nodes_from([(4, {'atom': 'C'})]) | |||||
# g2.add_edges_from([(1, 3, {'bond_type': '1'})]) | |||||
# g2.add_edges_from([(3, 4, {'bond_type': '1'})]) | |||||
# del Gn[idx_gi[0]] | |||||
# del Gn[idx_gi[1] - 1] | |||||
g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1) | |||||
# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1) | |||||
nx.draw_networkx(g_median) | |||||
plt.show() | |||||
print(g_median.nodes(data=True)) | |||||
print(g_median.edges(data=True)) | |||||
if __name__ == '__main__':
import sys
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||||
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat', | |||||
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb | |||||
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt', | |||||
# 'extra_params': {}} # node nsymb | |||||
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds', | |||||
# 'extra_params': {}} | |||||
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds', | |||||
'extra_params': {}} # node symb | |||||
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10] | |||||
# Gn = Gn[0:20] | |||||
test_new_IAM_allGraph_deleteNodes(Gn) | |||||
test_will_IAM_give_the_median_graph_we_wanted(Gn) | |||||
test_who_is_the_closest_in_GED_space(Gn) | |||||
test_who_is_the_closest_in_kernel_space(Gn) | |||||
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.1, 0.9, 9) | |||||
k = 5 # k nearest neighbors | |||||
alpha_range = np.linspace(0.5, 0.5, 1) | |||||
k = 20 # k nearest neighbors | |||||
# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(Gn), 2) | |||||
g1 = Gn[idx1] | |||||
g2 = Gn[idx2] | |||||
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2) | |||||
g1 = Gn[idx_gi[0]] | |||||
g2 = Gn[idx_gi[1]] | |||||
# g_tmp = iam([g1, g2]) | |||||
# nx.draw_networkx(g_tmp) | |||||
# plt.show() | |||||
# compute kernels.
k_list = [] # kernel between each graph and itself. | |||||
k_g1_list = [] # kernel between each graph and g1 | |||||
k_g2_list = [] # kernel between each graph and g2 | |||||
for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout): | |||||
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None, | |||||
p_quit=lmbda, n_iteration=20, remove_totters=False, | |||||
n_jobs=multiprocessing.cpu_count(), verbose=False) | |||||
k_list.append(ktemp[0][0, 0]) | |||||
k_g1_list.append(ktemp[0][0, 1]) | |||||
k_g2_list.append(ktemp[0][0, 2]) | |||||
# k_list = [] # kernel between each graph and itself. | |||||
# k_g1_list = [] # kernel between each graph and g1 | |||||
# k_g2_list = [] # kernel between each graph and g2 | |||||
# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout): | |||||
# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False) | |||||
# k_list.append(ktemp[0][0, 0]) | |||||
# k_g1_list.append(ktemp[0][0, 1]) | |||||
# k_g2_list.append(ktemp[0][0, 2]) | |||||
km = compute_kernel(Gn, 'untilhpathkernel', True) | |||||
# k_list = np.diag(km) # kernel between each graph and itself. | |||||
# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1 | |||||
# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2 | |||||
g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, alpha)
dhat, ghat = gk_iam_nearest(Gn, [alpha, 1 - alpha], idx_gi, km, k, r_max)
dis_best.append(dhat)
g_best.append(ghat)
@@ -16,18 +16,17 @@ import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical
#from pygraph.utils.utils import graph_deepcopy
def iam(Gn, node_label='atom', edge_label='bond_type'):
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""Estimate the generalized median graph of Gn by iterative alternate minimization (IAM).
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
c_er = 1
c_es = 1
c_ei = 1
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
@@ -37,7 +36,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp = GED(G_p, G_p_prime)
dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
@@ -50,7 +49,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10):
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
@@ -138,34 +137,40 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
# update pi_p
pi_p = []
for idx1, G_p in enumerate(Gn):
dist_tmp, pi_tmp, _ = GED(G, G_p)
pi_p.append(pi_tmp)
return G
def GED(g1, g2, lib='gedlib'):
"""
Compute GED. It is a dummy function for now.
Compute GED.
"""
if lib == 'gedlib':
# transform the dataset into the 'xml' files required by GedLib.
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
script.appel()
# script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PySetEditCost("CHEM_2")
script.PyInitEnv()
script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
liste = script.PyGetAllMap(g, h)
pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
dis = upper + lower
pi = liste[0]
dis = (upper + lower) / 2
return dis, pi
return dis, pi_forward, pi_backward
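# With the BIPARTITE method, PyGetUpperBound and PyGetLowerBound bound the exact edit
# distance from above and below; their midpoint is returned as the distance estimate,
# together with the forward and backward node maps from PyGetAllMap.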
def get_node_labels(Gn, node_label):
@@ -182,6 +187,434 @@ def get_edge_labels(Gn, edge_label):
return el
# --------------------------- These are tests --------------------------------# | |||||
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1, | |||||
node_label='atom', edge_label='bond_type'): | |||||
"""See my name, then you know what I do. | |||||
""" | |||||
from tqdm import tqdm | |||||
# Gn = Gn[0:10] | |||||
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn] | |||||
# phase 1: initialize.
# compute set-median. | |||||
dis_min = np.inf | |||||
# pi_p = [] | |||||
pi_all_forward = [] | |||||
pi_all_backward = [] | |||||
for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout): | |||||
dist_sum = 0 | |||||
pi_all_forward.append([]) | |||||
pi_all_backward.append([]) | |||||
for idx2, G_p_prime in enumerate(Gn): | |||||
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime) | |||||
pi_all_forward[idx1].append(pi_tmp_forward) | |||||
pi_all_backward[idx1].append(pi_tmp_backward) | |||||
dist_sum += dist_tmp | |||||
if dist_sum <= dis_min: | |||||
dis_min = dist_sum | |||||
G = G_p.copy() | |||||
idx_min = idx1 | |||||
# list of edit operations. | |||||
pi_p_forward = pi_all_forward[idx_min] | |||||
pi_p_backward = pi_all_backward[idx_min] | |||||
# phase 2: iteration. | |||||
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'], | |||||
edge_label=edge_label) | |||||
label_set = get_node_labels(Gn + [G], node_label) | |||||
for itr in range(0, 10): # @todo: the convergence condition? | |||||
G_new = G.copy() | |||||
# update vertex labels. | |||||
# pre-compute h_i0 for each label. | |||||
# for label in get_node_labels(Gn, node_label): | |||||
# print(label) | |||||
# for nd in G.nodes(data=True): | |||||
# pass | |||||
if not ds_attrs['node_attr_dim']: # labels are symbolic | |||||
for nd in G.nodes(): | |||||
h_i0_list = [] | |||||
label_list = [] | |||||
for label in label_set: | |||||
h_i0 = 0 | |||||
for idx, g in enumerate(Gn): | |||||
pi_i = pi_p_forward[idx][nd] | |||||
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: | |||||
h_i0 += 1 | |||||
h_i0_list.append(h_i0) | |||||
label_list.append(label) | |||||
# choose one of the best randomly. | |||||
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() | |||||
idx_rdm = random.randint(0, len(idx_max) - 1) | |||||
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]] | |||||
else: # labels are non-symbolic | |||||
for nd in G.nodes(): | |||||
Si_norm = 0 | |||||
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) | |||||
for idx, g in enumerate(Gn): | |||||
pi_i = pi_p_forward[idx][nd] | |||||
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? | |||||
Si_norm += 1 | |||||
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) | |||||
phi_i_bar /= Si_norm | |||||
G_new.nodes[nd]['attributes'] = phi_i_bar | |||||
# update edge labels and adjacency matrix. | |||||
if ds_attrs['edge_labeled']: | |||||
for nd1, nd2, _ in G.edges(data=True): | |||||
h_ij0_list = [] | |||||
label_list = [] | |||||
for label in get_edge_labels(Gn, edge_label): | |||||
h_ij0 = 0 | |||||
for idx, g in enumerate(Gn): | |||||
pi_i = pi_p_forward[idx][nd1] | |||||
pi_j = pi_p_forward[idx][nd2] | |||||
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and | |||||
g.has_edge(pi_i, pi_j) and | |||||
g.edges[pi_i, pi_j][edge_label] == label) | |||||
h_ij0 += h_ij0_p | |||||
h_ij0_list.append(h_ij0) | |||||
label_list.append(label) | |||||
# choose one of the best randomly. | |||||
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() | |||||
h_ij0_max = h_ij0_list[idx_max[0]] | |||||
idx_rdm = random.randint(0, len(idx_max) - 1) | |||||
best_label = label_list[idx_max[idx_rdm]] | |||||
# check whether a_ij is 0 or 1. | |||||
sij_norm = 0 | |||||
for idx, g in enumerate(Gn): | |||||
pi_i = pi_p_forward[idx][nd1] | |||||
pi_j = pi_p_forward[idx][nd2] | |||||
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||||
sij_norm += 1 | |||||
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||||
if not G_new.has_edge(nd1, nd2): | |||||
G_new.add_edge(nd1, nd2) | |||||
G_new.edges[nd1, nd2][edge_label] = best_label | |||||
else: | |||||
if G_new.has_edge(nd1, nd2): | |||||
G_new.remove_edge(nd1, nd2) | |||||
else: # if edges are unlabeled | |||||
# @todo: works only for undirected graphs. | |||||
for nd1 in range(nx.number_of_nodes(G)): | |||||
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)): | |||||
sij_norm = 0 | |||||
for idx, g in enumerate(Gn): | |||||
pi_i = pi_p_forward[idx][nd1] | |||||
pi_j = pi_p_forward[idx][nd2] | |||||
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||||
sij_norm += 1 | |||||
if sij_norm > len(Gn) * c_er / (c_er + c_ei): | |||||
if not G_new.has_edge(nd1, nd2): | |||||
G_new.add_edge(nd1, nd2) | |||||
elif sij_norm < len(Gn) * c_er / (c_er + c_ei): | |||||
if G_new.has_edge(nd1, nd2): | |||||
G_new.remove_edge(nd1, nd2) | |||||
# do not change anything when equal. | |||||
G = G_new.copy() | |||||
# update pi_p | |||||
pi_p_forward = [] | |||||
for G_p in Gn: | |||||
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p) | |||||
pi_p_forward.append(pi_tmp_forward) | |||||
return G | |||||
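# Update rules used above, following the IAM formulation: each node keeps the label that most
# mapped graphs agree on; a labeled edge (i, j) is kept with its most frequent label when
# h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es); for unlabeled
# edges, (i, j) is inserted when sij_norm > len(Gn) * c_er / (c_er + c_ei) and removed when it
# is smaller, where sij_norm counts the graphs in which both mapped endpoints are adjacent.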
def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations( | |||||
Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom', | |||||
edge_label='bond_type', connected=True): | |||||
"""See my name, then you know what I do. | |||||
""" | |||||
from tqdm import tqdm | |||||
# Gn_median = Gn_median[0:10] | |||||
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median] | |||||
node_ir = sys.maxsize * 2 # a very large integer (matching the value the C++ side uses) marking node removal and insertion.
label_r = 'thanksdanny' # the label denoting a removed node. # @todo: make sure this label cannot collide with real labels.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate, | |||||
attr_names=['edge_labeled', 'node_attr_dim'], | |||||
edge_label=edge_label) | |||||
def generate_graph(G, pi_p_forward, label_set): | |||||
G_new_list = [G.copy()] # all "best" graphs generated in this iteration. | |||||
# nx.draw_networkx(G) | |||||
# import matplotlib.pyplot as plt | |||||
# plt.show() | |||||
# print(pi_p_forward) | |||||
# update vertex labels. | |||||
# pre-compute h_i0 for each label. | |||||
# for label in get_node_labels(Gn, node_label): | |||||
# print(label) | |||||
# for nd in G.nodes(data=True): | |||||
# pass | |||||
if not ds_attrs['node_attr_dim']: # labels are symbolic | |||||
for ndi, (nd, _) in enumerate(G.nodes(data=True)): | |||||
h_i0_list = [] | |||||
label_list = [] | |||||
for label in label_set: | |||||
h_i0 = 0 | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][ndi] | |||||
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label: | |||||
h_i0 += 1 | |||||
h_i0_list.append(h_i0) | |||||
label_list.append(label) | |||||
# case when the node is to be removed. | |||||
h_i0_remove = 0 | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][ndi] | |||||
if pi_i == node_ir: | |||||
h_i0_remove += 1 | |||||
h_i0_list.append(h_i0_remove) | |||||
label_list.append(label_r) | |||||
# get the best labels. | |||||
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist() | |||||
nlabel_best = [label_list[idx] for idx in idx_max] | |||||
# generate "best" graphs with regard to "best" node labels. | |||||
G_new_list_nd = [] | |||||
for g in G_new_list: | |||||
for nl in nlabel_best: | |||||
g_tmp = g.copy() | |||||
if nl == label_r: | |||||
g_tmp.remove_node(nd) | |||||
else: | |||||
g_tmp.nodes[nd][node_label] = nl | |||||
G_new_list_nd.append(g_tmp) | |||||
# nx.draw_networkx(g_tmp) | |||||
# import matplotlib.pyplot as plt | |||||
# plt.show() | |||||
# print(g_tmp.nodes(data=True)) | |||||
# print(g_tmp.edges(data=True)) | |||||
G_new_list = G_new_list_nd[:] | |||||
else: # labels are non-symbolic | |||||
for nd in G.nodes(): | |||||
Si_norm = 0 | |||||
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])]) | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][nd] | |||||
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0? | |||||
Si_norm += 1 | |||||
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']]) | |||||
phi_i_bar /= Si_norm | |||||
G_new.nodes[nd]['attributes'] = phi_i_bar | |||||
# update edge labels and adjacency matrix. | |||||
if ds_attrs['edge_labeled']: | |||||
for nd1, nd2, _ in G.edges(data=True): | |||||
h_ij0_list = [] | |||||
label_list = [] | |||||
for label in get_edge_labels(Gn_median, edge_label): | |||||
h_ij0 = 0 | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][nd1] | |||||
pi_j = pi_p_forward[idx][nd2] | |||||
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and | |||||
g.has_edge(pi_i, pi_j) and | |||||
g.edges[pi_i, pi_j][edge_label] == label) | |||||
h_ij0 += h_ij0_p | |||||
h_ij0_list.append(h_ij0) | |||||
label_list.append(label) | |||||
# choose one of the best randomly. | |||||
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist() | |||||
h_ij0_max = h_ij0_list[idx_max[0]] | |||||
idx_rdm = random.randint(0, len(idx_max) - 1) | |||||
best_label = label_list[idx_max[idx_rdm]] | |||||
# check whether a_ij is 0 or 1. | |||||
sij_norm = 0 | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][nd1] | |||||
pi_j = pi_p_forward[idx][nd2] | |||||
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||||
sij_norm += 1 | |||||
if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es): | |||||
if not G_new.has_edge(nd1, nd2): | |||||
G_new.add_edge(nd1, nd2) | |||||
G_new.edges[nd1, nd2][edge_label] = best_label | |||||
else: | |||||
if G_new.has_edge(nd1, nd2): | |||||
G_new.remove_edge(nd1, nd2) | |||||
else: # if edges are unlabeled | |||||
# @todo: works only for undirected graphs. | |||||
nd_list = [n for n in G.nodes()] | |||||
for g_tmp in G_new_list: | |||||
for nd1i in range(nx.number_of_nodes(G)): | |||||
nd1 = nd_list[nd1i] | |||||
for nd2i in range(nd1i + 1, nx.number_of_nodes(G)): | |||||
nd2 = nd_list[nd2i] | |||||
sij_norm = 0 | |||||
for idx, g in enumerate(Gn_median): | |||||
pi_i = pi_p_forward[idx][nd1i] | |||||
pi_j = pi_p_forward[idx][nd2i] | |||||
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j): | |||||
sij_norm += 1 | |||||
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei): | |||||
# @todo: should we consider if nd1 and nd2 in g_tmp? | |||||
# or just add the edge anyway? | |||||
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \ | |||||
and not g_tmp.has_edge(nd1, nd2): | |||||
g_tmp.add_edge(nd1, nd2) | |||||
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei): | |||||
if g_tmp.has_edge(nd1, nd2): | |||||
g_tmp.remove_edge(nd1, nd2) | |||||
# do not change anything when equal. | |||||
# find the best graph generated in this iteration and update pi_p. | |||||
# @todo: should we update all graphs generated or just the best ones? | |||||
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median) | |||||
# @todo: should we remove the identical and connectivity check? | |||||
# Don't know which is faster. | |||||
G_new_list, idx_list = remove_duplicates(G_new_list) | |||||
pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||||
# if connected == True: | |||||
# G_new_list, idx_list = remove_disconnected(G_new_list) | |||||
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||||
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist() | |||||
# dis_min = dis_list[idx_min_tmp_list[0]] | |||||
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list] | |||||
# G_new_list = [G_new_list[idx] for idx in idx_min_list] | |||||
for g in G_new_list: | |||||
import matplotlib.pyplot as plt | |||||
nx.draw_networkx(g) | |||||
plt.show() | |||||
print(g.nodes(data=True)) | |||||
print(g.edges(data=True)) | |||||
return G_new_list, pi_forward_list | |||||
def median_distance(Gn, Gn_median, measure='ged', verbose=False): | |||||
dis_list = [] | |||||
pi_forward_list = [] | |||||
for idx, G in tqdm(enumerate(Gn), desc='computing median distances', | |||||
file=sys.stdout) if verbose else enumerate(Gn): | |||||
dis_sum = 0 | |||||
pi_forward_list.append([]) | |||||
for G_p in Gn_median: | |||||
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p) | |||||
pi_forward_list[idx].append(pi_tmp_forward) | |||||
dis_sum += dis_tmp | |||||
dis_list.append(dis_sum) | |||||
return dis_list, pi_forward_list | |||||
def best_median_graphs(Gn_candidate, dis_all, pi_all_forward): | |||||
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() | |||||
dis_min = dis_all[idx_min_list[0]] | |||||
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list] | |||||
G_min_list = [Gn_candidate[idx] for idx in idx_min_list] | |||||
return G_min_list, pi_forward_min_list, dis_min | |||||
def iteration_proc(G, pi_p_forward): | |||||
G_list = [G] | |||||
pi_forward_list = [pi_p_forward] | |||||
# iterations. | |||||
for itr in range(0, 10): # @todo: the convergence condition? | |||||
# print('itr is', itr) | |||||
G_new_list = [] | |||||
pi_forward_new_list = [] | |||||
for idx, G in enumerate(G_list): | |||||
label_set = get_node_labels(Gn_median + [G], node_label) | |||||
G_tmp_list, pi_forward_tmp_list = generate_graph( | |||||
G, pi_forward_list[idx], label_set) | |||||
G_new_list += G_tmp_list | |||||
pi_forward_new_list += pi_forward_tmp_list | |||||
G_list = G_new_list[:] | |||||
pi_forward_list = pi_forward_new_list[:] | |||||
G_list, idx_list = remove_duplicates(G_list) | |||||
pi_forward_list = [pi_forward_list[idx] for idx in idx_list] | |||||
# import matplotlib.pyplot as plt | |||||
# for g in G_list: | |||||
# nx.draw_networkx(g) | |||||
# plt.show() | |||||
# print(g.nodes(data=True)) | |||||
# print(g.edges(data=True)) | |||||
return G_list, pi_forward_list # do we return all graphs or the best ones? | |||||
def remove_duplicates(Gn): | |||||
"""Remove duplicate graphs from list. | |||||
""" | |||||
Gn_new = [] | |||||
idx_list = [] | |||||
for idx, g in enumerate(Gn): | |||||
dupl = False | |||||
for g_new in Gn_new: | |||||
if graph_isIdentical(g_new, g): | |||||
dupl = True | |||||
break | |||||
if not dupl: | |||||
Gn_new.append(g) | |||||
idx_list.append(idx) | |||||
return Gn_new, idx_list | |||||
def remove_disconnected(Gn): | |||||
"""Remove disconnected graphs from list. | |||||
""" | |||||
Gn_new = [] | |||||
idx_list = [] | |||||
for idx, g in enumerate(Gn): | |||||
if nx.is_connected(g): | |||||
Gn_new.append(g) | |||||
idx_list.append(idx) | |||||
return Gn_new, idx_list | |||||
# phase 1: initialize.
# compute set-median. | |||||
dis_min = np.inf | |||||
dis_all, pi_all_forward = median_distance(Gn_candidate[::-1], Gn_median) | |||||
# find all smallest distances. | |||||
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist() | |||||
dis_min = dis_all[idx_min_list[0]] | |||||
# phase 2: iteration. | |||||
G_list = [] | |||||
for idx_min in idx_min_list[::-1]: | |||||
# print('idx_min is', idx_min) | |||||
G = Gn_candidate[idx_min].copy() | |||||
# list of edit operations. | |||||
pi_p_forward = pi_all_forward[idx_min] | |||||
# pi_p_backward = pi_all_backward[idx_min] | |||||
Gi_list, pi_i_forward_list = iteration_proc(G, pi_p_forward) | |||||
G_list += Gi_list | |||||
G_list, _ = remove_duplicates(G_list) | |||||
if connected == True: | |||||
G_list, _ = remove_disconnected(G_list) | |||||
import matplotlib.pyplot as plt | |||||
for g in G_list: | |||||
nx.draw_networkx(g) | |||||
plt.show() | |||||
print(g.nodes(data=True)) | |||||
print(g.edges(data=True)) | |||||
# get the best median graphs | |||||
dis_all, pi_all_forward = median_distance(G_list, Gn_median) | |||||
G_min_list, pi_forward_min_list, dis_min = best_median_graphs( | |||||
G_list, dis_all, pi_all_forward) | |||||
for g in G_min_list: | |||||
nx.draw_networkx(g) | |||||
plt.show() | |||||
print(g.nodes(data=True)) | |||||
print(g.edges(data=True)) | |||||
return G_min_list | |||||
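# Overall flow of the function above: phase 1 selects every set-median of Gn_candidate with
# respect to Gn_median; phase 2 runs iteration_proc on each of them, collecting all "best"
# graphs generated per iteration, removes duplicates (and, when connected=True, disconnected
# graphs), and best_median_graphs finally returns the candidates with the smallest summed
# distance to Gn_median.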
if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
@@ -0,0 +1,430 @@ | |||||
""" | |||||
@author: linlin | |||||
@references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47. | |||||
""" | |||||
import sys | |||||
sys.path.insert(0, "../") | |||||
import time | |||||
from collections import Counter | |||||
from itertools import chain | |||||
from functools import partial | |||||
from multiprocessing import Pool | |||||
from tqdm import tqdm | |||||
import networkx as nx | |||||
import numpy as np | |||||
from pygraph.utils.graphdataset import get_dataset_attributes | |||||
from pygraph.utils.parallel import parallel_gm | |||||
def treeletkernel(*args, | |||||
sub_kernel, | |||||
node_label='atom', | |||||
edge_label='bond_type', | |||||
n_jobs=None, | |||||
verbose=True): | |||||
"""Calculate treelet graph kernels between graphs. | |||||
Parameters | |||||
---------- | |||||
Gn : List of NetworkX graph | |||||
List of graphs between which the kernels are calculated. | |||||
/ | |||||
G1, G2 : NetworkX graphs | |||||
Two graphs between which the kernel is calculated. | |||||
sub_kernel : function | |||||
The sub-kernel between 2 real number vectors. Each vector counts the | |||||
numbers of isomorphic treelets in a graph. | |||||
node_label : string | |||||
Node attribute used as label. The default node label is atom. | |||||
edge_label : string | |||||
Edge attribute used as label. The default edge label is bond_type. | |||||
labeled : boolean | |||||
Whether the graphs are labeled. The default is True. | |||||
Return | |||||
------ | |||||
Kmatrix : Numpy matrix | |||||
Kernel matrix, each element of which is the treelet kernel between 2 graphs.
""" | |||||
# pre-process | |||||
Gn = args[0] if len(args) == 1 else [args[0], args[1]] | |||||
Kmatrix = np.zeros((len(Gn), len(Gn))) | |||||
ds_attrs = get_dataset_attributes(Gn, | |||||
attr_names=['node_labeled', 'edge_labeled', 'is_directed'], | |||||
node_label=node_label, edge_label=edge_label) | |||||
labeled = False | |||||
if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']: | |||||
labeled = True | |||||
if not ds_attrs['node_labeled']: | |||||
for G in Gn: | |||||
nx.set_node_attributes(G, '0', 'atom') | |||||
if not ds_attrs['edge_labeled']: | |||||
for G in Gn: | |||||
nx.set_edge_attributes(G, '0', 'bond_type') | |||||
start_time = time.time() | |||||
# ---- use pool.imap_unordered to parallel and track progress. ---- | |||||
# get all canonical keys of all graphs before calculating kernels to save | |||||
# time, but this may cost a lot of memory for large dataset. | |||||
pool = Pool(n_jobs) | |||||
itr = zip(Gn, range(0, len(Gn))) | |||||
if len(Gn) < 100 * n_jobs: | |||||
chunksize = int(len(Gn) / n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
canonkeys = [[] for _ in range(len(Gn))] | |||||
getps_partial = partial(wrapper_get_canonkeys, node_label, edge_label, | |||||
labeled, ds_attrs['is_directed']) | |||||
if verbose: | |||||
iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize), | |||||
desc='getting canonkeys', file=sys.stdout) | |||||
else: | |||||
iterator = pool.imap_unordered(getps_partial, itr, chunksize) | |||||
for i, ck in iterator: | |||||
canonkeys[i] = ck | |||||
pool.close() | |||||
pool.join() | |||||
# compute kernels. | |||||
def init_worker(canonkeys_toshare): | |||||
global G_canonkeys | |||||
G_canonkeys = canonkeys_toshare | |||||
do_partial = partial(wrapper_treeletkernel_do, sub_kernel) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | |||||
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose) | |||||
run_time = time.time() - start_time | |||||
if verbose: | |||||
print("\n --- treelet kernel matrix of size %d built in %s seconds ---" | |||||
% (len(Gn), run_time)) | |||||
return Kmatrix, run_time | |||||
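# A minimal usage sketch (hypothetical graphs G1 and G2; gaussiankernel from
# pygraph.utils.kernels is assumed to map two treelet-count vectors to a float, as in the
# run script above that passes it as sub_kernel):
# from pygraph.utils.kernels import gaussiankernel
# Kmatrix, run_time = treeletkernel(G1, G2, sub_kernel=gaussiankernel, node_label='atom',
#                                   edge_label='bond_type', n_jobs=2, verbose=False)
# print(Kmatrix[0, 1])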
def _treeletkernel_do(canonkey1, canonkey2, sub_kernel): | |||||
"""Calculate treelet graph kernel between 2 graphs. | |||||
Parameters | |||||
---------- | |||||
canonkey1, canonkey2 : list | |||||
List of canonical keys in 2 graphs, where each key is represented by a string. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Treelet Kernel between 2 graphs. | |||||
""" | |||||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||||
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | |||||
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys]) | |||||
kernel = np.sum(np.exp(-np.square(vector1 - vector2) / 2)) | |||||
# kernel = sub_kernel(vector1, vector2) | |||||
return kernel | |||||
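# Note: the hard-coded expression above sums exp(-(x - y)**2 / 2) over the treelet counts the
# two graphs share, i.e. a component-wise Gaussian sub-kernel with sigma = 1; the commented
# line shows where the sub_kernel passed in from treeletkernel() would be used instead.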
def wrapper_treeletkernel_do(sub_kernel, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, _treeletkernel_do(G_canonkeys[i], G_canonkeys[j], sub_kernel) | |||||
def get_canonkeys(G, node_label, edge_label, labeled, is_directed): | |||||
"""Generate canonical keys of all treelets in a graph. | |||||
Parameters | |||||
---------- | |||||
G : NetworkX graphs | |||||
The graph in which keys are generated. | |||||
node_label : string | |||||
node attribute used as label. The default node label is atom. | |||||
edge_label : string | |||||
edge attribute used as label. The default edge label is bond_type. | |||||
labeled : boolean | |||||
Whether the graphs are labeled. The default is True. | |||||
Return | |||||
------ | |||||
canonkey/canonkey_l : dict
For unlabeled graphs, canonkey is a dictionary that records the number of
occurrences of every tree pattern. For labeled graphs, canonkey_l keeps
track of the number of occurrences of every treelet.
"""
patterns = {} # a dictionary of lists of patterns, one list per treelet structure.
canonkey = {} # canonical key, a dictionary recording the number of occurrences of every tree pattern.
### structural analysis ### | |||||
### In this section, a list of patterns is generated for each graphlet, | |||||
### where every pattern is represented by nodes ordered by Morgan's | |||||
### extended labeling. | |||||
# linear patterns | |||||
patterns['0'] = G.nodes() | |||||
canonkey['0'] = nx.number_of_nodes(G) | |||||
for i in range(1, 6):
patterns[str(i)] = find_all_paths(G, i, is_directed) | |||||
canonkey[str(i)] = len(patterns[str(i)]) | |||||
# n-star patterns | |||||
patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3] | |||||
patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4] | |||||
patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5] | |||||
# n-star patterns | |||||
canonkey['6'] = len(patterns['3star']) | |||||
canonkey['8'] = len(patterns['4star']) | |||||
canonkey['d'] = len(patterns['5star']) | |||||
# pattern 7 | |||||
patterns['7'] = [] # the 1st line of Table 1 in Ref [1] | |||||
for pattern in patterns['3star']: | |||||
for i in range(1, len(pattern)): # for each neighbor of node 0 | |||||
if G.degree(pattern[i]) >= 2: | |||||
pattern_t = pattern[:] | |||||
# set the node with degree >= 2 as the 4th node | |||||
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||||
for neighborx in G[pattern[i]]: | |||||
if neighborx != pattern[0]: | |||||
new_pattern = pattern_t + [neighborx] | |||||
patterns['7'].append(new_pattern) | |||||
canonkey['7'] = len(patterns['7']) | |||||
# pattern 11 | |||||
patterns['11'] = [] # the 4th line of Table 1 in Ref [1] | |||||
for pattern in patterns['4star']: | |||||
for i in range(1, len(pattern)): | |||||
if G.degree(pattern[i]) >= 2: | |||||
pattern_t = pattern[:] | |||||
pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i] | |||||
for neighborx in G[pattern[i]]: | |||||
if neighborx != pattern[0]: | |||||
new_pattern = pattern_t + [ neighborx ] | |||||
patterns['11'].append(new_pattern) | |||||
canonkey['b'] = len(patterns['11']) | |||||
# pattern 12 | |||||
patterns['12'] = [] # the 5th line of Table 1 in Ref [1] | |||||
rootlist = [] # a list of root nodes, whose extended labels are 3 | |||||
for pattern in patterns['3star']: | |||||
if pattern[0] not in rootlist: # avoid counting the same pattern twice, once from each of the two root nodes
rootlist.append(pattern[0]) | |||||
for i in range(1, len(pattern)): | |||||
if G.degree(pattern[i]) >= 3: | |||||
rootlist.append(pattern[i]) | |||||
pattern_t = pattern[:] | |||||
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||||
for neighborx1 in G[pattern[i]]: | |||||
if neighborx1 != pattern[0]: | |||||
for neighborx2 in G[pattern[i]]: | |||||
if neighborx1 > neighborx2 and neighborx2 != pattern[0]: | |||||
new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||||
# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ] | |||||
patterns['12'].append(new_pattern) | |||||
canonkey['c'] = int(len(patterns['12']) / 2) | |||||
# pattern 9 | |||||
patterns['9'] = [] # the 2nd line of Table 1 in Ref [1] | |||||
for pattern in patterns['3star']: | |||||
for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \ | |||||
for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]: | |||||
pattern_t = pattern[:] | |||||
# move nodes with extended labels 4 to specific position to correspond to their children | |||||
pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])] | |||||
pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])] | |||||
for neighborx1 in G[pairs[0]]: | |||||
if neighborx1 != pattern[0]: | |||||
for neighborx2 in G[pairs[1]]: | |||||
if neighborx2 != pattern[0]: | |||||
new_pattern = pattern_t + [neighborx1] + [neighborx2] | |||||
patterns['9'].append(new_pattern) | |||||
canonkey['9'] = len(patterns['9']) | |||||
# pattern 10 | |||||
patterns['10'] = [] # the 3rd line of Table 1 in Ref [1] | |||||
for pattern in patterns['3star']: | |||||
for i in range(1, len(pattern)): | |||||
if G.degree(pattern[i]) >= 2: | |||||
for neighborx in G[pattern[i]]: | |||||
if neighborx != pattern[0] and G.degree(neighborx) >= 2: | |||||
pattern_t = pattern[:] | |||||
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i] | |||||
new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ] | |||||
patterns['10'].extend(new_patterns) | |||||
canonkey['a'] = len(patterns['10']) | |||||
### labeling information ### | |||||
### In this section, a canonical key is generated for every pattern obtained | |||||
### in the structural analysis section above. Each key is a string that | |||||
### identifies a unique labeled treelet. A dictionary is built to keep track | |||||
### of the number of occurrences of every treelet. | |||||
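### For example, a linear pattern consisting of three nodes all labeled 'C' | |||||
### and two edges both labeled '1' gets the key body 'C1C1C' (the smaller of | |||||
### the body and its reverse), prefixed with the pattern's index. | |||||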
if labeled == True: | |||||
canonkey_l = {} # canonical keys: a dictionary that keeps track of the number of occurrences of every labeled treelet. | |||||
# linear patterns | |||||
canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values())) | |||||
for key in canonkey_t: | |||||
canonkey_l['0' + key] = canonkey_t[key] | |||||
for i in range(1, 6): # patterns '1' to '5' (linear patterns) | |||||
treelet = [] | |||||
for pattern in patterns[str(i)]: | |||||
canonlist = list(chain.from_iterable((G.node[node][node_label], \ | |||||
G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1]))) | |||||
canonlist.append(G.node[pattern[-1]][node_label]) | |||||
canonkey_t = ''.join(canonlist) | |||||
canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1] | |||||
treelet.append(str(i) + canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# n-star patterns | |||||
for i in range(3, 6): | |||||
treelet = [] | |||||
for pattern in patterns[str(i) + 'star']: | |||||
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ] | |||||
canonlist.sort() | |||||
canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist) | |||||
treelet.append(canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# pattern 7 | |||||
treelet = [] | |||||
for pattern in patterns['7']: | |||||
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
canonlist.sort() | |||||
canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||||
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||||
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] | |||||
treelet.append(canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# pattern 11 | |||||
treelet = [] | |||||
for pattern in patterns['11']: | |||||
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ] | |||||
canonlist.sort() | |||||
canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \ | |||||
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \ | |||||
+ G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||||
treelet.append(canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# pattern 10 | |||||
treelet = [] | |||||
for pattern in patterns['10']: | |||||
canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label] | |||||
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
canonlist.sort() | |||||
canonkey0 = ''.join(canonlist) | |||||
canonkey_t = 'a' + G.node[pattern[3]][node_label] \ | |||||
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \ | |||||
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||||
+ canonkey4 + canonkey0 | |||||
treelet.append(canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# pattern 12 | |||||
treelet = [] | |||||
for pattern in patterns['12']: | |||||
canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ] | |||||
canonlist0.sort() | |||||
canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ] | |||||
canonlist3.sort() | |||||
# two possible keys can be generated from the two nodes with extended label 3; select the one with the lower lexicographic order. | |||||
canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \ | |||||
+ ''.join(canonlist0) \ | |||||
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \ | |||||
+ ''.join(canonlist3) | |||||
canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \ | |||||
+ ''.join(canonlist3) \ | |||||
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \ | |||||
+ ''.join(canonlist0) | |||||
treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2) | |||||
canonkey_l.update(Counter(treelet)) | |||||
# pattern 9 | |||||
treelet = [] | |||||
for pattern in patterns['9']: | |||||
canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label] | |||||
canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label] | |||||
prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label] | |||||
prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] | |||||
if prekey2 + canonkey2 < prekey3 + canonkey3: | |||||
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||||
+ prekey2 + prekey3 + canonkey2 + canonkey3 | |||||
else: | |||||
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \ | |||||
+ prekey3 + prekey2 + canonkey3 + canonkey2 | |||||
treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t) | |||||
canonkey_l.update(Counter(treelet)) | |||||
return canonkey_l | |||||
return canonkey | |||||
def wrapper_get_canonkeys(node_label, edge_label, labeled, is_directed, itr_item): | |||||
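# helper for parallel computation: itr_item is expected to be a (graph, index) pair. | |||||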
g = itr_item[0] | |||||
i = itr_item[1] | |||||
return i, get_canonkeys(g, node_label, edge_label, labeled, is_directed) | |||||
def find_paths(G, source_node, length): | |||||
"""Find all paths with a certain length those start from a source node. | |||||
A recursive depth first search is applied. | |||||
Parameters | |||||
---------- | |||||
G : NetworkX graphs | |||||
The graph in which paths are searched. | |||||
source_node : integer | |||||
The node from which all paths start. | |||||
length : integer | |||||
The length of the paths. | |||||
Return | |||||
------ | |||||
path : list of list | |||||
List of paths retrieved, where each path is represented by a list of nodes. | |||||
""" | |||||
if length == 0: | |||||
return [[source_node]] | |||||
path = [[source_node] + path for neighbor in G[source_node] \ | |||||
for path in find_paths(G, neighbor, length - 1) if source_node not in path] | |||||
return path | |||||
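# A minimal usage sketch (hypothetical graph, for illustration only): | |||||
# >>> g = nx.path_graph(4)  # 0 - 1 - 2 - 3 | |||||
# >>> find_paths(g, 0, 2) | |||||
# [[0, 1, 2]] | |||||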
def find_all_paths(G, length, is_directed): | |||||
"""Find all paths with a certain length in a graph. A recursive depth first | |||||
search is applied. | |||||
Parameters | |||||
---------- | |||||
G : NetworkX graphs | |||||
The graph in which paths are searched. | |||||
length : integer | |||||
The length of paths. | |||||
Return | |||||
------ | |||||
path : list of list | |||||
List of paths retrieved, where each path is represented by a list of nodes. | |||||
""" | |||||
all_paths = [] | |||||
for node in G: | |||||
all_paths.extend(find_paths(G, node, length)) | |||||
if not is_directed: | |||||
# In an undirected graph each path is retrieved twice, once from each of | |||||
# its two extremities. Remove one of the two representations. | |||||
all_paths_r = [path[::-1] for path in all_paths] | |||||
for idx, path in enumerate(all_paths[:-1]): | |||||
for path2 in all_paths_r[idx+1::]: | |||||
if path == path2: | |||||
all_paths[idx] = [] | |||||
break | |||||
all_paths = list(filter(lambda a: a != [], all_paths)) | |||||
return all_paths |
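# A minimal usage sketch (hypothetical graph, for illustration only); only one | |||||
# representative of each undirected path is kept, e.g.: | |||||
# >>> g = nx.path_graph(3)  # 0 - 1 - 2 | |||||
# >>> find_all_paths(g, 1, is_directed=False) | |||||
# [[1, 0], [2, 1]] | |||||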
@@ -31,6 +31,7 @@ def untilhpathkernel(*args, | |||||
n_jobs=None, | n_jobs=None, | ||||
verbose=True): | verbose=True): | ||||
"""Calculate path graph kernels up to depth/hight h between graphs. | """Calculate path graph kernels up to depth/hight h between graphs. | ||||
Parameters | Parameters | ||||
---------- | ---------- | ||||
Gn : List of NetworkX graph | Gn : List of NetworkX graph | ||||
@@ -124,7 +125,7 @@ def untilhpathkernel(*args, | |||||
def init_worker(trie_toshare): | def init_worker(trie_toshare): | ||||
global G_trie | global G_trie | ||||
G_trie = trie_toshare | G_trie = trie_toshare | ||||
do_partial = partial(wrapper_uhpath_do_trie, k_func) | |||||
do_partial = partial(wrapper_uhpath_do_trie, k_func) | |||||
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker, | ||||
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose) | glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose) | ||||
else: | else: | ||||
@@ -84,7 +84,7 @@ def loadGXL(filename): | |||||
return g | return g | ||||
def saveGXL(graph, filename, method='benoit'): | |||||
def saveGXL(graph, filename, method='gedlib'): | |||||
if method == 'benoit': | if method == 'benoit': | ||||
import xml.etree.ElementTree as ET | import xml.etree.ElementTree as ET | ||||
root_node = ET.Element('gxl') | root_node = ET.Element('gxl') | ||||
@@ -124,23 +124,24 @@ def saveGXL(graph, filename, method='benoit'): | |||||
tree.write(filename) | tree.write(filename) | ||||
elif method == 'gedlib': | elif method == 'gedlib': | ||||
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | # reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22 | ||||
pass | |||||
# gxl_file = open(filename, 'w') | |||||
# gxl_file.write("<?xml version=\"1.0\"?>\n") | |||||
# gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||||
# gxl_file.write("<gxl>\n") | |||||
# gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n") | |||||
# for v in graph: | |||||
# gxl_file.write("<node id=\"_" + str(v) + "\">\n") | |||||
# gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n") | |||||
# gxl_file.write("</node>\n") | |||||
# for edge in self.edge_list: | |||||
# gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n") | |||||
# gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n") | |||||
# gxl_file.write("</edge>\n") | |||||
# gxl_file.write("</graph>\n") | |||||
# gxl_file.write("</gxl>\n") | |||||
# gxl_file.close() | |||||
# pass | |||||
gxl_file = open(filename, 'w') | |||||
gxl_file.write("<?xml version=\"1.0\"?>\n") | |||||
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n") | |||||
gxl_file.write("<gxl>\n") | |||||
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n") | |||||
for v, attrs in graph.nodes(data=True): | |||||
gxl_file.write("<node id=\"_" + str(v) + "\">\n") | |||||
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>\n") | |||||
gxl_file.write("</node>\n") | |||||
for v1, v2, attrs in graph.edges(data=True): | |||||
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n") | |||||
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>\n") | |||||
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>\n") | |||||
gxl_file.write("</edge>\n") | |||||
gxl_file.write("</graph>\n") | |||||
gxl_file.write("</gxl>\n") | |||||
gxl_file.close() | |||||
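# For illustration only: a graph named 'mol0' with nodes 0 and 1 carrying | |||||
# {'atom': 6} and {'atom': 8} and a single edge (0, 1) is written roughly as | |||||
# <graph id="mol0" edgeids="true" edgemode="undirected"> | |||||
#   <node id="_0"><attr name="chem"><int>6</int></attr></node> | |||||
#   <node id="_1"><attr name="chem"><int>8</int></attr></node> | |||||
#   <edge from="_0" to="_1"><attr name="valence"><int>1</int></attr></edge> | |||||
# </graph> | |||||
# (each tag on its own line in the actual output; the valence is always written as 1). | |||||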
def loadSDF(filename): | def loadSDF(filename): | ||||
@@ -57,6 +57,27 @@ def gaussiankernel(x, y, gamma=None): | |||||
return kernel | return kernel | ||||
def polynomialkernel(x, y, d=1, c=0): | |||||
"""Polynomial kernel. | |||||
Compute the polynomial kernel between x and y: | |||||
K(x, y) = (x^Ty)^d + c. | |||||
Parameters | |||||
---------- | |||||
x, y : array | |||||
d : integer, default 1 | |||||
c : float, default 0 | |||||
Returns | |||||
------- | |||||
kernel : float | |||||
""" | |||||
return np.dot(x, y) ** d + c | |||||
def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1): | ||||
"""Sum of a pair of kernels. | """Sum of a pair of kernels. | ||||
@@ -110,3 +131,7 @@ def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1): | |||||
else: | else: | ||||
kernel = lamda * k1(d11, d12) * k2(d21, d22) | kernel = lamda * k1(d11, d12) * k2(d21, d22) | ||||
return kernel | return kernel | ||||
if __name__ == '__main__': | |||||
o = polynomialkernel([1, 2], [3, 4], 2, 3) |
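# with x = [1, 2], y = [3, 4], d = 2 and c = 3 this gives (1*3 + 2*4)**2 + 3 = 124. | |||||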
@@ -145,7 +145,8 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
# Kmatrix = np.random.rand(2250, 2250) | # Kmatrix = np.random.rand(2250, 2250) | ||||
# current_run_time = 0.1 | # current_run_time = 0.1 | ||||
# remove graphs whose kernels with themselves are zeros | |||||
# remove graphs whose kernels with themselves are zeros | |||||
# @todo: y not changed accordingly? | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | Kmatrix_diag = Kmatrix.diagonal().copy() | ||||
nb_g_ignore = 0 | nb_g_ignore = 0 | ||||
for idxk, diag in enumerate(Kmatrix_diag): | for idxk, diag in enumerate(Kmatrix_diag): | ||||
@@ -154,6 +155,7 @@ def model_selection_for_precomputed_kernel(datafile, | |||||
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) | Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1) | ||||
nb_g_ignore += 1 | nb_g_ignore += 1 | ||||
# normalization | # normalization | ||||
# @todo: works only for undirected graph? | |||||
Kmatrix_diag = Kmatrix.diagonal().copy() | Kmatrix_diag = Kmatrix.diagonal().copy() | ||||
for i in range(len(Kmatrix)): | for i in range(len(Kmatrix)): | ||||
for j in range(i, len(Kmatrix)): | for j in range(i, len(Kmatrix)): | ||||
@@ -1,5 +1,6 @@ | |||||
import networkx as nx | import networkx as nx | ||||
import numpy as np | import numpy as np | ||||
from copy import deepcopy | |||||
#from itertools import product | #from itertools import product | ||||
# from tqdm import tqdm | # from tqdm import tqdm | ||||
@@ -183,3 +184,61 @@ def direct_product(G1, G2, node_label, edge_label): | |||||
# gt = nx.convert_node_labels_to_integers( | # gt = nx.convert_node_labels_to_integers( | ||||
# gt, first_label=0, label_attribute='label_orignal') | # gt, first_label=0, label_attribute='label_orignal') | ||||
return gt | return gt | ||||
def graph_deepcopy(G): | |||||
"""Deep copy a graph, including deep copy of all nodes, edges and | |||||
attributes of the graph, nodes and edges. | |||||
Note | |||||
---- | |||||
This is similar to the NetworkX method graph.copy(), except that attribute | |||||
values are deep-copied here rather than shared with the original graph. | |||||
""" | |||||
# add graph attributes. | |||||
labels = {} | |||||
for k, v in G.graph.items(): | |||||
labels[k] = deepcopy(v) | |||||
if G.is_directed(): | |||||
G_copy = nx.DiGraph(**labels) | |||||
else: | |||||
G_copy = nx.Graph(**labels) | |||||
# add nodes | |||||
for nd, attrs in G.nodes(data=True): | |||||
labels = {} | |||||
for k, v in attrs.items(): | |||||
labels[k] = deepcopy(v) | |||||
G_copy.add_node(nd, **labels) | |||||
# add edges. | |||||
for nd1, nd2, attrs in G.edges(data=True): | |||||
labels = {} | |||||
for k, v in attrs.items(): | |||||
labels[k] = deepcopy(v) | |||||
G_copy.add_edge(nd1, nd2, **labels) | |||||
return G_copy | |||||
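# A minimal usage sketch (hypothetical labels, for illustration only): | |||||
# >>> g = nx.Graph(name='g0') | |||||
# >>> g.add_node(0, atom=['C']) | |||||
# >>> g2 = graph_deepcopy(g) | |||||
# >>> g2.node[0]['atom'].append('N')  # mutating the copy... | |||||
# >>> g.node[0]['atom']  # ...leaves the original attribute untouched | |||||
# ['C'] | |||||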
def graph_isIdentical(G1, G2): | |||||
"""Check if two graphs are identical, including: same nodes, edges, node | |||||
labels/attributes, edge labels/attributes. | |||||
Notes | |||||
----- | |||||
1. The two graphs have to be of the same type. | |||||
2. Global/graph attributes are ignored, as they may contain the graphs' names. | |||||
""" | |||||
# check nodes. | |||||
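# note: list equality is order-sensitive, so the nodes (and the edges below) | |||||
# are expected to appear in the same order in both graphs. | |||||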
nlist1 = [n for n in G1.nodes(data=True)] | |||||
nlist2 = [n for n in G2.nodes(data=True)] | |||||
if not nlist1 == nlist2: | |||||
return False | |||||
# check edges. | |||||
elist1 = [n for n in G1.edges(data=True)] | |||||
elist2 = [n for n in G2.edges(data=True)] | |||||
if not elist1 == elist2: | |||||
return False | |||||
# check graph attributes. | |||||
return True |
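# A minimal usage sketch (hypothetical labels, for illustration only): | |||||
# >>> g1 = nx.Graph() | |||||
# >>> g1.add_node(0, atom='C'); g1.add_node(1, atom='O') | |||||
# >>> g1.add_edge(0, 1, bond_type='1') | |||||
# >>> g2 = graph_deepcopy(g1) | |||||
# >>> graph_isIdentical(g1, g2) | |||||
# True | |||||
# >>> g2.node[1]['atom'] = 'N' | |||||
# >>> graph_isIdentical(g1, g2) | |||||
# False | |||||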