
add the treelet kernel.

v0.1
jajupmochi, 6 years ago · parent commit 92c74448ec
12 changed files with 2782 additions and 1276 deletions
  1. notebooks/run_commonwalkkernel.py (+0, -2)
  2. notebooks/run_treeletkernel.py (+83, -0)
  3. notebooks/run_untilhpathkernel.py (+0, -1)
  4. notebooks/utils/plot_all_graphs.ipynb (+1447, -1149)
  5. preimage/gk_iam.py (+266, -89)
  6. preimage/iam.py (+448, -15)
  7. pygraph/kernels/treeletKernel.py (+430, -0)
  8. pygraph/kernels/untilHPathKernel.py (+2, -1)
  9. pygraph/utils/graphfiles.py (+19, -18)
  10. pygraph/utils/kernels.py (+25, -0)
  11. pygraph/utils/model_selection_precomputed.py (+3, -1)
  12. pygraph/utils/utils.py (+59, -0)

notebooks/run_commonwalkkernel.py (+0, -2)

@@ -8,10 +8,8 @@ Created on Fri Sep 28 17:01:13 2018

from libs import *
import multiprocessing
from sklearn.metrics.pairwise import rbf_kernel

from pygraph.kernels.commonWalkKernel import commonwalkkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',


notebooks/run_treeletkernel.py (+83, -0)

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Oct 5 19:19:33 2018

@author: ljia
"""

from libs import *
import multiprocessing

from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.utils.kernels import gaussiankernel, polynomialkernel

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'task': 'regression'}, # node symb
{'name': 'Alkane', 'dataset': '../datasets/Alkane/dataset.ds', 'task': 'regression',
'dataset_y': '../datasets/Alkane/dataset_boiling_point_names.txt', },
# contains single node graph, node symb
{'name': 'MAO', 'dataset': '../datasets/MAO/dataset.ds', }, # node/edge symb
{'name': 'PAH', 'dataset': '../datasets/PAH/dataset.ds', }, # unlabeled
{'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}}, # node/edge symb
# {'name': 'Letter-med', 'dataset': '../datasets/Letter-med/Letter-med_A.txt'},
# # node nsymb
{'name': 'ENZYMES', 'dataset': '../datasets/ENZYMES_txt/ENZYMES_A_sparse.txt'},
# node symb/nsymb
# {'name': 'Mutagenicity', 'dataset': '../datasets/Mutagenicity/Mutagenicity_A.txt'},
# # node/edge symb
# {'name': 'D&D', 'dataset': '../datasets/D&D/DD.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 1, 2, 1, -1]}}, # node symb

# {'name': 'COIL-DEL', 'dataset': '../datasets/COIL-DEL/COIL-DEL_A.txt'}, # edge symb, node nsymb
# # # {'name': 'BZR', 'dataset': '../datasets/BZR_txt/BZR_A_sparse.txt'}, # node symb/nsymb
# # # {'name': 'COX2', 'dataset': '../datasets/COX2_txt/COX2_A_sparse.txt'}, # node symb/nsymb
# {'name': 'Fingerprint', 'dataset': '../datasets/Fingerprint/Fingerprint_A.txt'},
#
# # {'name': 'DHFR', 'dataset': '../datasets/DHFR_txt/DHFR_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'SYNTHETIC', 'dataset': '../datasets/SYNTHETIC_txt/SYNTHETIC_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'MSRC9', 'dataset': '../datasets/MSRC_9_txt/MSRC_9_A.txt'}, # node symb
# # {'name': 'MSRC21', 'dataset': '../datasets/MSRC_21_txt/MSRC_21_A.txt'}, # node symb
# # {'name': 'FIRSTMM_DB', 'dataset': '../datasets/FIRSTMM_DB/FIRSTMM_DB_A.txt'}, # node symb/nsymb ,edge nsymb

# # {'name': 'PROTEINS', 'dataset': '../datasets/PROTEINS_txt/PROTEINS_A_sparse.txt'}, # node symb/nsymb
# # {'name': 'PROTEINS_full', 'dataset': '../datasets/PROTEINS_full_txt/PROTEINS_full_A_sparse.txt'}, # node symb/nsymb
{'name': 'AIDS', 'dataset': '../datasets/AIDS/AIDS_A.txt'}, # node symb/nsymb, edge symb
# {'name': 'NCI1', 'dataset': '../datasets/NCI1/NCI1.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI109', 'dataset': '../datasets/NCI109/NCI109.mat',
# 'extra_params': {'am_sp_al_nl_el': [1, 1, 2, 0, -1]}}, # node symb
# {'name': 'NCI-HIV', 'dataset': '../datasets/NCI-HIV/AIDO99SD.sdf',
# 'dataset_y': '../datasets/NCI-HIV/aids_conc_may04.txt',}, # node/edge symb

# # not working below
# {'name': 'PTC_FM', 'dataset': '../datasets/PTC/Train/FM.ds',},
# {'name': 'PTC_FR', 'dataset': '../datasets/PTC/Train/FR.ds',},
# {'name': 'PTC_MM', 'dataset': '../datasets/PTC/Train/MM.ds',},
# {'name': 'PTC_MR', 'dataset': '../datasets/PTC/Train/MR.ds',},
]
estimator = treeletkernel
param_grid_precomputed = {'sub_kernel': [gaussiankernel, polynomialkernel]}
param_grid = [{'C': np.logspace(-10, 10, num=41, base=10)},
{'alpha': np.logspace(-10, 10, num=41, base=10)}]

for ds in dslist:
print()
print(ds['name'])
model_selection_for_precomputed_kernel(
ds['dataset'],
estimator,
param_grid_precomputed,
(param_grid[1] if ('task' in ds and ds['task']
== 'regression') else param_grid[0]),
(ds['task'] if 'task' in ds else 'classification'),
NUM_TRIALS=30,
datafile_y=(ds['dataset_y'] if 'dataset_y' in ds else None),
extra_params=(ds['extra_params'] if 'extra_params' in ds else None),
ds_name=ds['name'],
n_jobs=multiprocessing.cpu_count(),
read_gm_from_file=False,
verbose=True)
print()
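
The script above just drives the existing model-selection pipeline with the new kernel. For a quick standalone sanity check, a minimal sketch (the two toy molecules are made up for illustration; it assumes the pygraph package is importable) could look like:

import multiprocessing
import networkx as nx
from pygraph.kernels.treeletKernel import treeletkernel
from pygraph.utils.kernels import gaussiankernel

# two tiny labelled graphs carrying the 'atom'/'bond_type' attributes the kernel expects
g1 = nx.Graph()
g1.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'})])
g1.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'})])
g2 = g1.copy()
g2.add_node(3, atom='C')
g2.add_edge(2, 3, bond_type='1')

# treeletkernel returns the Gram matrix and the run time (see treeletKernel.py below)
Kmatrix, run_time = treeletkernel([g1, g2], sub_kernel=gaussiankernel,
                                  node_label='atom', edge_label='bond_type',
                                  n_jobs=multiprocessing.cpu_count(), verbose=False)
print(Kmatrix)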

notebooks/run_untilhpathkernel.py (+0, -1)

@@ -10,7 +10,6 @@ from libs import *
import multiprocessing

from pygraph.kernels.untilHPathKernel import untilhpathkernel
from pygraph.utils.kernels import deltakernel, kernelproduct

dslist = [
{'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',


notebooks/utils/plot_all_graphs.ipynb (+1447, -1149)
File diff suppressed because it is too large


preimage/gk_iam.py (+266, -89)

@@ -11,13 +11,17 @@ and the iterative alternate minimizations (IAM) in reference [2].
pre-images. In Joint Pattern Recognition Symposium, pages 253-261. Springer, 2004.
[2] Generalized median graph via iterative alternate minimization.
"""
import sys
import numpy as np
import multiprocessing
from tqdm import tqdm
import networkx as nx
import matplotlib.pyplot as plt

from iam import iam
from iam import iam, test_iam_with_more_graphs_as_init, test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.kernels.untilHPathKernel import untilhpathkernel


def gk_iam(Gn, alpha):
@@ -29,58 +33,59 @@ def gk_iam(Gn, alpha):
-----
Every time a better graph is acquired, the older one is replaced by it.
"""
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dis_list.append(dtemp)
# sort
sort_idx = np.argsort(dis_list)
dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
if dis_gs[0] == 0: # the exact pre-image.
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
gihat_list = []
# i = 1
r = 1
while r < r_max:
print('r =', r)
# found = False
Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
print('I am smaller!')
dhat = dnew
g_new = g_tmp.copy() # found better graph.
gihat_list = [g_new]
dis_gs.append(dhat)
r = 0
else:
r += 1
ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
return dhat, ghat
pass
# # compute k nearest neighbors of phi in DN.
# dis_list = [] # distance between g_star and each graph.
# for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# dis_list.append(dtemp)
#
# # sort
# sort_idx = np.argsort(dis_list)
# dis_gs = [dis_list[idis] for idis in sort_idx[0:k]]
# g0hat = Gn[sort_idx[0]] # the nearest neighbor of phi in DN
# if dis_gs[0] == 0: # the exact pre-image.
# print('The exact pre-image is found from the input dataset.')
# return 0, g0hat
# dhat = dis_gs[0] # the nearest distance
# Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
# gihat_list = []
#
## i = 1
# r = 1
# while r < r_max:
# print('r =', r)
## found = False
# Gs_nearest = Gk + gihat_list
# g_tmp = iam(Gs_nearest)
#
# # compute distance between phi and the new generated graph.
# knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
# p_quit=lmbda, n_iteration=20, remove_totters=False,
# n_jobs=multiprocessing.cpu_count(), verbose=False)
# dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
# knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
# (1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
# k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
# if dnew <= dhat: # the new distance is smaller
# print('I am smaller!')
# dhat = dnew
# g_new = g_tmp.copy() # found better graph.
# gihat_list = [g_new]
# dis_gs.append(dhat)
# r = 0
# else:
# r += 1
#
# ghat = ([g0hat] if len(gihat_list) == 0 else gihat_list)
#
# return dhat, ghat


def gk_iam_nearest(Gn, alpha):
def gk_iam_nearest(Gn, alpha, idx_gi, Kmatrix, k, r_max):
"""This function constructs graph pre-image by the iterative pre-image
framework in reference [1], algorithm 1, where the step of generating new
graphs randomly is replaced by the IAM algorithm in reference [2].
@@ -94,10 +99,11 @@ def gk_iam_nearest(Gn, alpha):
# compute k nearest neighbors of phi in DN.
dis_list = [] # distance between g_star and each graph.
for ig, g in tqdm(enumerate(Gn), desc='computing distances', file=sys.stdout):
dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
k_g2_list[ig]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix)
# dtemp = k_list[ig] - 2 * (alpha * k_g1_list[ig] + (1 - alpha) *
# k_g2_list[ig]) + (alpha * alpha * k_list[0] + alpha *
# (1 - alpha) * k_g2_list[0] + (1 - alpha) * alpha *
# k_g1_list[6] + (1 - alpha) * (1 - alpha) * k_list[6])
dis_list.append(dtemp)
# sort
@@ -108,9 +114,12 @@ def gk_iam_nearest(Gn, alpha):
print('The exact pre-image is found from the input dataset.')
return 0, g0hat
dhat = dis_gs[0] # the nearest distance
ghat = g0hat
Gk = [Gn[ig] for ig in sort_idx[0:k]] # the k nearest neighbors
Gs_nearest = Gk
ghat = g0hat.copy()
Gk = [Gn[ig].copy() for ig in sort_idx[0:k]] # the k nearest neighbors
for gi in Gk:
nx.draw_networkx(gi)
plt.show()
Gs_nearest = Gk.copy()
# gihat_list = []
# i = 1
@@ -119,18 +128,29 @@ def gk_iam_nearest(Gn, alpha):
print('r =', r)
# found = False
# Gs_nearest = Gk + gihat_list
g_tmp = iam(Gs_nearest)
# g_tmp = iam(Gs_nearest)
g_tmp = test_iam_with_more_graphs_as_init(Gs_nearest, Gs_nearest, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_tmp)
plt.show()
# compute distance between phi and the new generated graph.
knew = marginalizedkernel([g_tmp, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
dnew = knew[0][0, 0] - 2 * (alpha * knew[0][0, 1] + (1 - alpha) *
knew[0][0, 2]) + (alpha * alpha * k_list[idx1] + alpha *
(1 - alpha) * k_g2_list[idx1] + (1 - alpha) * alpha *
k_g1_list[idx2] + (1 - alpha) * (1 - alpha) * k_list[idx2])
if dnew <= dhat: # the new distance is smaller
gi_list = [Gn[i] for i in idx_gi]
knew = compute_kernel([g_tmp] + gi_list, 'untilhpathkernel', False)
dnew = dis_gstar(0, range(1, len(gi_list) + 1), alpha, knew)
# dnew = knew[0, 0] - 2 * (alpha[0] * knew[0, 1] + alpha[1] *
# knew[0, 2]) + (alpha[0] * alpha[0] * k_list[0] + alpha[0] *
# alpha[1] * k_g2_list[0] + alpha[1] * alpha[0] *
# k_g1_list[1] + alpha[1] * alpha[1] * k_list[1])
if dnew <= dhat and g_tmp != ghat: # the new distance is smaller
print('I am smaller!')
print(str(dhat) + '->' + str(dnew))
# nx.draw_networkx(ghat)
# plt.show()
# print('->')
# nx.draw_networkx(g_tmp)
# plt.show()
dhat = dnew
g_new = g_tmp.copy() # found better graph.
ghat = g_tmp.copy()
@@ -144,48 +164,205 @@ def gk_iam_nearest(Gn, alpha):
r += 1
return dhat, ghat


def dis_gstar(idx_g, idx_gi, alpha, Kmatrix):
term1 = Kmatrix[idx_g, idx_g]
term2 = 0
for i, a in enumerate(alpha):
term2 += a * Kmatrix[idx_g, idx_gi[i]]
term2 *= 2
term3 = 0
for i1, a1 in enumerate(alpha):
for i2, a2 in enumerate(alpha):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
return np.sqrt(term1 - term2 + term3)
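
dis_gstar computes the kernel-space distance between phi(g) and the weighted combination sum_i alpha_i * phi(g_i), i.e. sqrt(K(g,g) - 2*sum_i alpha_i*K(g,g_i) + sum_{i,j} alpha_i*alpha_j*K(g_i,g_j)). A vectorized sketch of the same quantity (a hypothetical helper, assuming alpha is index-aligned with idx_gi):

import numpy as np

def dis_gstar_vectorized(idx_g, idx_gi, alpha, Kmatrix):
    a = np.asarray(alpha)
    idx_gi = list(idx_gi)
    k_cross = Kmatrix[idx_g, idx_gi]           # K(g, g_i) for each i
    k_inner = Kmatrix[np.ix_(idx_gi, idx_gi)]  # K(g_i, g_j)
    return np.sqrt(Kmatrix[idx_g, idx_g] - 2 * a @ k_cross + a @ k_inner @ a)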


def compute_kernel(Gn, graph_kernel, verbose):
if graph_kernel == 'marginalizedkernel':
Kmatrix, _ = marginalizedkernel(Gn, node_label='atom', edge_label=None,
p_quit=0.3, n_iteration=19, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
elif graph_kernel == 'untilhpathkernel':
Kmatrix, _ = untilhpathkernel(Gn, node_label='atom', edge_label='bond_type',
depth=2, k_func='MinMax', compute_method='trie',
n_jobs=multiprocessing.cpu_count(), verbose=verbose)
# normalization
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):
Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
Kmatrix[j][i] = Kmatrix[i][j]
return Kmatrix

def gram2distances(Kmatrix):
dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
for i1 in range(len(Kmatrix)):
for i2 in range(len(Kmatrix)):
dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
dmatrix = np.sqrt(dmatrix)
return dmatrix
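
The double loop above implements the usual kernel-to-distance identity d(i, j)^2 = K(i, i) + K(j, j) - 2*K(i, j). An equivalent vectorized sketch (a hypothetical helper, not meant as a drop-in replacement):

import numpy as np

def gram2distances_vectorized(Kmatrix):
    diag = np.diag(Kmatrix)
    # clip tiny negatives from floating-point error before the square root
    sq = np.maximum(diag[:, None] + diag[None, :] - 2 * Kmatrix, 0)
    return np.sqrt(sq)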

# --------------------------- These are tests --------------------------------#
def test_who_is_the_closest_in_kernel_space(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute gram matrix
Kmatrix = compute_kernel(Gn, 'untilhpathkernel', True)
# the distance matrix
dmatrix = gram2distances(Kmatrix)
print(np.sort(dmatrix[idx_gi[0] + 1]))
print(np.argsort(dmatrix[idx_gi[0] + 1]))
print(np.sort(dmatrix[idx_gi[1] + 1]))
print(np.argsort(dmatrix[idx_gi[1] + 1]))
# for all g in Gn, compute (d(g1, g) + d(g2, g)) / 2
dis_median = [(dmatrix[i, idx_gi[0] + 1] + dmatrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_who_is_the_closest_in_GED_space(Gn):
from iam import GED
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# create the "median" graph.
gnew = g2.copy()
gnew.remove_node(0)
nx.draw_networkx(gnew)
plt.show()
print(gnew.nodes(data=True))
Gn = [gnew] + Gn
# compute GEDs
ged_matrix = np.zeros((len(Gn), len(Gn)))
for i1 in tqdm(range(len(Gn)), desc='computing GEDs', file=sys.stdout):
for i2 in range(len(Gn)):
dis, _, _ = GED(Gn[i1], Gn[i2], lib='gedlib')
ged_matrix[i1, i2] = dis
print(np.sort(ged_matrix[idx_gi[0] + 1]))
print(np.argsort(ged_matrix[idx_gi[0] + 1]))
print(np.sort(ged_matrix[idx_gi[1] + 1]))
print(np.argsort(ged_matrix[idx_gi[1] + 1]))
# for all g in Gn, compute (GED(g1, g) + GED(g2, g)) / 2
dis_median = [(ged_matrix[i, idx_gi[0] + 1] + ged_matrix[i, idx_gi[1] + 1]) / 2 for i in range(len(Gn))]
print(np.sort(dis_median))
print(np.argsort(dis_median))
return


def test_will_IAM_give_the_median_graph_we_wanted(Gn):
idx_gi = [0, 6]
g1 = Gn[idx_gi[0]].copy()
g2 = Gn[idx_gi[1]].copy()
# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_with_more_graphs_as_init([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_with_more_graphs_as_init(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))
def test_new_IAM_allGraph_deleteNodes(Gn):
idx_gi = [0, 6]
# g1 = Gn[idx_gi[0]].copy()
# g2 = Gn[idx_gi[1]].copy()

g1 = nx.Graph(name='haha')
g1.add_nodes_from([(2, {'atom': 'C'}), (3, {'atom': 'O'}), (4, {'atom': 'C'})])
g1.add_edges_from([(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
g2 = nx.Graph(name='hahaha')
g2.add_nodes_from([(0, {'atom': 'C'}), (1, {'atom': 'O'}), (2, {'atom': 'C'}),
(3, {'atom': 'O'}), (4, {'atom': 'C'})])
g2.add_edges_from([(0, 1, {'bond_type': '1'}), (1, 2, {'bond_type': '1'}),
(2, 3, {'bond_type': '1'}), (3, 4, {'bond_type': '1'})])
# g2 = g1.copy()
# g2.add_nodes_from([(3, {'atom': 'O'})])
# g2.add_nodes_from([(4, {'atom': 'C'})])
# g2.add_edges_from([(1, 3, {'bond_type': '1'})])
# g2.add_edges_from([(3, 4, {'bond_type': '1'})])

# del Gn[idx_gi[0]]
# del Gn[idx_gi[1] - 1]
g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations([g1, g2], [g1, g2], c_ei=1, c_er=1, c_es=1)
# g_median = test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(Gn, Gn, c_ei=1, c_er=1, c_es=1)
nx.draw_networkx(g_median)
plt.show()
print(g_median.nodes(data=True))
print(g_median.edges(data=True))


if __name__ == '__main__':
import sys
sys.path.insert(0, "../")
from pygraph.kernels.marginalizedKernel import marginalizedkernel
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',
# 'extra_params': {'am_sp_al_nl_el': [0, 0, 3, 1, 2]}} # node/edge symb
# ds = {'name': 'Letter-high', 'dataset': '../datasets/Letter-high/Letter-high_A.txt',
# 'extra_params': {}} # node nsymb
# ds = {'name': 'Acyclic', 'dataset': '../datasets/monoterpenoides/trainset_9.ds',
# 'extra_params': {}}
ds = {'name': 'Acyclic', 'dataset': '../datasets/acyclic/dataset_bps.ds',
'extra_params': {}} # node symb
Gn, y_all = loadDataset(ds['dataset'], extra_params=ds['extra_params'])
# Gn = Gn[0:10]
# Gn = Gn[0:20]
test_new_IAM_allGraph_deleteNodes(Gn)
test_will_IAM_give_the_median_graph_we_wanted(Gn)
test_who_is_the_closest_in_GED_space(Gn)
test_who_is_the_closest_in_kernel_space(Gn)
lmbda = 0.03 # termination probability
r_max = 10 # recursions
l = 500
alpha_range = np.linspace(0.1, 0.9, 9)
k = 5 # k nearest neighbors
alpha_range = np.linspace(0.5, 0.5, 1)
k = 20 # k nearest neighbors
# randomly select two molecules
np.random.seed(1)
idx1, idx2 = np.random.randint(0, len(Gn), 2)
g1 = Gn[idx1]
g2 = Gn[idx2]
idx_gi = [0, 6] # np.random.randint(0, len(Gn), 2)
g1 = Gn[idx_gi[0]]
g2 = Gn[idx_gi[1]]
# g_tmp = iam([g1, g2])
# nx.draw_networkx(g_tmp)
# plt.show()
# compute
k_list = [] # kernel between each graph and itself.
k_g1_list = [] # kernel between each graph and g1
k_g2_list = [] # kernel between each graph and g2
for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
ktemp = marginalizedkernel([g, g1, g2], node_label='atom', edge_label=None,
p_quit=lmbda, n_iteration=20, remove_totters=False,
n_jobs=multiprocessing.cpu_count(), verbose=False)
k_list.append(ktemp[0][0, 0])
k_g1_list.append(ktemp[0][0, 1])
k_g2_list.append(ktemp[0][0, 2])
# k_list = [] # kernel between each graph and itself.
# k_g1_list = [] # kernel between each graph and g1
# k_g2_list = [] # kernel between each graph and g2
# for ig, g in tqdm(enumerate(Gn), desc='computing self kernels', file=sys.stdout):
# ktemp = compute_kernel([g, g1, g2], 'marginalizedkernel', False)
# k_list.append(ktemp[0][0, 0])
# k_g1_list.append(ktemp[0][0, 1])
# k_g2_list.append(ktemp[0][0, 2])
km = compute_kernel(Gn, 'untilhpathkernel', True)
# k_list = np.diag(km) # kernel between each graph and itself.
# k_g1_list = km[idx_gi[0]] # kernel between each graph and g1
# k_g2_list = km[idx_gi[1]] # kernel between each graph and g2

g_best = []
dis_best = []
# for each alpha
for alpha in alpha_range:
print('alpha =', alpha)
dhat, ghat = gk_iam_nearest(Gn, alpha)
dhat, ghat = gk_iam_nearest(Gn, [alpha, 1 - alpha], idx_gi, km, k, r_max)
dis_best.append(dhat)
g_best.append(ghat)


preimage/iam.py (+448, -15)

@@ -16,18 +16,17 @@ import librariesImport, script
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import saveDataset
from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.utils import graph_isIdentical
#from pygraph.utils.utils import graph_deepcopy


def iam(Gn, node_label='atom', edge_label='bond_type'):
def iam(Gn, c_ei=3, c_er=3, c_es=1, node_label='atom', edge_label='bond_type',
connected=True):
"""See my name, then you know what I do.
"""
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
c_er = 1
c_es = 1
c_ei = 1
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
@@ -37,7 +36,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
dist_sum = 0
pi_all.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp = GED(G_p, G_p_prime)
dist_tmp, pi_tmp, _ = GED(G_p, G_p_prime)
pi_all[idx1].append(pi_tmp)
dist_sum += dist_tmp
if dist_sum < dis_min:
@@ -50,7 +49,7 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn, attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
for itr in range(0, 10):
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
@@ -138,34 +137,40 @@ def iam(Gn, node_label='atom', edge_label='bond_type'):
G_new.remove_edge(nd1, nd2)
G = G_new.copy()
# update pi_p
pi_p = []
for idx1, G_p in enumerate(Gn):
dist_tmp, pi_tmp, _ = GED(G, G_p)
pi_p.append(pi_tmp)
return G


def GED(g1, g2, lib='gedlib'):
"""
Compute GED. It is a dummy function for now.
Compute GED.
"""
if lib == 'gedlib':
# transform dataset to the 'xml' file as the GedLib required.
saveDataset([g1, g2], [None, None], group='xml', filename='ged_tmp/tmp')
script.appel()
# script.appel()
script.PyRestartEnv()
script.PyLoadGXLGraph('ged_tmp/', 'collections/tmp.xml')
script.PyLoadGXLGraph('ged_tmp/', 'ged_tmp/tmp.xml')
listID = script.PyGetGraphIds()
script.PySetEditCost("CHEM_1")
script.PySetEditCost("CHEM_2")
script.PyInitEnv()
script.PySetMethod("BIPARTITE", "")
script.PyInitMethod()
g = listID[0]
h = listID[1]
script.PyRunMethod(g, h)
liste = script.PyGetAllMap(g, h)
pi_forward, pi_backward = script.PyGetAllMap(g, h)
upper = script.PyGetUpperBound(g, h)
lower = script.PyGetLowerBound(g, h)
dis = upper + lower
pi = liste[0]
dis = (upper + lower) / 2
return dis, pi
return dis, pi_forward, pi_backward


def get_node_labels(Gn, node_label):
@@ -182,6 +187,434 @@ def get_edge_labels(Gn, edge_label):
return el


# --------------------------- These are tests --------------------------------#
def test_iam_with_more_graphs_as_init(Gn, G_candidate, c_ei=3, c_er=3, c_es=1,
node_label='atom', edge_label='bond_type'):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn = Gn[0:10]
Gn = [nx.convert_node_labels_to_integers(g) for g in Gn]
# phase 1: initialize.
# compute set-median.
dis_min = np.inf
# pi_p = []
pi_all_forward = []
pi_all_backward = []
for idx1, G_p in tqdm(enumerate(G_candidate), desc='computing GEDs', file=sys.stdout):
dist_sum = 0
pi_all_forward.append([])
pi_all_backward.append([])
for idx2, G_p_prime in enumerate(Gn):
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G_p, G_p_prime)
pi_all_forward[idx1].append(pi_tmp_forward)
pi_all_backward[idx1].append(pi_tmp_backward)
dist_sum += dist_tmp
if dist_sum <= dis_min:
dis_min = dist_sum
G = G_p.copy()
idx_min = idx1
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
pi_p_backward = pi_all_backward[idx_min]
# phase 2: iteration.
ds_attrs = get_dataset_attributes(Gn + [G], attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)
label_set = get_node_labels(Gn + [G], node_label)
for itr in range(0, 10): # @todo: the convergence condition?
G_new = G.copy()
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for nd in G.nodes():
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
idx_rdm = random.randint(0, len(idx_max) - 1)
G_new.nodes[nd][node_label] = label_list[idx_max[idx_rdm]]
else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
for nd1 in range(nx.number_of_nodes(G)):
for nd2 in range(nd1 + 1, nx.number_of_nodes(G)):
sij_norm = 0
for idx, g in enumerate(Gn):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn) * c_er / (c_er + c_ei):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
elif sij_norm < len(Gn) * c_er / (c_er + c_ei):
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
# do not change anything when equal.
G = G_new.copy()
# update pi_p
pi_p_forward = []
for G_p in Gn:
dist_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_p_forward.append(pi_tmp_forward)
return G


def test_iam_moreGraphsAsInit_tryAllPossibleBestGraphs_deleteNodesInIterations(
Gn_median, Gn_candidate, c_ei=3, c_er=3, c_es=1, node_label='atom',
edge_label='bond_type', connected=True):
"""See my name, then you know what I do.
"""
from tqdm import tqdm
# Gn_median = Gn_median[0:10]
# Gn_median = [nx.convert_node_labels_to_integers(g) for g in Gn_median]
node_ir = sys.maxsize * 2 # maximum number for C++, corresponding to node removal and insertion.
label_r = 'thanksdanny' # the label for node removal. # @todo: make sure this label cannot collide with any real node label.
ds_attrs = get_dataset_attributes(Gn_median + Gn_candidate,
attr_names=['edge_labeled', 'node_attr_dim'],
edge_label=edge_label)

def generate_graph(G, pi_p_forward, label_set):
G_new_list = [G.copy()] # all "best" graphs generated in this iteration.
# nx.draw_networkx(G)
# import matplotlib.pyplot as plt
# plt.show()
# print(pi_p_forward)
# update vertex labels.
# pre-compute h_i0 for each label.
# for label in get_node_labels(Gn, node_label):
# print(label)
# for nd in G.nodes(data=True):
# pass
if not ds_attrs['node_attr_dim']: # labels are symbolic
for ndi, (nd, _) in enumerate(G.nodes(data=True)):
h_i0_list = []
label_list = []
for label in label_set:
h_i0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if g.has_node(pi_i) and g.nodes[pi_i][node_label] == label:
h_i0 += 1
h_i0_list.append(h_i0)
label_list.append(label)
# case when the node is to be removed.
h_i0_remove = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][ndi]
if pi_i == node_ir:
h_i0_remove += 1
h_i0_list.append(h_i0_remove)
label_list.append(label_r)
# get the best labels.
idx_max = np.argwhere(h_i0_list == np.max(h_i0_list)).flatten().tolist()
nlabel_best = [label_list[idx] for idx in idx_max]
# generate "best" graphs with regard to "best" node labels.
G_new_list_nd = []
for g in G_new_list:
for nl in nlabel_best:
g_tmp = g.copy()
if nl == label_r:
g_tmp.remove_node(nd)
else:
g_tmp.nodes[nd][node_label] = nl
G_new_list_nd.append(g_tmp)
# nx.draw_networkx(g_tmp)
# import matplotlib.pyplot as plt
# plt.show()
# print(g_tmp.nodes(data=True))
# print(g_tmp.edges(data=True))
G_new_list = G_new_list_nd[:]

else: # labels are non-symbolic
for nd in G.nodes():
Si_norm = 0
phi_i_bar = np.array([0.0 for _ in range(ds_attrs['node_attr_dim'])])
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd]
if g.has_node(pi_i): #@todo: what if no g has node? phi_i_bar = 0?
Si_norm += 1
phi_i_bar += np.array([float(itm) for itm in g.nodes[pi_i]['attributes']])
phi_i_bar /= Si_norm
G_new.nodes[nd]['attributes'] = phi_i_bar
# update edge labels and adjacency matrix.
if ds_attrs['edge_labeled']:
for nd1, nd2, _ in G.edges(data=True):
h_ij0_list = []
label_list = []
for label in get_edge_labels(Gn_median, edge_label):
h_ij0 = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
h_ij0_p = (g.has_node(pi_i) and g.has_node(pi_j) and
g.has_edge(pi_i, pi_j) and
g.edges[pi_i, pi_j][edge_label] == label)
h_ij0 += h_ij0_p
h_ij0_list.append(h_ij0)
label_list.append(label)
# choose one of the best randomly.
idx_max = np.argwhere(h_ij0_list == np.max(h_ij0_list)).flatten().tolist()
h_ij0_max = h_ij0_list[idx_max[0]]
idx_rdm = random.randint(0, len(idx_max) - 1)
best_label = label_list[idx_max[idx_rdm]]
# check whether a_ij is 0 or 1.
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1]
pi_j = pi_p_forward[idx][nd2]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if h_ij0_max > len(Gn_median) * c_er / c_es + sij_norm * (1 - (c_er + c_ei) / c_es):
if not G_new.has_edge(nd1, nd2):
G_new.add_edge(nd1, nd2)
G_new.edges[nd1, nd2][edge_label] = best_label
else:
if G_new.has_edge(nd1, nd2):
G_new.remove_edge(nd1, nd2)
else: # if edges are unlabeled
# @todo: works only for undirected graphs.
nd_list = [n for n in G.nodes()]
for g_tmp in G_new_list:
for nd1i in range(nx.number_of_nodes(G)):
nd1 = nd_list[nd1i]
for nd2i in range(nd1i + 1, nx.number_of_nodes(G)):
nd2 = nd_list[nd2i]
sij_norm = 0
for idx, g in enumerate(Gn_median):
pi_i = pi_p_forward[idx][nd1i]
pi_j = pi_p_forward[idx][nd2i]
if g.has_node(pi_i) and g.has_node(pi_j) and g.has_edge(pi_i, pi_j):
sij_norm += 1
if sij_norm > len(Gn_median) * c_er / (c_er + c_ei):
# @todo: should we consider if nd1 and nd2 in g_tmp?
# or just add the edge anyway?
if g_tmp.has_node(nd1) and g_tmp.has_node(nd2) \
and not g_tmp.has_edge(nd1, nd2):
g_tmp.add_edge(nd1, nd2)
elif sij_norm < len(Gn_median) * c_er / (c_er + c_ei):
if g_tmp.has_edge(nd1, nd2):
g_tmp.remove_edge(nd1, nd2)
# do not change anything when equal.
# find the best graph generated in this iteration and update pi_p.
# @todo: should we update all graphs generated or just the best ones?
dis_list, pi_forward_list = median_distance(G_new_list, Gn_median)
# @todo: should we remove the identical and connectivity check?
# Don't know which is faster.
G_new_list, idx_list = remove_duplicates(G_new_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# if connected == True:
# G_new_list, idx_list = remove_disconnected(G_new_list)
# pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# idx_min_list = np.argwhere(dis_list == np.min(dis_list)).flatten().tolist()
# dis_min = dis_list[idx_min_tmp_list[0]]
# pi_forward_list = [pi_forward_list[idx] for idx in idx_min_list]
# G_new_list = [G_new_list[idx] for idx in idx_min_list]
for g in G_new_list:
import matplotlib.pyplot as plt
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
return G_new_list, pi_forward_list
def median_distance(Gn, Gn_median, measure='ged', verbose=False):
dis_list = []
pi_forward_list = []
for idx, G in tqdm(enumerate(Gn), desc='computing median distances',
file=sys.stdout) if verbose else enumerate(Gn):
dis_sum = 0
pi_forward_list.append([])
for G_p in Gn_median:
dis_tmp, pi_tmp_forward, pi_tmp_backward = GED(G, G_p)
pi_forward_list[idx].append(pi_tmp_forward)
dis_sum += dis_tmp
dis_list.append(dis_sum)
return dis_list, pi_forward_list
def best_median_graphs(Gn_candidate, dis_all, pi_all_forward):
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
pi_forward_min_list = [pi_all_forward[idx] for idx in idx_min_list]
G_min_list = [Gn_candidate[idx] for idx in idx_min_list]
return G_min_list, pi_forward_min_list, dis_min
def iteration_proc(G, pi_p_forward):
G_list = [G]
pi_forward_list = [pi_p_forward]
# iterations.
for itr in range(0, 10): # @todo: the convergence condition?
# print('itr is', itr)
G_new_list = []
pi_forward_new_list = []
for idx, G in enumerate(G_list):
label_set = get_node_labels(Gn_median + [G], node_label)
G_tmp_list, pi_forward_tmp_list = generate_graph(
G, pi_forward_list[idx], label_set)
G_new_list += G_tmp_list
pi_forward_new_list += pi_forward_tmp_list
G_list = G_new_list[:]
pi_forward_list = pi_forward_new_list[:]
G_list, idx_list = remove_duplicates(G_list)
pi_forward_list = [pi_forward_list[idx] for idx in idx_list]
# import matplotlib.pyplot as plt
# for g in G_list:
# nx.draw_networkx(g)
# plt.show()
# print(g.nodes(data=True))
# print(g.edges(data=True))
return G_list, pi_forward_list # do we return all graphs or the best ones?
def remove_duplicates(Gn):
"""Remove duplicate graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
dupl = False
for g_new in Gn_new:
if graph_isIdentical(g_new, g):
dupl = True
break
if not dupl:
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list
def remove_disconnected(Gn):
"""Remove disconnected graphs from list.
"""
Gn_new = []
idx_list = []
for idx, g in enumerate(Gn):
if nx.is_connected(g):
Gn_new.append(g)
idx_list.append(idx)
return Gn_new, idx_list

# phase 1: initialize.
# compute set-median.
dis_min = np.inf
dis_all, pi_all_forward = median_distance(Gn_candidate[::-1], Gn_median)
# find all smallest distances.
idx_min_list = np.argwhere(dis_all == np.min(dis_all)).flatten().tolist()
dis_min = dis_all[idx_min_list[0]]
# phase 2: iteration.
G_list = []
for idx_min in idx_min_list[::-1]:
# print('idx_min is', idx_min)
G = Gn_candidate[idx_min].copy()
# list of edit operations.
pi_p_forward = pi_all_forward[idx_min]
# pi_p_backward = pi_all_backward[idx_min]
Gi_list, pi_i_forward_list = iteration_proc(G, pi_p_forward)
G_list += Gi_list
G_list, _ = remove_duplicates(G_list)
if connected == True:
G_list, _ = remove_disconnected(G_list)

import matplotlib.pyplot as plt
for g in G_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
# get the best median graphs
dis_all, pi_all_forward = median_distance(G_list, Gn_median)
G_min_list, pi_forward_min_list, dis_min = best_median_graphs(
G_list, dis_all, pi_all_forward)
for g in G_min_list:
nx.draw_networkx(g)
plt.show()
print(g.nodes(data=True))
print(g.edges(data=True))
return G_min_list


if __name__ == '__main__':
from pygraph.utils.graphfiles import loadDataset
ds = {'name': 'MUTAG', 'dataset': '../datasets/MUTAG/MUTAG.mat',


pygraph/kernels/treeletKernel.py (+430, -0)

@@ -0,0 +1,430 @@
"""
@author: linlin
@references: Gaüzère B, Brun L, Villemin D. Two new graphs kernels in chemoinformatics. Pattern Recognition Letters. 2012 Nov 1;33(15):2038-47.
"""

import sys
sys.path.insert(0, "../")
import time
from collections import Counter
from itertools import chain
from functools import partial
from multiprocessing import Pool
from tqdm import tqdm

import networkx as nx
import numpy as np

from pygraph.utils.graphdataset import get_dataset_attributes
from pygraph.utils.parallel import parallel_gm

def treeletkernel(*args,
sub_kernel,
node_label='atom',
edge_label='bond_type',
n_jobs=None,
verbose=True):
"""Calculate treelet graph kernels between graphs.

Parameters
----------
Gn : List of NetworkX graph
List of graphs between which the kernels are calculated.
/
G1, G2 : NetworkX graphs
Two graphs between which the kernel is calculated.
sub_kernel : function
The sub-kernel between 2 real number vectors. Each vector counts the
numbers of isomorphic treelets in a graph.
node_label : string
Node attribute used as label. The default node label is atom.
edge_label : string
Edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.

Return
------
Kmatrix : Numpy matrix
Kernel matrix, each element of which is the treelet kernel between 2 graphs.
"""
# pre-process
Gn = args[0] if len(args) == 1 else [args[0], args[1]]
Kmatrix = np.zeros((len(Gn), len(Gn)))
ds_attrs = get_dataset_attributes(Gn,
attr_names=['node_labeled', 'edge_labeled', 'is_directed'],
node_label=node_label, edge_label=edge_label)
labeled = False
if ds_attrs['node_labeled'] or ds_attrs['edge_labeled']:
labeled = True
if not ds_attrs['node_labeled']:
for G in Gn:
nx.set_node_attributes(G, '0', 'atom')
if not ds_attrs['edge_labeled']:
for G in Gn:
nx.set_edge_attributes(G, '0', 'bond_type')
start_time = time.time()
# ---- use pool.imap_unordered to parallel and track progress. ----
# get all canonical keys of all graphs before calculating kernels to save
# time, but this may cost a lot of memory for large dataset.
pool = Pool(n_jobs)
itr = zip(Gn, range(0, len(Gn)))
if len(Gn) < 100 * n_jobs:
chunksize = int(len(Gn) / n_jobs) + 1
else:
chunksize = 100
canonkeys = [[] for _ in range(len(Gn))]
getps_partial = partial(wrapper_get_canonkeys, node_label, edge_label,
labeled, ds_attrs['is_directed'])
if verbose:
iterator = tqdm(pool.imap_unordered(getps_partial, itr, chunksize),
desc='getting canonkeys', file=sys.stdout)
else:
iterator = pool.imap_unordered(getps_partial, itr, chunksize)
for i, ck in iterator:
canonkeys[i] = ck
pool.close()
pool.join()
# compute kernels.
def init_worker(canonkeys_toshare):
global G_canonkeys
G_canonkeys = canonkeys_toshare
do_partial = partial(wrapper_treeletkernel_do, sub_kernel)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(canonkeys,), n_jobs=n_jobs, verbose=verbose)
run_time = time.time() - start_time
if verbose:
print("\n --- treelet kernel matrix of size %d built in %s seconds ---"
% (len(Gn), run_time))
return Kmatrix, run_time


def _treeletkernel_do(canonkey1, canonkey2, sub_kernel):
"""Calculate treelet graph kernel between 2 graphs.
Parameters
----------
canonkey1, canonkey2 : list
List of canonical keys in 2 graphs, where each key is represented by a string.
Return
------
kernel : float
Treelet Kernel between 2 graphs.
"""
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys])
vector2 = np.array([(canonkey2[key] if (key in canonkey2.keys()) else 0) for key in keys])
kernel = np.sum(np.exp(-np.square(vector1 - vector2) / 2))
# kernel = sub_kernel(vector1, vector2)
return kernel
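
A hand-worked illustration with two hypothetical key dictionaries: for canonkey1 = {'0C': 3, '0O': 1, '1C1C': 2} and canonkey2 = {'0C': 2, '0N': 1, '1C1C': 2}, the shared keys are {'0C', '1C1C'}, the count vectors are [3, 2] and [2, 2], and the returned value is exp(-(3-2)**2/2) + exp(-(2-2)**2/2) = exp(-0.5) + 1 ≈ 1.6065. Note that, as written, the fixed Gaussian on the shared counts is applied and the sub_kernel argument is only carried through (its call is commented out).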


def wrapper_treeletkernel_do(sub_kernel, itr):
i = itr[0]
j = itr[1]
return i, j, _treeletkernel_do(G_canonkeys[i], G_canonkeys[j], sub_kernel)


def get_canonkeys(G, node_label, edge_label, labeled, is_directed):
"""Generate canonical keys of all treelets in a graph.
Parameters
----------
G : NetworkX graphs
The graph in which keys are generated.
node_label : string
node attribute used as label. The default node label is atom.
edge_label : string
edge attribute used as label. The default edge label is bond_type.
labeled : boolean
Whether the graphs are labeled. The default is True.
Return
------
canonkey/canonkey_l : dict
For unlabeled graphs, canonkey is a dictionary which records the number of
occurrences of every tree pattern. For labeled graphs, canonkey_l is the
dictionary which keeps track of the number of occurrences of every treelet.
"""
patterns = {} # a dictionary which consists of lists of patterns for all graphlets.
canonkey = {} # canonical key, a dictionary which records the number of occurrences of every tree pattern.

### structural analysis ###
### In this section, a list of patterns is generated for each graphlet,
### where every pattern is represented by nodes ordered by Morgan's
### extended labeling.
# linear patterns
patterns['0'] = G.nodes()
canonkey['0'] = nx.number_of_nodes(G)
for i in range(1, 6): # for i in range(1, 6):
patterns[str(i)] = find_all_paths(G, i, is_directed)
canonkey[str(i)] = len(patterns[str(i)])

# n-star patterns
patterns['3star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 3]
patterns['4star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 4]
patterns['5star'] = [[node] + [neighbor for neighbor in G[node]] for node in G.nodes() if G.degree(node) == 5]
# n-star patterns
canonkey['6'] = len(patterns['3star'])
canonkey['8'] = len(patterns['4star'])
canonkey['d'] = len(patterns['5star'])

# pattern 7
patterns['7'] = [] # the 1st line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for i in range(1, len(pattern)): # for each neighbor of node 0
if G.degree(pattern[i]) >= 2:
pattern_t = pattern[:]
# set the node with degree >= 2 as the 4th node
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
for neighborx in G[pattern[i]]:
if neighborx != pattern[0]:
new_pattern = pattern_t + [neighborx]
patterns['7'].append(new_pattern)
canonkey['7'] = len(patterns['7'])

# pattern 11
patterns['11'] = [] # the 4th line of Table 1 in Ref [1]
for pattern in patterns['4star']:
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 2:
pattern_t = pattern[:]
pattern_t[i], pattern_t[4] = pattern_t[4], pattern_t[i]
for neighborx in G[pattern[i]]:
if neighborx != pattern[0]:
new_pattern = pattern_t + [ neighborx ]
patterns['11'].append(new_pattern)
canonkey['b'] = len(patterns['11'])

# pattern 12
patterns['12'] = [] # the 5th line of Table 1 in Ref [1]
rootlist = [] # a list of root nodes, whose extended labels are 3
for pattern in patterns['3star']:
if pattern[0] not in rootlist: # prevent counting the same pattern twice from each of the two root nodes
rootlist.append(pattern[0])
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 3:
rootlist.append(pattern[i])
pattern_t = pattern[:]
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
for neighborx1 in G[pattern[i]]:
if neighborx1 != pattern[0]:
for neighborx2 in G[pattern[i]]:
if neighborx1 > neighborx2 and neighborx2 != pattern[0]:
new_pattern = pattern_t + [neighborx1] + [neighborx2]
# new_patterns = [ pattern + [neighborx1] + [neighborx2] for neighborx1 in G[pattern[i]] if neighborx1 != pattern[0] for neighborx2 in G[pattern[i]] if (neighborx1 > neighborx2 and neighborx2 != pattern[0]) ]
patterns['12'].append(new_pattern)
canonkey['c'] = int(len(patterns['12']) / 2)

# pattern 9
patterns['9'] = [] # the 2nd line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for pairs in [ [neighbor1, neighbor2] for neighbor1 in G[pattern[0]] if G.degree(neighbor1) >= 2 \
for neighbor2 in G[pattern[0]] if G.degree(neighbor2) >= 2 if neighbor1 > neighbor2 ]:
pattern_t = pattern[:]
# move nodes with extended labels 4 to specific position to correspond to their children
pattern_t[pattern_t.index(pairs[0])], pattern_t[2] = pattern_t[2], pattern_t[pattern_t.index(pairs[0])]
pattern_t[pattern_t.index(pairs[1])], pattern_t[3] = pattern_t[3], pattern_t[pattern_t.index(pairs[1])]
for neighborx1 in G[pairs[0]]:
if neighborx1 != pattern[0]:
for neighborx2 in G[pairs[1]]:
if neighborx2 != pattern[0]:
new_pattern = pattern_t + [neighborx1] + [neighborx2]
patterns['9'].append(new_pattern)
canonkey['9'] = len(patterns['9'])

# pattern 10
patterns['10'] = [] # the 3rd line of Table 1 in Ref [1]
for pattern in patterns['3star']:
for i in range(1, len(pattern)):
if G.degree(pattern[i]) >= 2:
for neighborx in G[pattern[i]]:
if neighborx != pattern[0] and G.degree(neighborx) >= 2:
pattern_t = pattern[:]
pattern_t[i], pattern_t[3] = pattern_t[3], pattern_t[i]
new_patterns = [ pattern_t + [neighborx] + [neighborxx] for neighborxx in G[neighborx] if neighborxx != pattern[i] ]
patterns['10'].extend(new_patterns)
canonkey['a'] = len(patterns['10'])

### labeling information ###
### In this section, a list of canonical keys is generated for every
### pattern obtained in the structural analysis section above, which is a
### string corresponding to a unique treelet. A dictionary is built to keep
### track of the amount of every treelet.
if labeled == True:
canonkey_l = {} # canonical key, a dictionary which keeps track of amount of every treelet.

# linear patterns
canonkey_t = Counter(list(nx.get_node_attributes(G, node_label).values()))
for key in canonkey_t:
canonkey_l['0' + key] = canonkey_t[key]

for i in range(1, 6): # for i in range(1, 6):
treelet = []
for pattern in patterns[str(i)]:
canonlist = list(chain.from_iterable((G.node[node][node_label], \
G[node][pattern[idx+1]][edge_label]) for idx, node in enumerate(pattern[:-1])))
canonlist.append(G.node[pattern[-1]][node_label])
canonkey_t = ''.join(canonlist)
canonkey_t = canonkey_t if canonkey_t < canonkey_t[::-1] else canonkey_t[::-1]
treelet.append(str(i) + canonkey_t)
canonkey_l.update(Counter(treelet))

# n-star patterns
for i in range(3, 6):
treelet = []
for pattern in patterns[str(i) + 'star']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:] ]
canonlist.sort()
canonkey_t = ('d' if i == 5 else str(i * 2)) + G.node[pattern[0]][node_label] + ''.join(canonlist)
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 7
treelet = []
for pattern in patterns['7']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist.sort()
canonkey_t = '7' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label]
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 11
treelet = []
for pattern in patterns['11']:
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:4] ]
canonlist.sort()
canonkey_t = 'b' + G.node[pattern[0]][node_label] + ''.join(canonlist) \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[0]][edge_label] \
+ G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 10
treelet = []
for pattern in patterns['10']:
canonkey4 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[4]][edge_label]
canonlist = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist.sort()
canonkey0 = ''.join(canonlist)
canonkey_t = 'a' + G.node[pattern[3]][node_label] \
+ G.node[pattern[4]][node_label] + G[pattern[4]][pattern[3]][edge_label] \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ canonkey4 + canonkey0
treelet.append(canonkey_t)
canonkey_l.update(Counter(treelet))

# pattern 12
treelet = []
for pattern in patterns['12']:
canonlist0 = [ G.node[leaf][node_label] + G[leaf][pattern[0]][edge_label] for leaf in pattern[1:3] ]
canonlist0.sort()
canonlist3 = [ G.node[leaf][node_label] + G[leaf][pattern[3]][edge_label] for leaf in pattern[4:6] ]
canonlist3.sort()
# 2 possible key can be generated from 2 nodes with extended label 3, select the one with lower lexicographic order.
canonkey_t1 = 'c' + G.node[pattern[0]][node_label] \
+ ''.join(canonlist0) \
+ G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label] \
+ ''.join(canonlist3)

canonkey_t2 = 'c' + G.node[pattern[3]][node_label] \
+ ''.join(canonlist3) \
+ G.node[pattern[0]][node_label] + G[pattern[0]][pattern[3]][edge_label] \
+ ''.join(canonlist0)

treelet.append(canonkey_t1 if canonkey_t1 < canonkey_t2 else canonkey_t2)
canonkey_l.update(Counter(treelet))

# pattern 9
treelet = []
for pattern in patterns['9']:
canonkey2 = G.node[pattern[4]][node_label] + G[pattern[4]][pattern[2]][edge_label]
canonkey3 = G.node[pattern[5]][node_label] + G[pattern[5]][pattern[3]][edge_label]
prekey2 = G.node[pattern[2]][node_label] + G[pattern[2]][pattern[0]][edge_label]
prekey3 = G.node[pattern[3]][node_label] + G[pattern[3]][pattern[0]][edge_label]
if prekey2 + canonkey2 < prekey3 + canonkey3:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey2 + prekey3 + canonkey2 + canonkey3
else:
canonkey_t = G.node[pattern[1]][node_label] + G[pattern[1]][pattern[0]][edge_label] \
+ prekey3 + prekey2 + canonkey3 + canonkey2
treelet.append('9' + G.node[pattern[0]][node_label] + canonkey_t)
canonkey_l.update(Counter(treelet))

return canonkey_l

return canonkey
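
Worked by hand for a hypothetical labeled 3-node path C - O - C with every bond_type equal to '1', the returned canonkey_l would be:

# '0C': 2, '0O': 1     # single nodes, keyed by node label
# '1C1O': 2            # the two edges; 'C1O' is kept over its reverse 'O1C'
# '2C1O1C': 1          # the single path of length 2 (a palindrome)
# no star-based keys ('6', '7', '8', '9', 'a', 'b', 'c', 'd' prefixes) appear,
# since no node has degree >= 3.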


def wrapper_get_canonkeys(node_label, edge_label, labeled, is_directed, itr_item):
g = itr_item[0]
i = itr_item[1]
return i, get_canonkeys(g, node_label, edge_label, labeled, is_directed)

def find_paths(G, source_node, length):
"""Find all paths with a certain length those start from a source node.
A recursive depth first search is applied.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
source_node : integer
The node from which all paths start.
length : integer
The length of paths.
Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
if length == 0:
return [[source_node]]
path = [[source_node] + path for neighbor in G[source_node] \
for path in find_paths(G, neighbor, length - 1) if source_node not in path]
return path
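
Worked by hand on a hypothetical 3-node path graph 0 - 1 - 2:

# find_paths(G, 0, 2) -> [[0, 1, 2]]
# find_paths(G, 1, 1) -> [[1, 0], [1, 2]]
# find_paths(G, 1, 2) -> []   (no simple path of length 2 starts at the middle node)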


def find_all_paths(G, length, is_directed):
"""Find all paths with a certain length in a graph. A recursive depth first
search is applied.
Parameters
----------
G : NetworkX graphs
The graph in which paths are searched.
length : integer
The length of paths.
Return
------
path : list of list
List of paths retrieved, where each path is represented by a list of nodes.
"""
all_paths = []
for node in G:
all_paths.extend(find_paths(G, node, length))
if not is_directed:
# For each path, two representations are retrieved, one from each of its two extremities.
# Remove one of them.
all_paths_r = [path[::-1] for path in all_paths]
for idx, path in enumerate(all_paths[:-1]):
for path2 in all_paths_r[idx+1::]:
if path == path2:
all_paths[idx] = []
break
all_paths = list(filter(lambda a: a != [], all_paths))
return all_paths
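
The quadratic reverse-duplicate removal above can also be done in one pass by keeping a canonical orientation of each path in a set; a sketch (a hypothetical helper):

def _dedup_reversed(all_paths):
    seen = set()
    unique = []
    for p in all_paths:
        key = min(tuple(p), tuple(reversed(p)))  # one canonical orientation per path
        if key not in seen:
            seen.add(key)
            unique.append(p)
    return unique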

pygraph/kernels/untilHPathKernel.py (+2, -1)

@@ -31,6 +31,7 @@ def untilhpathkernel(*args,
n_jobs=None,
verbose=True):
"""Calculate path graph kernels up to depth/hight h between graphs.
Parameters
----------
Gn : List of NetworkX graph
@@ -124,7 +125,7 @@ def untilhpathkernel(*args,
def init_worker(trie_toshare):
global G_trie
G_trie = trie_toshare
do_partial = partial(wrapper_uhpath_do_trie, k_func)
do_partial = partial(wrapper_uhpath_do_trie, k_func)
parallel_gm(do_partial, Kmatrix, Gn, init_worker=init_worker,
glbv=(all_paths,), n_jobs=n_jobs, verbose=verbose)
else:


pygraph/utils/graphfiles.py (+19, -18)

@@ -84,7 +84,7 @@ def loadGXL(filename):
return g


def saveGXL(graph, filename, method='benoit'):
def saveGXL(graph, filename, method='gedlib'):
if method == 'benoit':
import xml.etree.ElementTree as ET
root_node = ET.Element('gxl')
@@ -124,23 +124,24 @@ def saveGXL(graph, filename, method='benoit'):
tree.write(filename)
elif method == 'gedlib':
# reference: https://github.com/dbblumenthal/gedlib/blob/master/data/generate_molecules.py#L22
pass
# gxl_file = open(filename, 'w')
# gxl_file.write("<?xml version=\"1.0\"?>\n")
# gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
# gxl_file.write("<gxl>\n")
# gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"false\" edgemode=\"undirected\">\n")
# for v in graph:
# gxl_file.write("<node id=\"_" + str(v) + "\">\n")
# gxl_file.write("<attr name=\"chem\"><int>" + str(self.node_labels[node]) + "</int></attr>\n")
# gxl_file.write("</node>\n")
# for edge in self.edge_list:
# gxl_file.write("<edge from=\"_" + str(edge[0]) + "\" to=\"_" + str(edge[1]) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>1</int></attr>\n")
# gxl_file.write("</edge>\n")
# gxl_file.write("</graph>\n")
# gxl_file.write("</gxl>\n")
# gxl_file.close()
# pass
gxl_file = open(filename, 'w')
gxl_file.write("<?xml version=\"1.0\"?>\n")
gxl_file.write("<!DOCTYPE gxl SYSTEM \"http://www.gupro.de/GXL/gxl-1.0.dtd\">\n")
gxl_file.write("<gxl>\n")
gxl_file.write("<graph id=\"" + str(graph.graph['name']) + "\" edgeids=\"true\" edgemode=\"undirected\">\n")
for v, attrs in graph.nodes(data=True):
gxl_file.write("<node id=\"_" + str(v) + "\">\n")
gxl_file.write("<attr name=\"" + "chem" + "\"><int>" + str(attrs['atom']) + "</int></attr>\n")
gxl_file.write("</node>\n")
for v1, v2, attrs in graph.edges(data=True):
gxl_file.write("<edge from=\"_" + str(v1) + "\" to=\"_" + str(v2) + "\">\n")
# gxl_file.write("<attr name=\"valence\"><int>" + str(attrs['bond_type']) + "</int></attr>\n")
gxl_file.write("<attr name=\"valence\"><int>" + "1" + "</int></attr>\n")
gxl_file.write("</edge>\n")
gxl_file.write("</graph>\n")
gxl_file.write("</gxl>\n")
gxl_file.close()


def loadSDF(filename):


pygraph/utils/kernels.py (+25, -0)

@@ -57,6 +57,27 @@ def gaussiankernel(x, y, gamma=None):
return kernel


def polynomialkernel(x, y, d=1, c=0):
"""Polynomial kernel.
Compute the polynomial kernel between x and y:

K(x, y) = (x^Ty)^d + c.

Parameters
----------
x, y : array

d : integer, default 1
c : float, default 0

Returns
-------
kernel : float
"""
return np.dot(x, y) ** d + c


def kernelsum(k1, k2, d11, d12, d21=None, d22=None, lamda1=1, lamda2=1):
"""Sum of a pair of kernels.

@@ -110,3 +131,7 @@ def kernelproduct(k1, k2, d11, d12, d21=None, d22=None, lamda=1):
else:
kernel = lamda * k1(d11, d12) * k2(d21, d22)
return kernel


if __name__ == '__main__':
o = polynomialkernel([1, 2], [3, 4], 2, 3)
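
For the check above, the value follows directly from the definition: np.dot([1, 2], [3, 4]) = 11, so o = 11 ** 2 + 3 = 124.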

pygraph/utils/model_selection_precomputed.py (+3, -1)

@@ -145,7 +145,8 @@ def model_selection_for_precomputed_kernel(datafile,
# Kmatrix = np.random.rand(2250, 2250)
# current_run_time = 0.1
# remove graphs whose kernels with themselves are zeros
# remove graphs whose kernels with themselves are zeros
# @todo: y not changed accordingly?
Kmatrix_diag = Kmatrix.diagonal().copy()
nb_g_ignore = 0
for idxk, diag in enumerate(Kmatrix_diag):
@@ -154,6 +155,7 @@ def model_selection_for_precomputed_kernel(datafile,
Kmatrix = np.delete(Kmatrix, (idxk - nb_g_ignore), axis=1)
nb_g_ignore += 1
# normalization
# @todo: works only for undirected graph?
Kmatrix_diag = Kmatrix.diagonal().copy()
for i in range(len(Kmatrix)):
for j in range(i, len(Kmatrix)):


pygraph/utils/utils.py (+59, -0)

@@ -1,5 +1,6 @@
import networkx as nx
import numpy as np
from copy import deepcopy
#from itertools import product

# from tqdm import tqdm
@@ -183,3 +184,61 @@ def direct_product(G1, G2, node_label, edge_label):
# gt = nx.convert_node_labels_to_integers(
# gt, first_label=0, label_attribute='label_orignal')
return gt


def graph_deepcopy(G):
"""Deep copy a graph, including deep copy of all nodes, edges and
attributes of the graph, nodes and edges.
Note
----
It is the same as the NetworkX function graph.copy(), as far as I know.
"""
# add graph attributes.
labels = {}
for k, v in G.graph.items():
labels[k] = deepcopy(v)
if G.is_directed():
G_copy = nx.DiGraph(**labels)
else:
G_copy = nx.Graph(**labels)
# add nodes
for nd, attrs in G.nodes(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_node(nd, **labels)
# add edges.
for nd1, nd2, attrs in G.edges(data=True):
labels = {}
for k, v in attrs.items():
labels[k] = deepcopy(v)
G_copy.add_edge(nd1, nd2, **labels)
return G_copy
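
A quick usage sketch of the helper above (the toy graph is hypothetical; it assumes the networkx 2.x G.nodes accessor). Because attribute values are passed through deepcopy, mutating the copy leaves the original untouched:

g = nx.Graph(name='toy')
g.add_node(0, attributes=[1.0, 2.0])
g.add_node(1, atom='C')
g.add_edge(0, 1, bond_type='1')
g_copy = graph_deepcopy(g)
g_copy.nodes[0]['attributes'][0] = 99.0
print(g.nodes[0]['attributes'])   # still [1.0, 2.0]: the list itself was copied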


def graph_isIdentical(G1, G2):
"""Check if two graphs are identical, including: same nodes, edges, node
labels/attributes, edge labels/attributes.
Notes
----
1. The types of the two graphs have to be the same.
2. Global/Graph attributes are neglected as they may contain names for graphs.
"""
# check nodes.
nlist1 = [n for n in G1.nodes(data=True)]
nlist2 = [n for n in G2.nodes(data=True)]
if not nlist1 == nlist2:
return False
# check edges.
elist1 = [n for n in G1.edges(data=True)]
elist2 = [n for n in G2.edges(data=True)]
if not elist1 == elist2:
return False
# check graph attributes.
return True
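
A matching sketch for graph_isIdentical (toy graphs again hypothetical): identical nodes, edges and attributes give True, and changing a single label flips the result:

g1 = nx.Graph()
g1.add_node(0, atom='C')
g1.add_node(1, atom='O')
g1.add_edge(0, 1, bond_type='1')
g2 = graph_deepcopy(g1)
print(graph_isIdentical(g1, g2))   # True
g2.nodes[1]['atom'] = 'N'
print(graph_isIdentical(g1, g2))   # False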
