@@ -0,0 +1,73 @@ | |||||
# -*- coding: utf-8 -*- | |||||
"""compute_distance_in_kernel_space.ipynb | |||||
Automatically generated by Colaboratory. | |||||
Original file is located at | |||||
https://colab.research.google.com/drive/17tZP6IrineQmzo9sRtfZOnHpHx6HnlMA | |||||
**This script demonstrates how to compute distance in kernel space between the image of a graph and the mean of images of a group of graphs.** | |||||
--- | |||||
**0. Install `graphkit-learn`.** | |||||
""" | |||||
"""**1. Get dataset.**""" | |||||
from gklearn.utils import Dataset | |||||
# Predefined dataset name, use dataset "MUTAG". | |||||
ds_name = 'MUTAG' | |||||
# Initialize a Dataset. | |||||
dataset = Dataset() | |||||
# Load predefined dataset "MUTAG". | |||||
dataset.load_predefined_dataset(ds_name) | |||||
len(dataset.graphs) | |||||
"""**2. Compute graph kernel.**""" | |||||
from gklearn.kernels import PathUpToH | |||||
import multiprocessing | |||||
# Initialize parameters for graph kernel computation.
kernel_options = {'depth': 3, | |||||
'k_func': 'MinMax', | |||||
'compute_method': 'trie' | |||||
} | |||||
# Initialize graph kernel. | |||||
graph_kernel = PathUpToH(node_labels=dataset.node_labels, # list of node label names. | |||||
edge_labels=dataset.edge_labels, # list of edge label names. | |||||
ds_infos=dataset.get_dataset_infos(keys=['directed']), # dataset information required for computation. | |||||
**kernel_options, # options for computation. | |||||
) | |||||
# Compute Gram matrix. | |||||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, | |||||
parallel='imap_unordered', # or None. | |||||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||||
normalize=True, # whether to return normalized Gram matrix. | |||||
verbose=2 # whether to print out results. | |||||
) | |||||
"""**3. Compute distance in kernel space.** | |||||
Given a dataset $\mathcal{G}_N$, compute the distance in kernel space between the image of $G_1 \in \mathcal{G}_N$ and the mean of images of $\mathcal{G}_k \subset \mathcal{G}_N$. | |||||
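Using the kernel trick, this distance can be written purely in terms of kernel values (a standard expansion; here the weights are $w_i = 1 / |\mathcal{G}_k|$):

$$d\big(\Phi(G_1), \sum_i w_i \Phi(G_i)\big) = \sqrt{k(G_1, G_1) - 2 \sum_i w_i\, k(G_1, G_i) + \sum_{i,j} w_i w_j\, k(G_i, G_j)},$$

so only entries of the Gram matrix are needed. The `withterm3` argument below presumably controls whether the last term (which does not depend on $G_1$) is included.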
""" | |||||
from gklearn.preimage.utils import compute_k_dis | |||||
# Index of $G_1$. | |||||
idx_1 = 10 | |||||
# Indices of graphs in $\mathcal{G}_k$. | |||||
idx_graphs = range(0, 10) | |||||
# Compute the distance in kernel space. | |||||
dis_k = compute_k_dis(idx_1, | |||||
idx_graphs, | |||||
[1 / len(idx_graphs)] * len(idx_graphs), # weights for images of graphs in $\mathcal{G}_k$; all equal when computing the mean. | |||||
gram_matrix, # Gram matrix of all graphs.
withterm3=False | |||||
) | |||||
print(dis_k) |
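# Optional sanity check (a sketch, not part of the original demo): expand the
# same distance directly from the Gram matrix entries; with withterm3=True the
# value printed here should agree with compute_k_dis.
import numpy as np
idx_k = list(idx_graphs)
w = np.full(len(idx_k), 1 / len(idx_k))
d2 = (gram_matrix[idx_1, idx_1] - 2 * w @ gram_matrix[idx_1, idx_k]
      + w @ gram_matrix[np.ix_(idx_k, idx_k)] @ w)
print(np.sqrt(d2))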
@@ -0,0 +1,87 @@ | |||||
# -*- coding: utf-8 -*- | |||||
"""compute_graph_kernel.ipynb | |||||
Automatically generated by Colaboratory. | |||||
Original file is located at | |||||
https://colab.research.google.com/drive/17Q2QCl9CAtDweGF8LiWnWoN2laeJqT0u | |||||
**This script demonstrates how to compute a graph kernel.** | |||||
--- | |||||
**0. Install `graphkit-learn`.** | |||||
""" | |||||
"""**1. Get dataset.**""" | |||||
from gklearn.utils import Dataset | |||||
# Predefined dataset name, use dataset "MUTAG". | |||||
ds_name = 'MUTAG' | |||||
# Initialize a Dataset. | |||||
dataset = Dataset() | |||||
# Load predefined dataset "MUTAG". | |||||
dataset.load_predefined_dataset(ds_name) | |||||
len(dataset.graphs) | |||||
"""**2. Compute graph kernel.**""" | |||||
from gklearn.kernels import PathUpToH | |||||
# Initialize parameters for graph kernel computation.
kernel_options = {'depth': 3, | |||||
'k_func': 'MinMax', | |||||
'compute_method': 'trie' | |||||
} | |||||
# Initialize graph kernel. | |||||
graph_kernel = PathUpToH(node_labels=dataset.node_labels, # list of node label names. | |||||
edge_labels=dataset.edge_labels, # list of edge label names. | |||||
ds_infos=dataset.get_dataset_infos(keys=['directed']), # dataset information required for computation. | |||||
**kernel_options, # options for computation. | |||||
) | |||||
print('done.') | |||||
import multiprocessing | |||||
import matplotlib.pyplot as plt | |||||
# Compute Gram matrix. | |||||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, | |||||
parallel='imap_unordered', # or None. | |||||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||||
normalize=True, # whether to return normalized Gram matrix. | |||||
verbose=2 # whether to print out results. | |||||
) | |||||
# Print results. | |||||
print() | |||||
print(gram_matrix) | |||||
print(run_time) | |||||
plt.imshow(gram_matrix) | |||||
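# Quick check (a sketch): with normalize=True the Gram matrix is expected to be
# cosine-normalized, i.e. K[i, j] / sqrt(K[i, i] * K[j, j]), so its diagonal
# should be numerically all ones.
import numpy as np
print(np.allclose(np.diag(gram_matrix), 1.0))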
import multiprocessing | |||||
# Compute graph kernels between a graph and a list of graphs.
kernel_list, run_time = graph_kernel.compute(dataset.graphs, # a list of graphs. | |||||
dataset.graphs[0], # a single graph. | |||||
parallel='imap_unordered', # or None. | |||||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||||
verbose=2 # whether to print out results. | |||||
) | |||||
# Print results. | |||||
print() | |||||
print(kernel_list) | |||||
print(run_time) | |||||
import multiprocessing | |||||
# Compute a graph kernel between two graphs.
kernel, run_time = graph_kernel.compute(dataset.graphs[0], # a single graph. | |||||
dataset.graphs[1], # another single graph. | |||||
verbose=2 # whether to print out results. | |||||
) | |||||
# Print results. | |||||
print() | |||||
print(kernel) | |||||
print(run_time) |
@@ -0,0 +1,115 @@ | |||||
# -*- coding: utf-8 -*- | |||||
"""example_median_preimege_generator.ipynb | |||||
Automatically generated by Colaboratory. | |||||
Original file is located at | |||||
https://colab.research.google.com/drive/1PIDvHOcmiLEQ5Np3bgBDdu0kLOquOMQK | |||||
**This script demonstrates how to generate a graph preimage using Boria's method.** | |||||
--- | |||||
""" | |||||
"""**1. Get dataset.**""" | |||||
from gklearn.utils import Dataset, split_dataset_by_target | |||||
# Predefined dataset name, use dataset "MAO". | |||||
ds_name = 'MAO' | |||||
# The node/edge labels that will not be used in the computation. | |||||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||||
# Initialize a Dataset. | |||||
dataset_all = Dataset() | |||||
# Load predefined dataset "MAO". | |||||
dataset_all.load_predefined_dataset(ds_name) | |||||
# Remove irrelevant labels. | |||||
dataset_all.remove_labels(**irrelevant_labels) | |||||
# Split the whole dataset according to the classification targets. | |||||
datasets = split_dataset_by_target(dataset_all) | |||||
# Get the first class of graphs, whose median preimage will be computed. | |||||
dataset = datasets[0] | |||||
len(dataset.graphs) | |||||
"""**2. Set parameters.**""" | |||||
import multiprocessing | |||||
# Parameters for MedianPreimageGenerator (our method). | |||||
mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means using all graphs in the median set when fitting.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs. | |||||
'ds_name': ds_name, # name of the dataset. | |||||
'parallel': True, # whether the parallel scheme is to be used. | |||||
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit. | |||||
'max_itrs': 100, # maximum iteration limit to optimize edit costs. If set to 0 then no limit. | |||||
'max_itrs_without_update': 3, # if the edit costs are not updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number. | |||||
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number. | |||||
'verbose': 2 # whether to print out results. | |||||
} | |||||
# Parameters for graph kernel computation. | |||||
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h. | |||||
'depth': 9, | |||||
'k_func': 'MinMax', | |||||
'compute_method': 'trie', | |||||
'parallel': 'imap_unordered', # or None | |||||
'n_jobs': multiprocessing.cpu_count(), | |||||
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs. | |||||
'verbose': 2 # whether to print out results. | |||||
} | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
'initial_solutions': 10, # when bigger than 1, the method is considered mIPFP.
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed as the Euclidean distance.
'ratio_runs_from_initial_solutions': 1,
'threads': multiprocessing.cpu_count(), # parallel threads. Does not take effect if mpg_options['parallel'] = False.
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
} | |||||
# Parameters for MedianGraphEstimator (Boria's method). | |||||
mge_options = {'init_type': 'MEDOID', # how to initialize the median (i.e., compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initializations when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit. | |||||
'verbose': 2, # whether to print out results. | |||||
'refine': False # whether to refine the final SODs or not. | |||||
} | |||||
print('done.') | |||||
"""**3. Run median preimage generator.**""" | |||||
from gklearn.preimage import MedianPreimageGenerator | |||||
# Create median preimage generator instance. | |||||
mpg = MedianPreimageGenerator() | |||||
# Add dataset. | |||||
mpg.dataset = dataset | |||||
# Set parameters. | |||||
mpg.set_options(**mpg_options.copy()) | |||||
mpg.kernel_options = kernel_options.copy() | |||||
mpg.ged_options = ged_options.copy() | |||||
mpg.mge_options = mge_options.copy() | |||||
# Run. | |||||
mpg.run() | |||||
"""**4. Get results.**""" | |||||
# Get results. | |||||
import pprint | |||||
pp = pprint.PrettyPrinter(indent=4) # pretty print | |||||
results = mpg.get_results() | |||||
pp.pprint(results) | |||||
# Draw generated graphs. | |||||
def draw_graph(graph): | |||||
import matplotlib.pyplot as plt | |||||
import networkx as nx | |||||
plt.figure() | |||||
pos = nx.spring_layout(graph) | |||||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True) | |||||
plt.show() | |||||
plt.clf() | |||||
plt.close() | |||||
draw_graph(mpg.set_median) | |||||
draw_graph(mpg.gen_median) |
@@ -8,8 +8,9 @@ __author__ = "Linlin Jia" | |||||
__date__ = "November 2018" | __date__ = "November 2018" | ||||
from gklearn.kernels.graph_kernel import GraphKernel | from gklearn.kernels.graph_kernel import GraphKernel | ||||
from gklearn.kernels.structural_sp import StructuralSP | |||||
from gklearn.kernels.marginalized import Marginalized | |||||
from gklearn.kernels.shortest_path import ShortestPath | from gklearn.kernels.shortest_path import ShortestPath | ||||
from gklearn.kernels.structural_sp import StructuralSP | |||||
from gklearn.kernels.path_up_to_h import PathUpToH | from gklearn.kernels.path_up_to_h import PathUpToH | ||||
from gklearn.kernels.treelet import Treelet | from gklearn.kernels.treelet import Treelet | ||||
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree | from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree |
@@ -0,0 +1,338 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Wed Jun 3 22:22:57 2020 | |||||
@author: ljia | |||||
@references: | |||||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||||
labeled graphs. In Proceedings of the 20th International Conference on | |||||
Machine Learning, Washington, DC, United States, 2003. | |||||
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||||
Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||||
Proceedings of the twenty-first international conference on Machine | |||||
learning, page 70. ACM, 2004. | |||||
""" | |||||
import sys | |||||
from multiprocessing import Pool | |||||
from tqdm import tqdm | |||||
import numpy as np | |||||
import networkx as nx | |||||
from gklearn.utils import SpecialLabel | |||||
from gklearn.utils.kernels import deltakernel | |||||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||||
from gklearn.utils.utils import untotterTransformation | |||||
from gklearn.kernels import GraphKernel | |||||
class Marginalized(GraphKernel): | |||||
def __init__(self, **kwargs): | |||||
GraphKernel.__init__(self) | |||||
self.__node_labels = kwargs.get('node_labels', []) | |||||
self.__edge_labels = kwargs.get('edge_labels', []) | |||||
self.__p_quit = kwargs.get('p_quit', 0.5) | |||||
self.__n_iteration = kwargs.get('n_iteration', 10) | |||||
self.__remove_totters = kwargs.get('remove_totters', False) | |||||
self.__ds_infos = kwargs.get('ds_infos', {}) | |||||
self.__n_iteration = int(self.__n_iteration) | |||||
def _compute_gm_series(self): | |||||
self.__add_dummy_labels(self._graphs) | |||||
if self.__remove_totters: | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout) | |||||
else: | |||||
iterator = self._graphs | |||||
# @todo: this may not work. | |||||
self._graphs = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] | |||||
# compute Gram matrix. | |||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||||
from itertools import combinations_with_replacement | |||||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) | |||||
else: | |||||
iterator = itr | |||||
for i, j in iterator: | |||||
kernel = self.__kernel_do(self._graphs[i], self._graphs[j]) | |||||
gram_matrix[i][j] = kernel | |||||
gram_matrix[j][i] = kernel # @todo: no directed graph considered? | |||||
return gram_matrix | |||||
def _compute_gm_imap_unordered(self): | |||||
self.__add_dummy_labels(self._graphs) | |||||
if self.__remove_totters: | |||||
pool = Pool(self._n_jobs) | |||||
itr = range(0, len(self._graphs)) | |||||
if len(self._graphs) < 100 * self._n_jobs: | |||||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
remove_fun = self._wrapper_untotter | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize), | |||||
desc='removing tottering', file=sys.stdout) | |||||
else: | |||||
iterator = pool.imap_unordered(remove_fun, itr, chunksize) | |||||
for i, g in iterator: | |||||
self._graphs[i] = g | |||||
pool.close() | |||||
pool.join() | |||||
# compute Gram matrix. | |||||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||||
def init_worker(gn_toshare): | |||||
global G_gn | |||||
G_gn = gn_toshare | |||||
do_fun = self._wrapper_kernel_do | |||||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||||
return gram_matrix | |||||
def _compute_kernel_list_series(self, g1, g_list): | |||||
self.__add_dummy_labels(g_list + [g1]) | |||||
if self.__remove_totters: | |||||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout) | |||||
else: | |||||
iterator = g_list | |||||
# @todo: this may not work. | |||||
g_list = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] | |||||
# compute kernel list. | |||||
kernel_list = [None] * len(g_list) | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) | |||||
else: | |||||
iterator = range(len(g_list)) | |||||
for i in iterator: | |||||
kernel = self.__kernel_do(g1, g_list[i]) | |||||
kernel_list[i] = kernel | |||||
return kernel_list | |||||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||||
self.__add_dummy_labels(g_list + [g1]) | |||||
if self.__remove_totters: | |||||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||||
pool = Pool(self._n_jobs) | |||||
itr = range(0, len(g_list)) | |||||
if len(g_list) < 100 * self._n_jobs: | |||||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
remove_fun = self._wrapper_untotter | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize), | |||||
desc='removing tottering', file=sys.stdout) | |||||
else: | |||||
iterator = pool.imap_unordered(remove_fun, itr, chunksize) | |||||
for i, g in iterator: | |||||
g_list[i] = g | |||||
pool.close() | |||||
pool.join() | |||||
# compute kernel list. | |||||
kernel_list = [None] * len(g_list) | |||||
def init_worker(g1_toshare, g_list_toshare): | |||||
global G_g1, G_g_list | |||||
G_g1 = g1_toshare | |||||
G_g_list = g_list_toshare | |||||
do_fun = self._wrapper_kernel_list_do | |||||
def func_assign(result, var_to_assign): | |||||
var_to_assign[result[0]] = result[1] | |||||
itr = range(len(g_list)) | |||||
len_itr = len(g_list) | |||||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||||
n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) | |||||
return kernel_list | |||||
def _wrapper_kernel_list_do(self, itr): | |||||
return itr, self.__kernel_do(G_g1, G_g_list[itr]) | |||||
def _compute_single_kernel_series(self, g1, g2): | |||||
self.__add_dummy_labels([g1] + [g2]) | |||||
if self.__remove_totters: | |||||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||||
g2 = untotterTransformation(g2, self.__node_label, self.__edge_label) | |||||
kernel = self.__kernel_do(g1, g2) | |||||
return kernel | |||||
def __kernel_do(self, g1, g2): | |||||
"""Calculate marginalized graph kernel between 2 graphs. | |||||
Parameters | |||||
---------- | |||||
g1, g2 : NetworkX graphs | |||||
2 graphs between which the kernel is calculated. | |||||
Return | |||||
------ | |||||
kernel : float | |||||
Marginalized kernel between 2 graphs. | |||||
""" | |||||
# init parameters | |||||
kernel = 0 | |||||
num_nodes_G1 = nx.number_of_nodes(g1) | |||||
num_nodes_G2 = nx.number_of_nodes(g2) | |||||
# the initial probability distribution in the random walks generating step | |||||
# (uniform distribution over |G|) | |||||
p_init_G1 = 1 / num_nodes_G1 | |||||
p_init_G2 = 1 / num_nodes_G2 | |||||
q = self.__p_quit * self.__p_quit | |||||
r1 = q | |||||
# # initial R_inf | |||||
# # matrix to save all the R_inf for all pairs of nodes | |||||
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) | |||||
# | |||||
# # calculate R_inf with a simple interative method | |||||
# for i in range(1, n_iteration): | |||||
# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) | |||||
# R_inf_new.fill(r1) | |||||
# | |||||
# # calculate R_inf for each pair of nodes | |||||
# for node1 in g1.nodes(data=True): | |||||
# neighbor_n1 = g1[node1[0]] | |||||
# # the transition probability distribution in the random walks | |||||
# # generating step (uniform distribution over the vertices adjacent | |||||
# # to the current vertex) | |||||
# if len(neighbor_n1) > 0: | |||||
# p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | |||||
# for node2 in g2.nodes(data=True): | |||||
# neighbor_n2 = g2[node2[0]] | |||||
# if len(neighbor_n2) > 0: | |||||
# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | |||||
# | |||||
# for neighbor1 in neighbor_n1: | |||||
# for neighbor2 in neighbor_n2: | |||||
# t = p_trans_n1 * p_trans_n2 * \ | |||||
# deltakernel(g1.node[neighbor1][node_label], | |||||
# g2.node[neighbor2][node_label]) * \ | |||||
# deltakernel( | |||||
# neighbor_n1[neighbor1][edge_label], | |||||
# neighbor_n2[neighbor2][edge_label]) | |||||
# | |||||
# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ | |||||
# neighbor2] # ref [1] equation (8) | |||||
# R_inf[:] = R_inf_new | |||||
# | |||||
# # add elements of R_inf up and calculate kernel | |||||
# for node1 in g1.nodes(data=True): | |||||
# for node2 in g2.nodes(data=True): | |||||
# s = p_init_G1 * p_init_G2 * deltakernel( | |||||
# node1[1][node_label], node2[1][node_label]) | |||||
# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) | |||||
R_inf = {} # dict to save all the R_inf for all pairs of nodes | |||||
# initial R_inf, the 1st iteration. | |||||
for node1 in g1.nodes(): | |||||
for node2 in g2.nodes(): | |||||
# R_inf[(node1[0], node2[0])] = r1 | |||||
if len(g1[node1]) > 0: | |||||
if len(g2[node2]) > 0: | |||||
R_inf[(node1, node2)] = r1 | |||||
else: | |||||
R_inf[(node1, node2)] = self.__p_quit | |||||
else: | |||||
if len(g2[node2]) > 0: | |||||
R_inf[(node1, node2)] = self.__p_quit | |||||
else: | |||||
R_inf[(node1, node2)] = 1 | |||||
# compute all transition probabilities first.
t_dict = {} | |||||
if self.__n_iteration > 1: | |||||
for node1 in g1.nodes(): | |||||
neighbor_n1 = g1[node1] | |||||
# the transition probability distribution in the random walks | |||||
# generating step (uniform distribution over the vertices adjacent | |||||
# to the current vertex) | |||||
if len(neighbor_n1) > 0: | |||||
p_trans_n1 = (1 - self.__p_quit) / len(neighbor_n1) | |||||
for node2 in g2.nodes(): | |||||
neighbor_n2 = g2[node2] | |||||
if len(neighbor_n2) > 0: | |||||
p_trans_n2 = (1 - self.__p_quit) / len(neighbor_n2) | |||||
for neighbor1 in neighbor_n1: | |||||
for neighbor2 in neighbor_n2: | |||||
t_dict[(node1, node2, neighbor1, neighbor2)] = \ | |||||
p_trans_n1 * p_trans_n2 * \ | |||||
deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self.__node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self.__node_labels)) * \ | |||||
deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self.__edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self.__edge_labels)) | |||||
# calculate R_inf with a simple iterative method
for i in range(2, self.__n_iteration + 1): | |||||
R_inf_old = R_inf.copy() | |||||
# calculate R_inf for each pair of nodes | |||||
for node1 in g1.nodes(): | |||||
neighbor_n1 = g1[node1] | |||||
# the transition probability distribution in the random walks | |||||
# generating step (uniform distribution over the vertices adjacent | |||||
# to the current vertex) | |||||
if len(neighbor_n1) > 0: | |||||
for node2 in g2.nodes(): | |||||
neighbor_n2 = g2[node2] | |||||
if len(neighbor_n2) > 0: | |||||
R_inf[(node1, node2)] = r1 | |||||
for neighbor1 in neighbor_n1: | |||||
for neighbor2 in neighbor_n2: | |||||
R_inf[(node1, node2)] += \ | |||||
(t_dict[(node1, node2, neighbor1, neighbor2)] * \ | |||||
R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | |||||
# add elements of R_inf up and calculate kernel | |||||
for (n1, n2), value in R_inf.items(): | |||||
s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self.__node_labels), tuple(g2.nodes[n2][nl] for nl in self.__node_labels)) | |||||
kernel += s * value # ref [1] equation (6) | |||||
return kernel | |||||
def _wrapper_kernel_do(self, itr): | |||||
i = itr[0] | |||||
j = itr[1] | |||||
return i, j, self.__kernel_do(G_gn[i], G_gn[j]) | |||||
def _wrapper_untotter(self, i): | |||||
return i, untotterTransformation(self._graphs[i], self.__node_label, self.__edge_label) # @todo: this may not work. | |||||
def __add_dummy_labels(self, Gn): | |||||
if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self.__node_labels = [SpecialLabel.DUMMY] | |||||
if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): | |||||
for i in range(len(Gn)): | |||||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||||
self.__edge_labels = [SpecialLabel.DUMMY] |
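# A minimal usage sketch (mirroring the PathUpToH demo scripts above; the
# parameter values are only illustrative):
#
# from gklearn.utils import Dataset
# from gklearn.kernels import Marginalized
# dataset = Dataset()
# dataset.load_predefined_dataset('MUTAG')
# graph_kernel = Marginalized(node_labels=dataset.node_labels,
#                             edge_labels=dataset.edge_labels,
#                             ds_infos=dataset.get_dataset_infos(keys=['directed']),
#                             p_quit=0.5, n_iteration=10, remove_totters=False)
# gram_matrix, run_time = graph_kernel.compute(dataset.graphs, parallel=None,
#                                              normalize=True, verbose=2)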
@@ -195,7 +195,7 @@ class Treelet(GraphKernel): | |||||
Return | Return | ||||
------ | ------ | ||||
kernel : float | kernel : float | ||||
Treelet Kernel between 2 graphs. | |||||
Treelet kernel between 2 graphs. | |||||
""" | """ | ||||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | ||||
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | ||||
@@ -12,4 +12,6 @@ __date__ = "March 2020" | |||||
from gklearn.preimage.preimage_generator import PreimageGenerator | from gklearn.preimage.preimage_generator import PreimageGenerator | ||||
from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator | from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator | ||||
from gklearn.preimage.random_preimage_generator import RandomPreimageGenerator | |||||
from gklearn.preimage.kernel_knn_cv import kernel_knn_cv | from gklearn.preimage.kernel_knn_cv import kernel_knn_cv | ||||
from gklearn.preimage.generate_random_preimages_by_class import generate_random_preimages_by_class |
@@ -0,0 +1,262 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Mon Jun 1 11:37:57 2020 | |||||
@author: ljia | |||||
""" | |||||
import multiprocessing | |||||
import numpy as np | |||||
import networkx as nx | |||||
import os | |||||
from gklearn.utils.graphfiles import saveGXL | |||||
from gklearn.preimage import RandomPreimageGenerator | |||||
from gklearn.utils import Dataset | |||||
dir_root = '../results/xp_random_preimage_generation/' | |||||
def xp_random_preimage_generation(kernel_name): | |||||
""" | |||||
An experiment similar to the one in Bakir's paper; a test to check whether the RandomPreimageGenerator class works correctly.
Returns | |||||
------- | |||||
None. | |||||
""" | |||||
alpha1_list = np.linspace(0, 1, 11) | |||||
k_dis_datasets = [] | |||||
k_dis_preimages = [] | |||||
preimages = [] | |||||
bests_from_dataset = [] | |||||
for alpha1 in alpha1_list: | |||||
print('alpha1 =', alpha1, ':\n') | |||||
# set parameters. | |||||
ds_name = 'MUTAG' | |||||
rpg_options = {'k': 5, | |||||
'r_max': 10, # | |||||
'l': 500, | |||||
'alphas': None, | |||||
'parallel': True, | |||||
'verbose': 2} | |||||
if kernel_name == 'PathUpToH': | |||||
kernel_options = {'name': 'PathUpToH', | |||||
'depth': 2, # | |||||
'k_func': 'MinMax', # | |||||
'compute_method': 'trie', | |||||
'parallel': 'imap_unordered', | |||||
# 'parallel': None, | |||||
'n_jobs': multiprocessing.cpu_count(), | |||||
'normalize': True, | |||||
'verbose': 0} | |||||
elif kernel_name == 'Marginalized': | |||||
kernel_options = {'name': 'Marginalized', | |||||
'p_quit': 0.8, # | |||||
'n_iteration': 7, # | |||||
'remove_totters': False, | |||||
'parallel': 'imap_unordered', | |||||
# 'parallel': None, | |||||
'n_jobs': multiprocessing.cpu_count(), | |||||
'normalize': True, | |||||
'verbose': 0} | |||||
edge_required = True | |||||
irrelevant_labels = {'edge_labels': ['label_0']} | |||||
cut_range = None | |||||
# create/get Gram matrix. | |||||
dir_save = dir_root + ds_name + '.' + kernel_options['name'] + '/' | |||||
if not os.path.exists(dir_save): | |||||
os.makedirs(dir_save) | |||||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||||
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) | |||||
if gmfile_exist: | |||||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||||
gram_matrix_unnorm = gmfile['gram_matrix_unnorm'] | |||||
time_precompute_gm = gmfile['run_time'] | |||||
# 1. get dataset. | |||||
print('1. getting dataset...') | |||||
dataset_all = Dataset() | |||||
dataset_all.load_predefined_dataset(ds_name) | |||||
dataset_all.trim_dataset(edge_required=edge_required) | |||||
if irrelevant_labels is not None: | |||||
dataset_all.remove_labels(**irrelevant_labels) | |||||
if cut_range is not None: | |||||
dataset_all.cut_graphs(cut_range) | |||||
# # add two "random" graphs. | |||||
# g1 = nx.Graph() | |||||
# g1.add_nodes_from(range(0, 16), label_0='0') | |||||
# g1.add_nodes_from(range(16, 25), label_0='1') | |||||
# g1.add_node(25, label_0='2') | |||||
# g1.add_nodes_from([26, 27], label_0='3') | |||||
# g1.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (5, 0), (4, 9), (12, 3), (10, 13), (13, 14), (14, 15), (15, 8), (0, 16), (1, 17), (2, 18), (12, 19), (11, 20), (13, 21), (15, 22), (7, 23), (6, 24), (14, 25), (25, 26), (25, 27)]) | |||||
# g2 = nx.Graph() | |||||
# g2.add_nodes_from(range(0, 12), label_0='0') | |||||
# g2.add_nodes_from(range(12, 19), label_0='1') | |||||
# g2.add_nodes_from([19, 20, 21], label_0='2') | |||||
# g2.add_nodes_from([22, 23], label_0='3') | |||||
# g2.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 19), (19, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 20), (20, 7), (5, 0), (4, 8), (0, 12), (1, 13), (2, 14), (9, 15), (10, 16), (11, 17), (6, 18), (3, 21), (21, 22), (21, 23)]) | |||||
# dataset_all.load_graphs([g1, g2] + dataset_all.graphs, targets=None) | |||||
# 2. initialize rpg and setting parameters. | |||||
print('2. initializing rpg and setting parameters...') | |||||
# nb_graphs = len(dataset_all.graphs) - 2 | |||||
# rpg_options['alphas'] = [alpha1, 1 - alpha1] + [0] * nb_graphs | |||||
nb_graphs = len(dataset_all.graphs) | |||||
alphas = [0] * nb_graphs | |||||
alphas[1] = alpha1 | |||||
alphas[6] = 1 - alpha1 | |||||
rpg_options['alphas'] = alphas | |||||
if gmfile_exist: | |||||
rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm | |||||
rpg_options['runtime_precompute_gm'] = time_precompute_gm | |||||
rpg = RandomPreimageGenerator() | |||||
rpg.dataset = dataset_all | |||||
rpg.set_options(**rpg_options.copy()) | |||||
rpg.kernel_options = kernel_options.copy() | |||||
# 3. compute preimage. | |||||
print('3. computing preimage...') | |||||
rpg.run() | |||||
results = rpg.get_results() | |||||
k_dis_datasets.append(results['k_dis_dataset']) | |||||
k_dis_preimages.append(results['k_dis_preimage']) | |||||
bests_from_dataset.append(rpg.best_from_dataset) | |||||
preimages.append(rpg.preimage) | |||||
# 4. save results. | |||||
# write Gram matrices to file. | |||||
if not gmfile_exist: | |||||
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm=rpg.gram_matrix_unnorm, run_time=results['runtime_precompute_gm']) | |||||
# save graphs. | |||||
fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||||
node_labels=dataset_all.node_labels, edge_labels=dataset_all.edge_labels, | |||||
node_attrs=dataset_all.node_attrs, edge_attrs=dataset_all.edge_attrs) | |||||
fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||||
saveGXL(rpg.preimage, fn_preimage + '.gxl', method='default', | |||||
node_labels=dataset_all.node_labels, edge_labels=dataset_all.edge_labels, | |||||
node_attrs=dataset_all.node_attrs, edge_attrs=dataset_all.edge_attrs) | |||||
# draw graphs. | |||||
__draw_graph(rpg.best_from_dataset, fn_best_dataset) | |||||
__draw_graph(rpg.preimage, fn_preimage) | |||||
# save distances. | |||||
np.savez(dir_save + 'distances.' + ds_name + '.' + kernel_options['name'], k_dis_datasets=k_dis_datasets, k_dis_preimages=k_dis_preimages) | |||||
# plot results figure. | |||||
__plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save) | |||||
print('\ncomplete.\n') | |||||
return k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages | |||||
def __draw_graph(graph, file_prefix): | |||||
# import matplotlib | |||||
# matplotlib.use('agg') | |||||
import matplotlib.pyplot as plt | |||||
plt.figure() | |||||
pos = nx.spring_layout(graph) | |||||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'label_0'), font_color='w', width=3, with_labels=True) | |||||
plt.savefig(file_prefix + '.eps', format='eps', dpi=300) | |||||
# plt.show() | |||||
plt.clf() | |||||
plt.close() | |||||
def __plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save): | |||||
import matplotlib.pyplot as plt | |||||
fig, ax = plt.subplots(1, 1, figsize=(7, 4.5)) | |||||
ind = np.arange(len(alpha1_list)) # the x locations for the groups | |||||
width = 0.35 # the width of the bars: can also be len(x) sequence | |||||
ax.bar(ind, k_dis_preimages, width, label='Reconstructed pre-image', zorder=3, color='#133AAC') | |||||
ax.set_xlabel(r'$\alpha \in [0,1]$') | |||||
ax.set_ylabel(r'$d(g_i,g^\star(\alpha))$') | |||||
#ax.set_title('Runtime of the shortest path kernel on all datasets') | |||||
plt.xticks(ind, [str(i)[0:3] for i in alpha1_list]) | |||||
#ax.set_yticks(np.logspace(-16, -3, num=20, base=10)) | |||||
#ax.set_ylim(bottom=1e-15) | |||||
ax.grid(axis='y', zorder=0) | |||||
ax.spines['top'].set_visible(False) | |||||
ax.spines['bottom'].set_visible(False) | |||||
ax.spines['left'].set_visible(False) | |||||
ax.spines['right'].set_visible(False) | |||||
ax.xaxis.set_ticks_position('none') | |||||
ax.plot(ind, k_dis_datasets, 'b.-', label=r'Nearest neighbor in $D_N$', color='orange', zorder=4) | |||||
ax.yaxis.set_ticks_position('none') | |||||
fig.subplots_adjust(bottom=.2) | |||||
fig.legend(loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) | |||||
plt.savefig(dir_save + 'distances in kernel space.eps', format='eps', dpi=300, | |||||
transparent=True, bbox_inches='tight') | |||||
plt.show() | |||||
plt.clf() | |||||
plt.close() | |||||
if __name__ == '__main__': | |||||
# kernel_name = 'PathUpToH' | |||||
kernel_name = 'Marginalized' | |||||
k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages = xp_random_preimage_generation(kernel_name) | |||||
# # save graphs. | |||||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||||
# for i, alpha1 in enumerate(np.linspace(0, 1, 11)): | |||||
# fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||||
# saveGXL(bests_from_dataset[i], fn_best_dataset + '.gxl', method='default', | |||||
# node_labels=['label_0'], edge_labels=[], | |||||
# node_attrs=[], edge_attrs=[]) | |||||
# fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||||
# saveGXL(preimages[i], fn_preimage + '.gxl', method='default', | |||||
# node_labels=['label_0'], edge_labels=[], | |||||
# node_attrs=[], edge_attrs=[]) | |||||
# # draw graphs. | |||||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||||
# for i, alpha1 in enumerate(np.linspace(0, 1, 11)): | |||||
# fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||||
# __draw_graph(bests_from_dataset[i], fn_best_dataset) | |||||
# fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||||
# __draw_graph(preimages[i], fn_preimage) | |||||
# # plot results figure. | |||||
# alpha1_list = np.linspace(0, 1, 11) | |||||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||||
# __plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save) | |||||
# k_dis_datasets = [0.0, | |||||
# 0.08882515554098754, | |||||
# 0.17765031108197632, | |||||
# 0.2664754666229643, | |||||
# 0.35530062216395264, | |||||
# 0.44412577770494066, | |||||
# 0.35530062216395236, | |||||
# 0.2664754666229643, | |||||
# 0.17765031108197632, | |||||
# 0.08882515554098878, | |||||
# 0.0] | |||||
# k_dis_preimages = [0.0, | |||||
# 0.08882515554098754, | |||||
# 0.17765031108197632, | |||||
# 0.2664754666229643, | |||||
# 0.35530062216395264, | |||||
# 0.44412577770494066, | |||||
# 0.35530062216395236, | |||||
# 0.2664754666229643, | |||||
# 0.17765031108197632, | |||||
# 0.08882515554098878, | |||||
# 0.0] |
@@ -0,0 +1,176 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri Jun 12 10:30:17 2020 | |||||
@author: ljia | |||||
This script constructs simple preimages to test preimage methods and find bugs and shortcomings in them. | |||||
""" | |||||
def xp_simple_preimage(): | |||||
import numpy as np | |||||
"""**1. Get dataset.**""" | |||||
from gklearn.utils import Dataset, split_dataset_by_target | |||||
# Predefined dataset name, use dataset "MAO". | |||||
ds_name = 'MAO' | |||||
# The node/edge labels that will not be used in the computation. | |||||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||||
# Initialize a Dataset. | |||||
dataset_all = Dataset() | |||||
# Load predefined dataset "MAO". | |||||
dataset_all.load_predefined_dataset(ds_name) | |||||
# Remove irrelevant labels. | |||||
dataset_all.remove_labels(**irrelevant_labels) | |||||
# Split the whole dataset according to the classification targets. | |||||
datasets = split_dataset_by_target(dataset_all) | |||||
# Get the first class of graphs, whose median preimage will be computed. | |||||
dataset = datasets[0] | |||||
len(dataset.graphs) | |||||
"""**2. Set parameters.**""" | |||||
import multiprocessing | |||||
# Parameters for MedianPreimageGenerator (our method). | |||||
mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means using all graphs in the median set when fitting.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs. | |||||
'ds_name': ds_name, # name of the dataset. | |||||
'parallel': True, # whether the parallel scheme is to be used. | |||||
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit. | |||||
'max_itrs': 10, # maximum iteration limit to optimize edit costs. If set to 0 then no limit. | |||||
'max_itrs_without_update': 3, # if the edit costs are not updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number. | |||||
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number. | |||||
'verbose': 2 # whether to print out results. | |||||
} | |||||
# Parameters for graph kernel computation. | |||||
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h. | |||||
'depth': 9, | |||||
'k_func': 'MinMax', | |||||
'compute_method': 'trie', | |||||
'parallel': 'imap_unordered', # or None | |||||
'n_jobs': multiprocessing.cpu_count(), | |||||
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs. | |||||
'verbose': 2 # whether to print out results. | |||||
} | |||||
# Parameters for GED computation. | |||||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc.
'initial_solutions': 10, # when bigger than 1, the method is considered mIPFP.
'edit_cost': 'CONSTANT', # use CONSTANT cost.
'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed as the Euclidean distance.
'ratio_runs_from_initial_solutions': 1,
'threads': multiprocessing.cpu_count(), # parallel threads. Does not take effect if mpg_options['parallel'] = False.
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
} | |||||
# Parameters for MedianGraphEstimator (Boria's method). | |||||
mge_options = {'init_type': 'MEDOID', # how to initialize the median (i.e., compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initializations when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit. | |||||
'verbose': 2, # whether to print out results. | |||||
'refine': False # whether to refine the final SODs or not. | |||||
} | |||||
print('done.') | |||||
"""**3. Compute the Gram matrix and distance matrix.**""" | |||||
from gklearn.utils.utils import get_graph_kernel_by_name | |||||
# Get a graph kernel instance. | |||||
graph_kernel = get_graph_kernel_by_name(kernel_options['name'], | |||||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs, | |||||
ds_infos=dataset.get_dataset_infos(keys=['directed']), | |||||
kernel_options=kernel_options) | |||||
# Compute Gram matrix. | |||||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options) | |||||
# Compute distance matrix. | |||||
from gklearn.utils import compute_distance_matrix | |||||
dis_mat, _, _, _ = compute_distance_matrix(gram_matrix) | |||||
print('done.') | |||||
"""**4. Find the candidate graph.**""" | |||||
from gklearn.preimage.utils import compute_k_dis | |||||
# Number of the nearest neighbors. | |||||
k_neighbors = 10 | |||||
# For each graph G in dataset, compute the distance between its image \Phi(G) and the mean of its neighbors' images. | |||||
dis_min = np.inf # the minimum distance between possible \Phi(G) and the mean of its neighbors. | |||||
for idx, G in enumerate(dataset.graphs): | |||||
# Find the k nearest neighbors of G. | |||||
dis_list = dis_mat[idx] # distances between \Phi(G) and the image of each graph.
idx_sort = np.argsort(dis_list) # sort distances and get the sorted indices. | |||||
idx_nearest = idx_sort[1:k_neighbors+1] # indices of the k-nearest neighbors. | |||||
dis_k_nearest = [dis_list[i] for i in idx_nearest] # the k nearest distances, excluding the zero distance to G itself.
G_k_nearest = [dataset.graphs[i] for i in idx_nearest] # k-nearest neighbors. | |||||
# Compute the distance between \Phi(G) and the mean of its neighbors. | |||||
dis_tmp = compute_k_dis(idx, # the index of G in Gram matrix. | |||||
idx_nearest, # the indices of the neighbors | |||||
[1 / k_neighbors] * k_neighbors, # coefficients for neighbors. | |||||
gram_matrix, | |||||
withterm3=False) | |||||
# Check if the new distance is smaller.
if dis_tmp < dis_min: | |||||
dis_min = dis_tmp | |||||
G_cand = G | |||||
G_neighbors = G_k_nearest | |||||
print('The minimum distance is', dis_min) | |||||
"""**5. Run median preimage generator.**""" | |||||
from gklearn.preimage import MedianPreimageGenerator | |||||
# Set the dataset as the k-nearest neighbors. | |||||
dataset.load_graphs(G_neighbors) | |||||
# Create median preimage generator instance. | |||||
mpg = MedianPreimageGenerator() | |||||
# Add dataset. | |||||
mpg.dataset = dataset | |||||
# Set parameters. | |||||
mpg.set_options(**mpg_options.copy()) | |||||
mpg.kernel_options = kernel_options.copy() | |||||
mpg.ged_options = ged_options.copy() | |||||
mpg.mge_options = mge_options.copy() | |||||
# Run. | |||||
mpg.run() | |||||
"""**4. Get results.**""" | |||||
# Get results. | |||||
import pprint | |||||
pp = pprint.PrettyPrinter(indent=4) # pretty print | |||||
results = mpg.get_results() | |||||
pp.pprint(results) | |||||
draw_graph(mpg.set_median) | |||||
draw_graph(mpg.gen_median) | |||||
draw_graph(G_cand) | |||||
# Draw generated graphs. | |||||
def draw_graph(graph): | |||||
import matplotlib.pyplot as plt | |||||
import networkx as nx | |||||
plt.figure() | |||||
pos = nx.spring_layout(graph) | |||||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True) | |||||
plt.show() | |||||
plt.clf() | |||||
plt.close() | |||||
if __name__ == '__main__': | |||||
xp_simple_preimage() |
@@ -0,0 +1,188 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Mon Jun 1 17:02:51 2020 | |||||
@author: ljia | |||||
""" | |||||
import numpy as np | |||||
from gklearn.utils import Dataset | |||||
import csv | |||||
import os | |||||
import os.path | |||||
from gklearn.preimage import RandomPreimageGenerator | |||||
from gklearn.utils import split_dataset_by_target | |||||
from gklearn.utils.graphfiles import saveGXL | |||||
def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, save_results=True, save_preimages=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False, cut_range=None): | |||||
# 1. get dataset. | |||||
print('1. getting dataset...') | |||||
dataset_all = Dataset() | |||||
dataset_all.load_predefined_dataset(ds_name) | |||||
dataset_all.trim_dataset(edge_required=edge_required) | |||||
if irrelevant_labels is not None: | |||||
dataset_all.remove_labels(**irrelevant_labels) | |||||
if cut_range is not None: | |||||
dataset_all.cut_graphs(cut_range) | |||||
datasets = split_dataset_by_target(dataset_all) | |||||
if save_results: | |||||
# create result files. | |||||
print('creating output files...') | |||||
fn_output_detail, fn_output_summary = __init_output_file_preimage(ds_name, kernel_options['name'], dir_save) | |||||
dis_k_dataset_list = [] | |||||
dis_k_preimage_list = [] | |||||
time_precompute_gm_list = [] | |||||
time_generate_list = [] | |||||
time_total_list = [] | |||||
itrs_list = [] | |||||
num_updates_list = [] | |||||
if load_gm == 'auto': | |||||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||||
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) | |||||
if gmfile_exist: | |||||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||||
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']] | |||||
time_precompute_gm_list = gmfile['run_time_list'].tolist() | |||||
else: | |||||
gram_matrix_unnorm_list = [] | |||||
time_precompute_gm_list = [] | |||||
elif not load_gm: | |||||
gram_matrix_unnorm_list = [] | |||||
time_precompute_gm_list = [] | |||||
else: | |||||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||||
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']] | |||||
time_precompute_gm_list = gmfile['run_time_list'].tolist() | |||||
print('starting to generate preimages for each class of target...')
idx_offset = 0 | |||||
for idx, dataset in enumerate(datasets): | |||||
target = dataset.targets[0] | |||||
print('\ntarget =', target, '\n') | |||||
# if target != 1: | |||||
# continue | |||||
num_graphs = len(dataset.graphs) | |||||
if num_graphs < 2: | |||||
print('\nnumber of graphs = ', num_graphs, ', skip.\n') | |||||
idx_offset += 1 | |||||
continue | |||||
# 2. set parameters. | |||||
print('2. initializing rpg and setting parameters...')
if load_gm: | |||||
if gmfile_exist: | |||||
rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx - idx_offset] | |||||
rpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx - idx_offset] | |||||
rpg = RandomPreimageGenerator() | |||||
rpg.dataset = dataset | |||||
rpg.set_options(**rpg_options.copy()) | |||||
rpg.kernel_options = kernel_options.copy() | |||||
# 3. compute preimage. | |||||
print('3. computing preimage...') | |||||
rpg.run() | |||||
results = rpg.get_results() | |||||
# 4. save results (and preimages).
print('4. saving results (and preimages)...') | |||||
# write result detail. | |||||
if save_results: | |||||
print('writing results to files...') | |||||
f_detail = open(dir_save + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow([ds_name, kernel_options['name'], | |||||
num_graphs, target, 1, | |||||
results['k_dis_dataset'], results['k_dis_preimage'], | |||||
results['runtime_precompute_gm'], | |||||
results['runtime_generate_preimage'], results['runtime_total'], | |||||
results['itrs'], results['num_updates']]) | |||||
f_detail.close() | |||||
# compute result summary. | |||||
dis_k_dataset_list.append(results['k_dis_dataset']) | |||||
dis_k_preimage_list.append(results['k_dis_preimage']) | |||||
time_precompute_gm_list.append(results['runtime_precompute_gm']) | |||||
time_generate_list.append(results['runtime_generate_preimage']) | |||||
time_total_list.append(results['runtime_total']) | |||||
itrs_list.append(results['itrs']) | |||||
num_updates_list.append(results['num_updates']) | |||||
# write result summary for each target.
f_summary = open(dir_save + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow([ds_name, kernel_options['name'], | |||||
num_graphs, target, | |||||
results['k_dis_dataset'], results['k_dis_preimage'], | |||||
results['runtime_precompute_gm'], | |||||
results['runtime_generate_preimage'], results['runtime_total'], | |||||
results['itrs'], results['num_updates']]) | |||||
f_summary.close() | |||||
# save preimages.
if save_preimages: | |||||
if not os.path.exists(dir_save + 'preimages/'): | |||||
os.makedirs(dir_save + 'preimages/') | |||||
print('Saving preimages to files...') | |||||
fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) | |||||
fn_preimage = dir_save + 'preimages/g_preimage.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||||
saveGXL(rpg.preimage, fn_preimage + '.gxl', method='default', | |||||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) | |||||
if (load_gm == 'auto' and not gmfile_exist) or not load_gm: | |||||
gram_matrix_unnorm_list.append(rpg.gram_matrix_unnorm) | |||||
# write overall result summary (means over all classes).
if save_results: | |||||
dis_k_dataset_mean = np.mean(dis_k_dataset_list) | |||||
dis_k_preimage_mean = np.mean(dis_k_preimage_list) | |||||
time_precompute_gm_mean = np.mean(time_precompute_gm_list) | |||||
time_generate_mean = np.mean(time_generate_list) | |||||
time_total_mean = np.mean(time_total_list) | |||||
itrs_mean = np.mean(itrs_list) | |||||
num_updates_mean = np.mean(num_updates_list) | |||||
f_summary = open(dir_save + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow([ds_name, kernel_options['name'], | |||||
num_graphs, 'all', | |||||
dis_k_dataset_mean, dis_k_preimage_mean, | |||||
time_precompute_gm_mean, | |||||
time_generate_mean, time_total_mean, itrs_mean, | |||||
num_updates_mean]) | |||||
f_summary.close() | |||||
# write Gram matrices to file. | |||||
if (load_gm == 'auto' and not gmfile_exist) or not load_gm: | |||||
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list) | |||||
print('\ncomplete.\n') | |||||
def __init_output_file_preimage(ds_name, gkernel, dir_output): | |||||
if not os.path.exists(dir_output): | |||||
os.makedirs(dir_output) | |||||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||||
f_detail = open(dir_output + fn_output_detail, 'a') | |||||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | |||||
'target', 'repeat', 'dis_k best from dataset', 'dis_k preimage', | |||||
'time precompute gm', 'time generate preimage', 'time total', | |||||
'itrs', 'num updates']) | |||||
f_detail.close() | |||||
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv' | |||||
f_summary = open(dir_output + fn_output_summary, 'a') | |||||
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'num graphs', | |||||
'target', 'dis_k best from dataset', 'dis_k preimage', | |||||
'time precompute gm', 'time generate preimage', 'time total', | |||||
'itrs', 'num updates']) | |||||
f_summary.close() | |||||
return fn_output_detail, fn_output_summary |
@@ -19,7 +19,7 @@ from gklearn.ged.median import constant_node_costs,mge_options_to_string | |||||
from gklearn.gedlib import librariesImport, gedlibpy | from gklearn.gedlib import librariesImport, gedlibpy | ||||
from gklearn.utils import Timer | from gklearn.utils import Timer | ||||
from gklearn.utils.utils import get_graph_kernel_by_name | from gklearn.utils.utils import get_graph_kernel_by_name | ||||
# from gklearn.utils.dataset import Dataset | |||||
class MedianPreimageGenerator(PreimageGenerator): | class MedianPreimageGenerator(PreimageGenerator): | ||||
@@ -127,8 +127,7 @@ class MedianPreimageGenerator(PreimageGenerator): | |||||
# 3. compute set median and gen median using optimized edit costs. | # 3. compute set median and gen median using optimized edit costs. | ||||
if self._verbose >= 2: | if self._verbose >= 2: | ||||
print('\nstart computing set median and gen median using optimized edit costs...\n') | print('\nstart computing set median and gen median using optimized edit costs...\n') | ||||
# group_fnames = [Gn[g].graph['filename'] for g in group_min] | |||||
self.__generate_preimage_iam() | |||||
self.__gmg_bcu() | |||||
end_generate_preimage = time.time() | end_generate_preimage = time.time() | ||||
self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec | self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec | ||||
self.__runtime_total = end_generate_preimage - start | self.__runtime_total = end_generate_preimage - start | ||||
@@ -140,19 +139,13 @@ class MedianPreimageGenerator(PreimageGenerator): | |||||
# 4. compute kernel distances to the true median. | # 4. compute kernel distances to the true median. | ||||
if self._verbose >= 2: | if self._verbose >= 2: | ||||
print('\nstart computing distances to true median....\n') | print('\nstart computing distances to true median....\n') | ||||
# Gn_median = [Gn[g].copy() for g in group_min] | |||||
self.__compute_distances_to_true_median() | self.__compute_distances_to_true_median() | ||||
# dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = | |||||
# idx_dis_k_gi_min = group_min[idx_dis_k_gi_min] | |||||
# print('index min dis_k_gi:', idx_dis_k_gi_min) | |||||
# print('sod_sm:', sod_sm) | |||||
# print('sod_gm:', sod_gm) | |||||
# 5. print out results. | # 5. print out results. | ||||
if self._verbose: | if self._verbose: | ||||
print() | print() | ||||
print('================================================================================') | print('================================================================================') | ||||
print('Finished generalization of preimages.') | |||||
print('Finished generation of preimages.') | |||||
print('--------------------------------------------------------------------------------') | print('--------------------------------------------------------------------------------') | ||||
print('The optimized edit cost constants:', self.__edit_cost_constants) | print('The optimized edit cost constants:', self.__edit_cost_constants) | ||||
print('SOD of the set median:', self.__sod_set_median) | print('SOD of the set median:', self.__sod_set_median) | ||||
@@ -169,11 +162,6 @@ class MedianPreimageGenerator(PreimageGenerator): | |||||
print('Is optimization of edit costs converged:', self.__converged) | print('Is optimization of edit costs converged:', self.__converged) | ||||
print('================================================================================') | print('================================================================================') | ||||
print() | print() | ||||
# collect return values. | |||||
# return (sod_sm, sod_gm), \ | |||||
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ | |||||
# (time_fitting, time_generating) | |||||
def get_results(self): | def get_results(self): | ||||
@@ -203,20 +191,22 @@ class MedianPreimageGenerator(PreimageGenerator):
 		"""
 		if self.__fit_method == 'random': # random
 			if self.__ged_options['edit_cost'] == 'LETTER':
-				self.__edit_cost_constants = random.sample(range(1, 10), 3)
-				self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
+				self.__edit_cost_constants = random.sample(range(1, 1000), 3)
+				self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants]
 			elif self.__ged_options['edit_cost'] == 'LETTER2':
 				random.seed(time.time())
-				self.__edit_cost_constants = random.sample(range(1, 10), 5)
-				# self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants]
+				self.__edit_cost_constants = random.sample(range(1, 1000), 5)
+				self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
 			elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
-				self.__edit_cost_constants = random.sample(range(1, 10), 6)
+				self.__edit_cost_constants = random.sample(range(1, 1000), 6)
+				self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
 				if self._dataset.node_attrs == []:
 					self.__edit_cost_constants[2] = 0
 				if self._dataset.edge_attrs == []:
 					self.__edit_cost_constants[5] = 0
 			else:
-				self.__edit_cost_constants = random.sample(range(1, 10), 6)
+				self.__edit_cost_constants = random.sample(range(1, 1000), 6)
+				self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants]
 			if self._verbose >= 2:
 				print('edit cost constants used:', self.__edit_cost_constants)
 		elif self.__fit_method == 'expert': # expert
@@ -861,7 +851,15 @@ class MedianPreimageGenerator(PreimageGenerator):
 			print()

-	def __generate_preimage_iam(self):
+	def __gmg_bcu(self):
+		"""
+		The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG).
+
+		Returns
+		-------
+		None.
+		"""
 		# Set up the ged environment.
 		ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible.
 		# gedlibpy.restart_env()
@@ -910,24 +908,24 @@ class MedianPreimageGenerator(PreimageGenerator):
 		# compute distance in kernel space for set median.
 		kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options)
 		kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options)
-		kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
+		if self._kernel_options['normalize']:
+			kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize
+			kernel_sm = 1
 		# @todo: not correct kernel value
 		gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
-		gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1)
+		gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1)
 		self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
 												[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
 												gram_with_sm, withterm3=False)
-		# print(gen_median.nodes(data=True))
-		# print(gen_median.edges(data=True))
-		# print(set_median.nodes(data=True))
-		# print(set_median.edges(data=True))
 		# compute distance in kernel space for generalized median.
 		kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options)
 		kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options)
-		kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
+		if self._kernel_options['normalize']:
+			kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize
+			kernel_gm = 1
 		gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0)
-		gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1)
+		gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1)
 		self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)),
 												[1 / len(self._dataset.graphs)] * len(self._dataset.graphs),
 												gram_with_gm, withterm3=False)
@@ -0,0 +1,389 @@ | |||||
#!/usr/bin/env python3 | |||||
# -*- coding: utf-8 -*- | |||||
""" | |||||
Created on Fri May 29 14:29:52 2020 | |||||
@author: ljia | |||||
""" | |||||
import numpy as np | |||||
import time | |||||
import sys | |||||
from tqdm import tqdm | |||||
import multiprocessing | |||||
import networkx as nx | |||||
from multiprocessing import Pool | |||||
from functools import partial | |||||
from gklearn.preimage import PreimageGenerator | |||||
from gklearn.preimage.utils import compute_k_dis | |||||
from gklearn.utils import Timer | |||||
from gklearn.utils.utils import get_graph_kernel_by_name | |||||
# from gklearn.utils.dataset import Dataset | |||||
class RandomPreimageGenerator(PreimageGenerator): | |||||
def __init__(self, dataset=None): | |||||
PreimageGenerator.__init__(self, dataset=dataset) | |||||
# arguments to set. | |||||
self.__k = 5 # number of nearest neighbors of phi in D_N. | |||||
self.__r_max = 10 # maximum number of iterations. | |||||
		self.__l = 500 # number of graphs generated for each graph in D_k U {g_i_hat}.
self.__alphas = None # weights of linear combinations of points in kernel space. | |||||
self.__parallel = True | |||||
self.__n_jobs = multiprocessing.cpu_count() | |||||
self.__time_limit_in_sec = 0 | |||||
self.__max_itrs = 20 | |||||
# values to compute. | |||||
self.__runtime_generate_preimage = None | |||||
self.__runtime_total = None | |||||
self.__preimage = None | |||||
self.__best_from_dataset = None | |||||
self.__k_dis_preimage = None | |||||
self.__k_dis_dataset = None | |||||
self.__itrs = 0 | |||||
self.__converged = False # @todo | |||||
self.__num_updates = 0 | |||||
# values that can be set or to be computed. | |||||
self.__gram_matrix_unnorm = None | |||||
self.__runtime_precompute_gm = None | |||||
def set_options(self, **kwargs): | |||||
self._kernel_options = kwargs.get('kernel_options', {}) | |||||
self._graph_kernel = kwargs.get('graph_kernel', None) | |||||
self._verbose = kwargs.get('verbose', 2) | |||||
self.__k = kwargs.get('k', 5) | |||||
self.__r_max = kwargs.get('r_max', 10) | |||||
self.__l = kwargs.get('l', 500) | |||||
self.__alphas = kwargs.get('alphas', None) | |||||
self.__parallel = kwargs.get('parallel', True) | |||||
self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||||
self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0) | |||||
self.__max_itrs = kwargs.get('max_itrs', 20) | |||||
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) | |||||
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) | |||||
def run(self): | |||||
self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'], | |||||
node_labels=self._dataset.node_labels, | |||||
edge_labels=self._dataset.edge_labels, | |||||
node_attrs=self._dataset.node_attrs, | |||||
edge_attrs=self._dataset.edge_attrs, | |||||
ds_infos=self._dataset.get_dataset_infos(keys=['directed']), | |||||
kernel_options=self._kernel_options) | |||||
# record start time. | |||||
start = time.time() | |||||
# 1. precompute gram matrix. | |||||
if self.__gram_matrix_unnorm is None: | |||||
gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options) | |||||
self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm | |||||
end_precompute_gm = time.time() | |||||
self.__runtime_precompute_gm = end_precompute_gm - start | |||||
else: | |||||
if self.__runtime_precompute_gm is None: | |||||
raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') | |||||
self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm | |||||
if self._kernel_options['normalize']: | |||||
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) | |||||
else: | |||||
self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm) | |||||
end_precompute_gm = time.time() | |||||
start -= self.__runtime_precompute_gm | |||||
# 2. compute k nearest neighbors of phi in D_N. | |||||
if self._verbose >= 2: | |||||
print('\nstart computing k nearest neighbors of phi in D_N...\n') | |||||
D_N = self._dataset.graphs | |||||
if self.__alphas is None: | |||||
self.__alphas = [1 / len(D_N)] * len(D_N) | |||||
k_dis_list = [] # distance between g_star and each graph. | |||||
term3 = 0 | |||||
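		# term3 = sum_{i1, i2} alpha_i1 * alpha_i2 * k(g_i1, g_i2); it does not depend on the
		# candidate graph, so it is computed once here and reused below via withterm3=True.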
for i1, a1 in enumerate(self.__alphas): | |||||
for i2, a2 in enumerate(self.__alphas): | |||||
term3 += a1 * a2 * self._graph_kernel.gram_matrix[i1, i2] | |||||
for idx in range(len(D_N)): | |||||
k_dis_list.append(compute_k_dis(idx, range(0, len(D_N)), self.__alphas, self._graph_kernel.gram_matrix, term3=term3, withterm3=True)) | |||||
# sort. | |||||
sort_idx = np.argsort(k_dis_list) | |||||
dis_gs = [k_dis_list[idis] for idis in sort_idx[0:self.__k]] # the k shortest distances. | |||||
nb_best = len(np.argwhere(dis_gs == dis_gs[0]).flatten().tolist()) | |||||
g0hat_list = [D_N[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in D_N | |||||
		self.__best_from_dataset = g0hat_list[0] # get the first best graph if there are multiple.
self.__k_dis_dataset = dis_gs[0] | |||||
if self.__k_dis_dataset == 0: # get the exact pre-image. | |||||
end_generate_preimage = time.time() | |||||
self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm | |||||
self.__runtime_total = end_generate_preimage - start | |||||
self.__preimage = self.__best_from_dataset.copy() | |||||
self.__k_dis_preimage = self.__k_dis_dataset | |||||
if self._verbose: | |||||
print() | |||||
print('=============================================================================') | |||||
print('The exact pre-image is found from the input dataset.') | |||||
print('-----------------------------------------------------------------------------') | |||||
print('Distance in kernel space for the best graph from dataset and for preimage:', self.__k_dis_dataset) | |||||
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) | |||||
print('Time to generate pre-images:', self.__runtime_generate_preimage) | |||||
print('Total time:', self.__runtime_total) | |||||
print('=============================================================================') | |||||
print() | |||||
return | |||||
dhat = dis_gs[0] # the nearest distance | |||||
Gk = [D_N[ig].copy() for ig in sort_idx[0:self.__k]] # the k nearest neighbors | |||||
Gs_nearest = [nx.convert_node_labels_to_integers(g) for g in Gk] # [g.copy() for g in Gk] | |||||
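		# relabeling to integer node ids (0..n-1) lets __do_trial address node pairs by index.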
# 3. start iterations. | |||||
if self._verbose >= 2: | |||||
print('starting iterations...') | |||||
gihat_list = [] | |||||
dihat_list = [] | |||||
r = 0 | |||||
dis_of_each_itr = [dhat] | |||||
if self.__parallel: | |||||
self._kernel_options['parallel'] = None | |||||
self.__itrs = 0 | |||||
self.__num_updates = 0 | |||||
timer = Timer(self.__time_limit_in_sec) | |||||
while not self.__termination_criterion_met(timer, self.__itrs, r): | |||||
print('\n- r =', r) | |||||
found = False | |||||
dis_bests = dis_gs + dihat_list | |||||
			# compute the number of edges to be inserted/deleted for each graph.
			# @todo: what if the log is negative? how to choose alpha (scalar)?
			fdgs_list = np.array(dis_bests)
			if np.min(fdgs_list) < 1: # in case the log would be negative.
				fdgs_list /= np.min(fdgs_list)
			fdgs_list = np.array([int(item) for item in np.ceil(np.log(fdgs_list))]) # as an array, so the element-wise increment further below is valid.
			if np.min(fdgs_list) < 1: # in case the ceiled log is smaller than 1.
				fdgs_list = fdgs_list + 1
			# expand the number of modifications so that enough distinct candidate graphs can be generated.
nb_vpairs_list = [nx.number_of_nodes(g) * (nx.number_of_nodes(g) - 1) for g in (Gs_nearest + gihat_list)] | |||||
nb_vpairs_min = np.min(nb_vpairs_list) | |||||
idx_fdgs_max = np.argmax(fdgs_list) | |||||
fdgs_max_old = fdgs_list[idx_fdgs_max] | |||||
fdgs_max = fdgs_max_old | |||||
nb_modif = 1 | |||||
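			# nb_modif computes C(nb_vpairs_min, fdgs_max) (as a float), the number of distinct ways
			# to toggle fdgs_max node pairs; fdgs_max is then raised in the while loop below until at
			# least l distinct candidates exist or every node pair is used.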
for idx, nb in enumerate(range(nb_vpairs_min, nb_vpairs_min - fdgs_max, -1)): | |||||
nb_modif *= nb / (fdgs_max - idx) | |||||
while fdgs_max < nb_vpairs_min and nb_modif < self.__l: | |||||
fdgs_max += 1 | |||||
nb_modif *= (nb_vpairs_min - fdgs_max + 1) / fdgs_max | |||||
nb_increase = int(fdgs_max - fdgs_max_old) | |||||
if nb_increase > 0: | |||||
fdgs_list += 1 | |||||
for ig, gs in enumerate(Gs_nearest + gihat_list): | |||||
if self._verbose >= 2: | |||||
print('-- computing', ig + 1, 'graphs out of', len(Gs_nearest) + len(gihat_list)) | |||||
gnew, dhat, found = self.__generate_l_graphs(gs, fdgs_list[ig], dhat, ig, found, term3) | |||||
if found: | |||||
r = 0 | |||||
gihat_list = [gnew] | |||||
dihat_list = [dhat] | |||||
else: | |||||
r += 1 | |||||
dis_of_each_itr.append(dhat) | |||||
self.__itrs += 1 | |||||
if self._verbose >= 2: | |||||
print('Total number of iterations is', self.__itrs, '.') | |||||
print('The preimage is updated', self.__num_updates, 'times.') | |||||
print('The shortest distances for previous iterations are', dis_of_each_itr, '.') | |||||
# get results and print. | |||||
end_generate_preimage = time.time() | |||||
self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm | |||||
self.__runtime_total = end_generate_preimage - start | |||||
self.__preimage = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0]) | |||||
self.__k_dis_preimage = dhat | |||||
if self._verbose: | |||||
print() | |||||
print('=============================================================================') | |||||
print('Finished generation of preimages.') | |||||
print('-----------------------------------------------------------------------------') | |||||
print('Distance in kernel space for the best graph from dataset:', self.__k_dis_dataset) | |||||
print('Distance in kernel space for the preimage:', self.__k_dis_preimage) | |||||
print('Total number of iterations for optimizing:', self.__itrs) | |||||
print('Total number of updating preimage:', self.__num_updates) | |||||
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) | |||||
print('Time to generate pre-images:', self.__runtime_generate_preimage) | |||||
print('Total time:', self.__runtime_total) | |||||
print('=============================================================================') | |||||
print() | |||||
def __generate_l_graphs(self, g_init, fdgs, dhat, ig, found, term3): | |||||
if self.__parallel: | |||||
gnew, dhat, found = self.__generate_l_graphs_parallel(g_init, fdgs, dhat, ig, found, term3) | |||||
else: | |||||
gnew, dhat, found = self.__generate_l_graphs_series(g_init, fdgs, dhat, ig, found, term3) | |||||
return gnew, dhat, found | |||||
def __generate_l_graphs_series(self, g_init, fdgs, dhat, ig, found, term3): | |||||
gnew = None | |||||
updated = False | |||||
for trial in range(0, self.__l): | |||||
if self._verbose >= 2: | |||||
print('---', trial + 1, 'trial out of', self.__l) | |||||
gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial) | |||||
# get the better graph preimage. | |||||
if dnew <= dhat: # @todo: the new distance is smaller or also equal? | |||||
if dhat - dnew > 1e-6: | |||||
if self._verbose >= 2: | |||||
print('trial =', str(trial)) | |||||
print('\nI am smaller!') | |||||
						print('index (as in D_k U {gihat}) =', str(ig))
print('distance:', dhat, '->', dnew) | |||||
updated = True | |||||
else: | |||||
if self._verbose >= 2: | |||||
print('I am equal!') | |||||
dhat = dnew | |||||
gnew = gtemp.copy() | |||||
found = True # found better or equally good graph. | |||||
if updated: | |||||
self.__num_updates += 1 | |||||
return gnew, dhat, found | |||||
def __generate_l_graphs_parallel(self, g_init, fdgs, dhat, ig, found, term3): | |||||
gnew = None | |||||
len_itr = self.__l | |||||
gnew_list = [None] * len_itr | |||||
dnew_list = [None] * len_itr | |||||
itr = range(0, len_itr) | |||||
n_jobs = multiprocessing.cpu_count() | |||||
if len_itr < 100 * n_jobs: | |||||
chunksize = int(len_itr / n_jobs) + 1 | |||||
else: | |||||
chunksize = 100 | |||||
do_fun = partial(self._generate_graph_parallel, g_init, fdgs, term3) | |||||
pool = Pool(processes=n_jobs) | |||||
if self._verbose >= 2: | |||||
iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), | |||||
desc='Generating l graphs', file=sys.stdout) | |||||
else: | |||||
iterator = pool.imap_unordered(do_fun, itr, chunksize) | |||||
		for idx, gnew_trial, dnew_trial in iterator: # use fresh names: gnew must stay None unless a better graph is found below.
			gnew_list[idx] = gnew_trial
			dnew_list[idx] = dnew_trial
pool.close() | |||||
pool.join() | |||||
		# check whether a better graph preimage has been found.
idx_min = np.argmin(dnew_list) | |||||
dnew = dnew_list[idx_min] | |||||
if dnew <= dhat: # @todo: the new distance is smaller or also equal? | |||||
if dhat - dnew > 1e-6: # @todo: use a proportion and watch out for 0. | |||||
if self._verbose >= 2: | |||||
print('I am smaller!') | |||||
print('index (as in D_k U {gihat}) =', str(ig)) | |||||
print('distance:', dhat, '->', dnew, '\n') | |||||
self.__num_updates += 1 | |||||
else: | |||||
if self._verbose >= 2: | |||||
print('I am equal!') | |||||
dhat = dnew | |||||
gnew = gnew_list[idx_min] | |||||
found = True # found better graph. | |||||
return gnew, dhat, found | |||||
def _generate_graph_parallel(self, g_init, fdgs, term3, itr): | |||||
trial = itr | |||||
gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial) | |||||
return trial, gtemp, dnew | |||||
def __do_trial(self, g_init, fdgs, term3, trial): | |||||
# add and delete edges. | |||||
gtemp = g_init.copy() | |||||
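		# derive a per-trial seed (np.random.RandomState needs a value below 2**32) so that
		# trials executed in parallel workers draw different random streams.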
seed = (trial + int(time.time())) % (2 ** 32 - 1) | |||||
rdm_state = np.random.RandomState(seed=seed) | |||||
# which edges to change. | |||||
# @todo: should we use just half of the adjacency matrix for undirected graphs? | |||||
nb_vpairs = nx.number_of_nodes(g_init) * (nx.number_of_nodes(g_init) - 1) | |||||
# @todo: what if fdgs is bigger than nb_vpairs? | |||||
idx_change = rdm_state.randint(0, high=nb_vpairs, size=(fdgs if | |||||
fdgs < nb_vpairs else nb_vpairs)) | |||||
# print(idx_change) | |||||
for item in idx_change: | |||||
node1 = int(item / (nx.number_of_nodes(g_init) - 1)) | |||||
node2 = (item - node1 * (nx.number_of_nodes(g_init) - 1)) | |||||
if node2 >= node1: # skip the self pair. | |||||
node2 += 1 | |||||
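			# e.g. with 4 nodes, item ranges over [0, 11]: item = 5 gives node1 = 1 and raw node2 = 2,
			# shifted to node2 = 3 because column indices skip the diagonal (self) pair.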
# @todo: is the randomness correct? | |||||
if not gtemp.has_edge(node1, node2): | |||||
gtemp.add_edge(node1, node2) | |||||
else: | |||||
gtemp.remove_edge(node1, node2) | |||||
# compute new distances. | |||||
kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, self._dataset.graphs, **self._kernel_options) | |||||
kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options) | |||||
if self._kernel_options['normalize']: | |||||
kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))] # normalize | |||||
kernel_gtmp = 1 | |||||
# @todo: not correct kernel value | |||||
gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0) | |||||
gram_with_gtmp = np.concatenate((np.array([[kernel_gtmp] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1) | |||||
dnew = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True) | |||||
return gtemp, dnew | |||||
def get_results(self): | |||||
results = {} | |||||
results['runtime_precompute_gm'] = self.__runtime_precompute_gm | |||||
results['runtime_generate_preimage'] = self.__runtime_generate_preimage | |||||
results['runtime_total'] = self.__runtime_total | |||||
results['k_dis_dataset'] = self.__k_dis_dataset | |||||
results['k_dis_preimage'] = self.__k_dis_preimage | |||||
results['itrs'] = self.__itrs | |||||
results['num_updates'] = self.__num_updates | |||||
return results | |||||
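	# Termination: stop when the time limit expires, when the total number of iterations reaches
	# max_itrs, or when r (the count of consecutive iterations without an update of the preimage,
	# reset in run() whenever a better graph is found) reaches r_max.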
def __termination_criterion_met(self, timer, itr, r): | |||||
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): | |||||
# if self.__state == AlgorithmState.TERMINATED: | |||||
# self.__state = AlgorithmState.INITIALIZED | |||||
return True | |||||
return (r >= self.__r_max if self.__r_max >= 0 else False) | |||||
# return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) | |||||
@property | |||||
def preimage(self): | |||||
return self.__preimage | |||||
@property | |||||
def best_from_dataset(self): | |||||
return self.__best_from_dataset | |||||
@property | |||||
def gram_matrix_unnorm(self): | |||||
return self.__gram_matrix_unnorm | |||||
@gram_matrix_unnorm.setter | |||||
def gram_matrix_unnorm(self, value): | |||||
		self.__gram_matrix_unnorm = value
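
# Minimal usage sketch (illustration only, not part of the original module). It assumes that
# the 'PathUpToH' kernel is reachable through get_graph_kernel_by_name and that extra keys in
# kernel_options (e.g. 'name', 'parallel', 'n_jobs', 'normalize', 'verbose') are tolerated by
# the kernel class, as in the demo scripts above.
if __name__ == '__main__':
	from gklearn.utils import Dataset

	# load the predefined "MUTAG" dataset.
	dataset = Dataset()
	dataset.load_predefined_dataset('MUTAG')

	kernel_options = {'name': 'PathUpToH', # selects the kernel class in run().
		'depth': 3,
		'k_func': 'MinMax',
		'compute_method': 'trie',
		'parallel': 'imap_unordered',
		'n_jobs': multiprocessing.cpu_count(),
		'normalize': True,
		'verbose': 0}

	rpg = RandomPreimageGenerator(dataset=dataset)
	rpg.set_options(kernel_options=kernel_options,
		k=5, # number of nearest neighbors of phi kept from the dataset.
		r_max=10, # maximum number of consecutive iterations without update.
		l=500, # number of random candidates generated per graph and iteration.
		alphas=None, # None -> equal weights over the whole dataset.
		parallel=True,
		verbose=2)
	rpg.run()
	print(rpg.get_results())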
@@ -256,7 +256,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 		if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
 			gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm)
-	# write result summary for each letter.
+	# write result summary for each class.
 	if save_results:
 		sod_sm_mean = np.mean(sod_sm_list)
 		sod_gm_mean = np.mean(sod_gm_list)
@@ -387,15 +387,15 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
 	return np.sqrt(term1 - term2 + term3)


-def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
+def compute_k_dis(idx_g, idx_gi, alphas, Kmatrix, term3=0, withterm3=True):
 	term1 = Kmatrix[idx_g, idx_g]
 	term2 = 0
-	for i, a in enumerate(alpha):
+	for i, a in enumerate(alphas):
 		term2 += a * Kmatrix[idx_g, idx_gi[i]]
 	term2 *= 2
 	if withterm3 == False:
-		for i1, a1 in enumerate(alpha):
-			for i2, a2 in enumerate(alpha):
+		for i1, a1 in enumerate(alphas):
+			for i2, a2 in enumerate(alphas):
 				term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
 	return np.sqrt(term1 - term2 + term3)
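For reference, `compute_k_dis` evaluates the distance in kernel space between the image of the graph indexed by `idx_g` and the weighted mean of the images of the graphs indexed by `idx_gi`: $d = \sqrt{k(g, g) - 2\sum_i \alpha_i k(g, g_i) + \sum_{i,j} \alpha_i \alpha_j k(g_i, g_j)}$. The last sum does not depend on $g$, so callers that score many candidates pass it in once as `term3` with `withterm3=True`; otherwise it is accumulated from `Kmatrix` inside the function.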
@@ -300,7 +300,13 @@ def get_edge_labels(Gn, edge_label):


 def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}):
-	if name == 'ShortestPath':
+	if name == 'Marginalized':
+		from gklearn.kernels import Marginalized
+		graph_kernel = Marginalized(node_labels=node_labels,
+									edge_labels=edge_labels,
+									ds_infos=ds_infos,
+									**kernel_options)
+	elif name == 'ShortestPath':
 		from gklearn.kernels import ShortestPath
 		graph_kernel = ShortestPath(node_labels=node_labels,
 									node_attrs=node_attrs,