@@ -0,0 +1,73 @@ | |||
# -*- coding: utf-8 -*- | |||
"""compute_distance_in_kernel_space.ipynb | |||
Automatically generated by Colaboratory. | |||
Original file is located at | |||
https://colab.research.google.com/drive/17tZP6IrineQmzo9sRtfZOnHpHx6HnlMA | |||
**This script demonstrates how to compute distance in kernel space between the image of a graph and the mean of images of a group of graphs.** | |||
--- | |||
**0. Install `graphkit-learn`.** | |||
""" | |||
"""**1. Get dataset.**""" | |||
from gklearn.utils import Dataset | |||
# Predefined dataset name, use dataset "MUTAG". | |||
ds_name = 'MUTAG' | |||
# Initialize a Dataset. | |||
dataset = Dataset() | |||
# Load predefined dataset "MUTAG". | |||
dataset.load_predefined_dataset(ds_name) | |||
len(dataset.graphs) | |||
"""**2. Compute graph kernel.**""" | |||
from gklearn.kernels import PathUpToH | |||
import multiprocessing | |||
# Initialize parameters for graph kernel computation.
kernel_options = {'depth': 3, | |||
'k_func': 'MinMax', | |||
'compute_method': 'trie' | |||
} | |||
# Initialize graph kernel. | |||
graph_kernel = PathUpToH(node_labels=dataset.node_labels, # list of node label names. | |||
edge_labels=dataset.edge_labels, # list of edge label names. | |||
ds_infos=dataset.get_dataset_infos(keys=['directed']), # dataset information required for computation. | |||
**kernel_options, # options for computation. | |||
) | |||
# Compute Gram matrix. | |||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, | |||
parallel='imap_unordered', # or None. | |||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||
normalize=True, # whether to return normalized Gram matrix. | |||
verbose=2 # whether to print out results. | |||
) | |||
"""**3. Compute distance in kernel space.** | |||
Given a dataset $\mathcal{G}_N$, compute the distance in kernel space between the image of $G_1 \in \mathcal{G}_N$ and the mean of images of $\mathcal{G}_k \subset \mathcal{G}_N$. | |||
""" | |||
from gklearn.preimage.utils import compute_k_dis | |||
# Index of $G_1$. | |||
idx_1 = 10 | |||
# Indices of graphs in $\mathcal{G}_k$. | |||
idx_graphs = range(0, 10) | |||
# Compute the distance in kernel space. | |||
dis_k = compute_k_dis(idx_1, | |||
idx_graphs, | |||
[1 / len(idx_graphs)] * len(idx_graphs), # weights for images of graphs in $\mathcal{G}_k$; all equal when computing the mean. | |||
gram_matrix, # Gram matrix of all graphs.
withterm3=False | |||
) | |||
print(dis_k) |
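# For reference, the kernel-space distance between the image of $G_1$ and the
# weighted mean of the images of $\mathcal{G}_k$ can also be written out
# directly from the Gram matrix (a hedged sketch, independent of how
# `compute_k_dis` treats its individual terms):
#     d^2 = k(G_1, G_1) - 2 * sum_i w_i k(G_1, G_i) + sum_{i,j} w_i w_j k(G_i, G_j).
import numpy as np
w = np.full(len(idx_graphs), 1 / len(idx_graphs))  # equal weights, as above.
idx = list(idx_graphs)
d2 = (gram_matrix[idx_1, idx_1]
      - 2 * w @ gram_matrix[idx_1, idx]
      + w @ gram_matrix[np.ix_(idx, idx)] @ w)
print(np.sqrt(max(d2, 0)))  # clamp tiny negative values caused by rounding.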
@@ -0,0 +1,87 @@ | |||
# -*- coding: utf-8 -*- | |||
"""compute_graph_kernel.ipynb | |||
Automatically generated by Colaboratory. | |||
Original file is located at | |||
https://colab.research.google.com/drive/17Q2QCl9CAtDweGF8LiWnWoN2laeJqT0u | |||
**This script demonstrates how to compute a graph kernel.** | |||
--- | |||
**0. Install `graphkit-learn`.** | |||
""" | |||
"""**1. Get dataset.**""" | |||
from gklearn.utils import Dataset | |||
# Predefined dataset name, use dataset "MUTAG". | |||
ds_name = 'MUTAG' | |||
# Initialize a Dataset. | |||
dataset = Dataset() | |||
# Load predefined dataset "MUTAG". | |||
dataset.load_predefined_dataset(ds_name) | |||
len(dataset.graphs) | |||
"""**2. Compute graph kernel.**""" | |||
from gklearn.kernels import PathUpToH | |||
# Initialize parameters for graph kernel computation.
kernel_options = {'depth': 3, | |||
'k_func': 'MinMax', | |||
'compute_method': 'trie' | |||
} | |||
# Initialize graph kernel. | |||
graph_kernel = PathUpToH(node_labels=dataset.node_labels, # list of node label names. | |||
edge_labels=dataset.edge_labels, # list of edge label names. | |||
ds_infos=dataset.get_dataset_infos(keys=['directed']), # dataset information required for computation. | |||
**kernel_options, # options for computation. | |||
) | |||
print('done.') | |||
import multiprocessing | |||
import matplotlib.pyplot as plt | |||
# Compute Gram matrix. | |||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, | |||
parallel='imap_unordered', # or None. | |||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||
normalize=True, # whether to return normalized Gram matrix. | |||
verbose=2 # whether to print out results. | |||
) | |||
# Print results. | |||
print() | |||
print(gram_matrix) | |||
print(run_time) | |||
plt.imshow(gram_matrix) | |||
import multiprocessing | |||
# Compute graph kernels between a graph and a list of graphs.
kernel_list, run_time = graph_kernel.compute(dataset.graphs, # a list of graphs. | |||
dataset.graphs[0], # a single graph. | |||
parallel='imap_unordered', # or None. | |||
n_jobs=multiprocessing.cpu_count(), # number of parallel jobs. | |||
verbose=2 # whether to print out results. | |||
) | |||
# Print results. | |||
print() | |||
print(kernel_list) | |||
print(run_time) | |||
import multiprocessing | |||
# Compute a graph kernel between two graphs.
kernel, run_time = graph_kernel.compute(dataset.graphs[0], # a single graph. | |||
dataset.graphs[1], # another single graph. | |||
verbose=2 # whether to print out results. | |||
) | |||
# Print results. | |||
print() | |||
print(kernel) | |||
print(run_time) |
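# An optional sketch: persist the Gram matrix so it can be reused without
# recomputation. The .npz file name and keys below are illustrative (the
# experiment scripts in this repository use a similar np.savez convention).
import numpy as np
np.savez('gram_matrix.' + ds_name + '.PathUpToH', gram_matrix=gram_matrix)
gm_loaded = np.load('gram_matrix.' + ds_name + '.PathUpToH.npz')['gram_matrix']
print(gm_loaded.shape)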
@@ -0,0 +1,115 @@ | |||
# -*- coding: utf-8 -*- | |||
"""example_median_preimege_generator.ipynb | |||
Automatically generated by Colaboratory. | |||
Original file is located at | |||
https://colab.research.google.com/drive/1PIDvHOcmiLEQ5Np3bgBDdu0kLOquOMQK | |||
**This script demonstrates how to generate a graph preimage using Boria's method.** | |||
--- | |||
""" | |||
"""**1. Get dataset.**""" | |||
from gklearn.utils import Dataset, split_dataset_by_target | |||
# Predefined dataset name, use dataset "MAO". | |||
ds_name = 'MAO' | |||
# The node/edge labels that will not be used in the computation. | |||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
# Initialize a Dataset. | |||
dataset_all = Dataset() | |||
# Load predefined dataset "MAO". | |||
dataset_all.load_predefined_dataset(ds_name) | |||
# Remove irrelevant labels. | |||
dataset_all.remove_labels(**irrelevant_labels) | |||
# Split the whole dataset according to the classification targets. | |||
datasets = split_dataset_by_target(dataset_all) | |||
# Get the first class of graphs, whose median preimage will be computed. | |||
dataset = datasets[0] | |||
len(dataset.graphs) | |||
"""**2. Set parameters.**""" | |||
import multiprocessing | |||
# Parameters for MedianPreimageGenerator (our method). | |||
mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means using all graphs in the median set when fitting.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs. | |||
'ds_name': ds_name, # name of the dataset. | |||
'parallel': True, # whether the parallel scheme is to be used. | |||
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit. | |||
'max_itrs': 100, # maximum iteration limit to optimize edit costs. If set to 0 then no limit. | |||
'max_itrs_without_update': 3, # If the edit costs are not updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number. | |||
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number. | |||
'verbose': 2 # whether to print out results. | |||
} | |||
# Parameters for graph kernel computation. | |||
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h. | |||
'depth': 9, | |||
'k_func': 'MinMax', | |||
'compute_method': 'trie', | |||
'parallel': 'imap_unordered', # or None | |||
'n_jobs': multiprocessing.cpu_count(), | |||
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs. | |||
'verbose': 2 # whether to print out results. | |||
} | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc. | |||
'initial_solutions': 10, # when greater than 1, the method is considered mIPFP.
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'ratio_runs_from_initial_solutions': 1, | |||
'threads': multiprocessing.cpu_count(), # number of parallel threads. Has no effect if mpg_options['parallel'] = False.
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
# Parameters for MedianGraphEstimator (Boria's method). | |||
mge_options = {'init_type': 'MEDOID', # how to initialize the median (i.e., compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initializations when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit. | |||
'verbose': 2, # whether to print out results. | |||
'refine': False # whether to refine the final SODs or not. | |||
} | |||
print('done.') | |||
"""**3. Run median preimage generator.**""" | |||
from gklearn.preimage import MedianPreimageGenerator | |||
# Create median preimage generator instance. | |||
mpg = MedianPreimageGenerator() | |||
# Add dataset. | |||
mpg.dataset = dataset | |||
# Set parameters. | |||
mpg.set_options(**mpg_options.copy()) | |||
mpg.kernel_options = kernel_options.copy() | |||
mpg.ged_options = ged_options.copy() | |||
mpg.mge_options = mge_options.copy() | |||
# Run. | |||
mpg.run() | |||
"""**4. Get results.**""" | |||
# Get results. | |||
import pprint | |||
pp = pprint.PrettyPrinter(indent=4) # pretty print | |||
results = mpg.get_results() | |||
pp.pprint(results) | |||
# Draw generated graphs. | |||
def draw_graph(graph): | |||
import matplotlib.pyplot as plt | |||
import networkx as nx | |||
plt.figure() | |||
pos = nx.spring_layout(graph) | |||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True) | |||
plt.show() | |||
plt.clf() | |||
plt.close() | |||
draw_graph(mpg.set_median) | |||
draw_graph(mpg.gen_median) |
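# Optionally, write the two medians to GXL files, reusing the saveGXL call that
# the experiment scripts in this repository use (a sketch; the output file
# names are arbitrary).
from gklearn.utils.graphfiles import saveGXL
saveGXL(mpg.set_median, 'set_median.gxl', method='default',
        node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
        node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
saveGXL(mpg.gen_median, 'gen_median.gxl', method='default',
        node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
        node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)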
@@ -8,8 +8,9 @@ __author__ = "Linlin Jia" | |||
__date__ = "November 2018" | |||
from gklearn.kernels.graph_kernel import GraphKernel | |||
from gklearn.kernels.structural_sp import StructuralSP | |||
from gklearn.kernels.marginalized import Marginalized | |||
from gklearn.kernels.shortest_path import ShortestPath | |||
from gklearn.kernels.structural_sp import StructuralSP | |||
from gklearn.kernels.path_up_to_h import PathUpToH | |||
from gklearn.kernels.treelet import Treelet | |||
from gklearn.kernels.weisfeiler_lehman import WeisfeilerLehman, WLSubtree |
@@ -0,0 +1,338 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Wed Jun 3 22:22:57 2020 | |||
@author: ljia | |||
@references: | |||
[1] H. Kashima, K. Tsuda, and A. Inokuchi. Marginalized kernels between | |||
labeled graphs. In Proceedings of the 20th International Conference on | |||
Machine Learning, Washington, DC, United States, 2003. | |||
[2] Pierre Mahé, Nobuhisa Ueda, Tatsuya Akutsu, Jean-Luc Perret, and | |||
Jean-Philippe Vert. Extensions of marginalized graph kernels. In | |||
Proceedings of the twenty-first international conference on Machine | |||
learning, page 70. ACM, 2004. | |||
""" | |||
import sys | |||
from multiprocessing import Pool | |||
from tqdm import tqdm | |||
import numpy as np | |||
import networkx as nx | |||
from gklearn.utils import SpecialLabel | |||
from gklearn.utils.kernels import deltakernel | |||
from gklearn.utils.parallel import parallel_gm, parallel_me | |||
from gklearn.utils.utils import untotterTransformation | |||
from gklearn.kernels import GraphKernel | |||
class Marginalized(GraphKernel): | |||
def __init__(self, **kwargs): | |||
GraphKernel.__init__(self) | |||
self.__node_labels = kwargs.get('node_labels', []) | |||
self.__edge_labels = kwargs.get('edge_labels', []) | |||
self.__p_quit = kwargs.get('p_quit', 0.5) | |||
self.__n_iteration = kwargs.get('n_iteration', 10) | |||
self.__remove_totters = kwargs.get('remove_totters', False) | |||
self.__ds_infos = kwargs.get('ds_infos', {}) | |||
self.__n_iteration = int(self.__n_iteration) | |||
def _compute_gm_series(self): | |||
self.__add_dummy_labels(self._graphs) | |||
if self.__remove_totters: | |||
if self._verbose >= 2: | |||
iterator = tqdm(self._graphs, desc='removing tottering', file=sys.stdout) | |||
else: | |||
iterator = self._graphs | |||
# @todo: this may not work. | |||
self._graphs = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
from itertools import combinations_with_replacement | |||
itr = combinations_with_replacement(range(0, len(self._graphs)), 2) | |||
if self._verbose >= 2: | |||
iterator = tqdm(itr, desc='calculating kernels', file=sys.stdout) | |||
else: | |||
iterator = itr | |||
for i, j in iterator: | |||
kernel = self.__kernel_do(self._graphs[i], self._graphs[j]) | |||
gram_matrix[i][j] = kernel | |||
gram_matrix[j][i] = kernel # @todo: no directed graph considered? | |||
return gram_matrix | |||
def _compute_gm_imap_unordered(self): | |||
self.__add_dummy_labels(self._graphs) | |||
if self.__remove_totters: | |||
pool = Pool(self._n_jobs) | |||
itr = range(0, len(self._graphs)) | |||
if len(self._graphs) < 100 * self._n_jobs: | |||
chunksize = int(len(self._graphs) / self._n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
remove_fun = self._wrapper_untotter | |||
if self._verbose >= 2: | |||
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize), | |||
desc='removing tottering', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(remove_fun, itr, chunksize) | |||
for i, g in iterator: | |||
self._graphs[i] = g | |||
pool.close() | |||
pool.join() | |||
# compute Gram matrix. | |||
gram_matrix = np.zeros((len(self._graphs), len(self._graphs))) | |||
def init_worker(gn_toshare): | |||
global G_gn | |||
G_gn = gn_toshare | |||
do_fun = self._wrapper_kernel_do | |||
parallel_gm(do_fun, gram_matrix, self._graphs, init_worker=init_worker, | |||
glbv=(self._graphs,), n_jobs=self._n_jobs, verbose=self._verbose) | |||
return gram_matrix | |||
def _compute_kernel_list_series(self, g1, g_list): | |||
self.__add_dummy_labels(g_list + [g1]) | |||
if self.__remove_totters: | |||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||
if self._verbose >= 2: | |||
iterator = tqdm(g_list, desc='removing tottering', file=sys.stdout) | |||
else: | |||
iterator = g_list | |||
# @todo: this may not work. | |||
g_list = [untotterTransformation(G, self.__node_label, self.__edge_label) for G in iterator] | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
if self._verbose >= 2: | |||
iterator = tqdm(range(len(g_list)), desc='calculating kernels', file=sys.stdout) | |||
else: | |||
iterator = range(len(g_list)) | |||
for i in iterator: | |||
kernel = self.__kernel_do(g1, g_list[i]) | |||
kernel_list[i] = kernel | |||
return kernel_list | |||
def _compute_kernel_list_imap_unordered(self, g1, g_list): | |||
self.__add_dummy_labels(g_list + [g1]) | |||
if self.__remove_totters: | |||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||
pool = Pool(self._n_jobs) | |||
itr = range(0, len(g_list)) | |||
if len(g_list) < 100 * self._n_jobs: | |||
chunksize = int(len(g_list) / self._n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
remove_fun = self._wrapper_untotter | |||
if self._verbose >= 2: | |||
iterator = tqdm(pool.imap_unordered(remove_fun, itr, chunksize), | |||
desc='removing tottering', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(remove_fun, itr, chunksize) | |||
for i, g in iterator: | |||
g_list[i] = g | |||
pool.close() | |||
pool.join() | |||
# compute kernel list. | |||
kernel_list = [None] * len(g_list) | |||
def init_worker(g1_toshare, g_list_toshare): | |||
global G_g1, G_g_list | |||
G_g1 = g1_toshare | |||
G_g_list = g_list_toshare | |||
do_fun = self._wrapper_kernel_list_do | |||
def func_assign(result, var_to_assign): | |||
var_to_assign[result[0]] = result[1] | |||
itr = range(len(g_list)) | |||
len_itr = len(g_list) | |||
parallel_me(do_fun, func_assign, kernel_list, itr, len_itr=len_itr, | |||
init_worker=init_worker, glbv=(g1, g_list), method='imap_unordered', | |||
n_jobs=self._n_jobs, itr_desc='calculating kernels', verbose=self._verbose) | |||
return kernel_list | |||
def _wrapper_kernel_list_do(self, itr): | |||
return itr, self.__kernel_do(G_g1, G_g_list[itr]) | |||
def _compute_single_kernel_series(self, g1, g2): | |||
self.__add_dummy_labels([g1] + [g2]) | |||
if self.__remove_totters: | |||
g1 = untotterTransformation(g1, self.__node_label, self.__edge_label) # @todo: this may not work. | |||
g2 = untotterTransformation(g2, self.__node_label, self.__edge_label) | |||
kernel = self.__kernel_do(g1, g2) | |||
return kernel | |||
def __kernel_do(self, g1, g2): | |||
"""Calculate marginalized graph kernel between 2 graphs. | |||
Parameters | |||
---------- | |||
g1, g2 : NetworkX graphs | |||
2 graphs between which the kernel is calculated. | |||
Return | |||
------ | |||
kernel : float | |||
Marginalized kernel between 2 graphs. | |||
""" | |||
# init parameters | |||
kernel = 0 | |||
num_nodes_G1 = nx.number_of_nodes(g1) | |||
num_nodes_G2 = nx.number_of_nodes(g2) | |||
# the initial probability distribution in the random walks generating step | |||
# (uniform distribution over |G|) | |||
p_init_G1 = 1 / num_nodes_G1 | |||
p_init_G2 = 1 / num_nodes_G2 | |||
q = self.__p_quit * self.__p_quit | |||
r1 = q | |||
# # initial R_inf | |||
# # matrix to save all the R_inf for all pairs of nodes | |||
# R_inf = np.zeros([num_nodes_G1, num_nodes_G2]) | |||
# | |||
# # calculate R_inf with a simple interative method | |||
# for i in range(1, n_iteration): | |||
# R_inf_new = np.zeros([num_nodes_G1, num_nodes_G2]) | |||
# R_inf_new.fill(r1) | |||
# | |||
# # calculate R_inf for each pair of nodes | |||
# for node1 in g1.nodes(data=True): | |||
# neighbor_n1 = g1[node1[0]] | |||
# # the transition probability distribution in the random walks | |||
# # generating step (uniform distribution over the vertices adjacent | |||
# # to the current vertex) | |||
# if len(neighbor_n1) > 0: | |||
# p_trans_n1 = (1 - p_quit) / len(neighbor_n1) | |||
# for node2 in g2.nodes(data=True): | |||
# neighbor_n2 = g2[node2[0]] | |||
# if len(neighbor_n2) > 0: | |||
# p_trans_n2 = (1 - p_quit) / len(neighbor_n2) | |||
# | |||
# for neighbor1 in neighbor_n1: | |||
# for neighbor2 in neighbor_n2: | |||
# t = p_trans_n1 * p_trans_n2 * \ | |||
# deltakernel(g1.node[neighbor1][node_label], | |||
# g2.node[neighbor2][node_label]) * \ | |||
# deltakernel( | |||
# neighbor_n1[neighbor1][edge_label], | |||
# neighbor_n2[neighbor2][edge_label]) | |||
# | |||
# R_inf_new[node1[0]][node2[0]] += t * R_inf[neighbor1][ | |||
# neighbor2] # ref [1] equation (8) | |||
# R_inf[:] = R_inf_new | |||
# | |||
# # add elements of R_inf up and calculate kernel | |||
# for node1 in g1.nodes(data=True): | |||
# for node2 in g2.nodes(data=True): | |||
# s = p_init_G1 * p_init_G2 * deltakernel( | |||
# node1[1][node_label], node2[1][node_label]) | |||
# kernel += s * R_inf[node1[0]][node2[0]] # ref [1] equation (6) | |||
R_inf = {} # dict to save all the R_inf for all pairs of nodes | |||
# initial R_inf, the 1st iteration. | |||
for node1 in g1.nodes(): | |||
for node2 in g2.nodes(): | |||
# R_inf[(node1[0], node2[0])] = r1 | |||
if len(g1[node1]) > 0: | |||
if len(g2[node2]) > 0: | |||
R_inf[(node1, node2)] = r1 | |||
else: | |||
R_inf[(node1, node2)] = self.__p_quit | |||
else: | |||
if len(g2[node2]) > 0: | |||
R_inf[(node1, node2)] = self.__p_quit | |||
else: | |||
R_inf[(node1, node2)] = 1 | |||
# compute all transition probabilities first.
t_dict = {} | |||
if self.__n_iteration > 1: | |||
for node1 in g1.nodes(): | |||
neighbor_n1 = g1[node1] | |||
# the transition probability distribution in the random walks | |||
# generating step (uniform distribution over the vertices adjacent | |||
# to the current vertex) | |||
if len(neighbor_n1) > 0: | |||
p_trans_n1 = (1 - self.__p_quit) / len(neighbor_n1) | |||
for node2 in g2.nodes(): | |||
neighbor_n2 = g2[node2] | |||
if len(neighbor_n2) > 0: | |||
p_trans_n2 = (1 - self.__p_quit) / len(neighbor_n2) | |||
for neighbor1 in neighbor_n1: | |||
for neighbor2 in neighbor_n2: | |||
t_dict[(node1, node2, neighbor1, neighbor2)] = \ | |||
p_trans_n1 * p_trans_n2 * \ | |||
deltakernel(tuple(g1.nodes[neighbor1][nl] for nl in self.__node_labels), tuple(g2.nodes[neighbor2][nl] for nl in self.__node_labels)) * \ | |||
deltakernel(tuple(neighbor_n1[neighbor1][el] for el in self.__edge_labels), tuple(neighbor_n2[neighbor2][el] for el in self.__edge_labels)) | |||
# calculate R_inf with a simple iterative method
for i in range(2, self.__n_iteration + 1): | |||
R_inf_old = R_inf.copy() | |||
# calculate R_inf for each pair of nodes | |||
for node1 in g1.nodes(): | |||
neighbor_n1 = g1[node1] | |||
# the transition probability distribution in the random walks | |||
# generating step (uniform distribution over the vertices adjacent | |||
# to the current vertex) | |||
if len(neighbor_n1) > 0: | |||
for node2 in g2.nodes(): | |||
neighbor_n2 = g2[node2] | |||
if len(neighbor_n2) > 0: | |||
R_inf[(node1, node2)] = r1 | |||
for neighbor1 in neighbor_n1: | |||
for neighbor2 in neighbor_n2: | |||
R_inf[(node1, node2)] += \ | |||
(t_dict[(node1, node2, neighbor1, neighbor2)] * \ | |||
R_inf_old[(neighbor1, neighbor2)]) # ref [1] equation (8) | |||
# add elements of R_inf up and calculate kernel | |||
for (n1, n2), value in R_inf.items(): | |||
s = p_init_G1 * p_init_G2 * deltakernel(tuple(g1.nodes[n1][nl] for nl in self.__node_labels), tuple(g2.nodes[n2][nl] for nl in self.__node_labels)) | |||
kernel += s * value # ref [1] equation (6) | |||
return kernel | |||
def _wrapper_kernel_do(self, itr): | |||
i = itr[0] | |||
j = itr[1] | |||
return i, j, self.__kernel_do(G_gn[i], G_gn[j]) | |||
def _wrapper_untotter(self, i): | |||
return i, untotterTransformation(self._graphs[i], self.__node_label, self.__edge_label) # @todo: this may not work. | |||
def __add_dummy_labels(self, Gn): | |||
if len(self.__node_labels) == 0 or (len(self.__node_labels) == 1 and self.__node_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_node_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self.__node_labels = [SpecialLabel.DUMMY] | |||
if len(self.__edge_labels) == 0 or (len(self.__edge_labels) == 1 and self.__edge_labels[0] == SpecialLabel.DUMMY): | |||
for i in range(len(Gn)): | |||
nx.set_edge_attributes(Gn[i], '0', SpecialLabel.DUMMY) | |||
self.__edge_labels = [SpecialLabel.DUMMY] |
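if __name__ == '__main__':
	# A minimal usage sketch (mirroring the PathUpToH example scripts in this
	# repository); the parameter values below are illustrative only.
	import multiprocessing
	from gklearn.utils import Dataset
	dataset = Dataset()
	dataset.load_predefined_dataset('MUTAG')
	graph_kernel = Marginalized(node_labels=dataset.node_labels,
	                            edge_labels=dataset.edge_labels,
	                            ds_infos=dataset.get_dataset_infos(keys=['directed']),
	                            p_quit=0.5,
	                            n_iteration=7,
	                            remove_totters=False)
	gram_matrix, run_time = graph_kernel.compute(dataset.graphs,
	                                             parallel='imap_unordered',
	                                             n_jobs=multiprocessing.cpu_count(),
	                                             normalize=True,
	                                             verbose=2)
	print(gram_matrix.shape, run_time)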
@@ -195,7 +195,7 @@ class Treelet(GraphKernel): | |||
Return | |||
------ | |||
kernel : float | |||
Treelet Kernel between 2 graphs. | |||
Treelet kernel between 2 graphs. | |||
""" | |||
keys = set(canonkey1.keys()) & set(canonkey2.keys()) # find same canonical keys in both graphs | |||
vector1 = np.array([(canonkey1[key] if (key in canonkey1.keys()) else 0) for key in keys]) | |||
@@ -12,4 +12,6 @@ __date__ = "March 2020" | |||
from gklearn.preimage.preimage_generator import PreimageGenerator | |||
from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator | |||
from gklearn.preimage.random_preimage_generator import RandomPreimageGenerator | |||
from gklearn.preimage.kernel_knn_cv import kernel_knn_cv | |||
from gklearn.preimage.generate_random_preimages_by_class import generate_random_preimages_by_class |
@@ -0,0 +1,262 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Jun 1 11:37:57 2020 | |||
@author: ljia | |||
""" | |||
import multiprocessing | |||
import numpy as np | |||
import networkx as nx | |||
import os | |||
from gklearn.utils.graphfiles import saveGXL | |||
from gklearn.preimage import RandomPreimageGenerator | |||
from gklearn.utils import Dataset | |||
dir_root = '../results/xp_random_preimage_generation/' | |||
def xp_random_preimage_generation(kernel_name): | |||
""" | |||
Experiment similar to the one in Bakir's paper. A test to check whether the RandomPreimageGenerator class works correctly.
Returns | |||
------- | |||
None. | |||
""" | |||
alpha1_list = np.linspace(0, 1, 11) | |||
k_dis_datasets = [] | |||
k_dis_preimages = [] | |||
preimages = [] | |||
bests_from_dataset = [] | |||
for alpha1 in alpha1_list: | |||
print('alpha1 =', alpha1, ':\n') | |||
# set parameters. | |||
ds_name = 'MUTAG' | |||
rpg_options = {'k': 5, | |||
'r_max': 10, # | |||
'l': 500, | |||
'alphas': None, | |||
'parallel': True, | |||
'verbose': 2} | |||
if kernel_name == 'PathUpToH': | |||
kernel_options = {'name': 'PathUpToH', | |||
'depth': 2, # | |||
'k_func': 'MinMax', # | |||
'compute_method': 'trie', | |||
'parallel': 'imap_unordered', | |||
# 'parallel': None, | |||
'n_jobs': multiprocessing.cpu_count(), | |||
'normalize': True, | |||
'verbose': 0} | |||
elif kernel_name == 'Marginalized': | |||
kernel_options = {'name': 'Marginalized', | |||
'p_quit': 0.8, # | |||
'n_iteration': 7, # | |||
'remove_totters': False, | |||
'parallel': 'imap_unordered', | |||
# 'parallel': None, | |||
'n_jobs': multiprocessing.cpu_count(), | |||
'normalize': True, | |||
'verbose': 0} | |||
edge_required = True | |||
irrelevant_labels = {'edge_labels': ['label_0']} | |||
cut_range = None | |||
# create/get Gram matrix. | |||
dir_save = dir_root + ds_name + '.' + kernel_options['name'] + '/' | |||
if not os.path.exists(dir_save): | |||
os.makedirs(dir_save) | |||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) | |||
if gmfile_exist: | |||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||
gram_matrix_unnorm = gmfile['gram_matrix_unnorm'] | |||
time_precompute_gm = gmfile['run_time'] | |||
# 1. get dataset. | |||
print('1. getting dataset...') | |||
dataset_all = Dataset() | |||
dataset_all.load_predefined_dataset(ds_name) | |||
dataset_all.trim_dataset(edge_required=edge_required) | |||
if irrelevant_labels is not None: | |||
dataset_all.remove_labels(**irrelevant_labels) | |||
if cut_range is not None: | |||
dataset_all.cut_graphs(cut_range) | |||
# # add two "random" graphs. | |||
# g1 = nx.Graph() | |||
# g1.add_nodes_from(range(0, 16), label_0='0') | |||
# g1.add_nodes_from(range(16, 25), label_0='1') | |||
# g1.add_node(25, label_0='2') | |||
# g1.add_nodes_from([26, 27], label_0='3') | |||
# g1.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 12), (5, 0), (4, 9), (12, 3), (10, 13), (13, 14), (14, 15), (15, 8), (0, 16), (1, 17), (2, 18), (12, 19), (11, 20), (13, 21), (15, 22), (7, 23), (6, 24), (14, 25), (25, 26), (25, 27)]) | |||
# g2 = nx.Graph() | |||
# g2.add_nodes_from(range(0, 12), label_0='0') | |||
# g2.add_nodes_from(range(12, 19), label_0='1') | |||
# g2.add_nodes_from([19, 20, 21], label_0='2') | |||
# g2.add_nodes_from([22, 23], label_0='3') | |||
# g2.add_edges_from([(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 19), (19, 7), (7, 8), (8, 9), (9, 10), (10, 11), (11, 20), (20, 7), (5, 0), (4, 8), (0, 12), (1, 13), (2, 14), (9, 15), (10, 16), (11, 17), (6, 18), (3, 21), (21, 22), (21, 23)]) | |||
# dataset_all.load_graphs([g1, g2] + dataset_all.graphs, targets=None) | |||
# 2. initialize rpg and setting parameters. | |||
print('2. initializing rpg and setting parameters...') | |||
# nb_graphs = len(dataset_all.graphs) - 2 | |||
# rpg_options['alphas'] = [alpha1, 1 - alpha1] + [0] * nb_graphs | |||
nb_graphs = len(dataset_all.graphs) | |||
alphas = [0] * nb_graphs | |||
alphas[1] = alpha1 | |||
alphas[6] = 1 - alpha1 | |||
rpg_options['alphas'] = alphas | |||
if gmfile_exist: | |||
rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm | |||
rpg_options['runtime_precompute_gm'] = time_precompute_gm | |||
rpg = RandomPreimageGenerator() | |||
rpg.dataset = dataset_all | |||
rpg.set_options(**rpg_options.copy()) | |||
rpg.kernel_options = kernel_options.copy() | |||
# 3. compute preimage. | |||
print('3. computing preimage...') | |||
rpg.run() | |||
results = rpg.get_results() | |||
k_dis_datasets.append(results['k_dis_dataset']) | |||
k_dis_preimages.append(results['k_dis_preimage']) | |||
bests_from_dataset.append(rpg.best_from_dataset) | |||
preimages.append(rpg.preimage) | |||
# 4. save results. | |||
# write Gram matrices to file. | |||
if not gmfile_exist: | |||
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm=rpg.gram_matrix_unnorm, run_time=results['runtime_precompute_gm']) | |||
# save graphs. | |||
fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||
node_labels=dataset_all.node_labels, edge_labels=dataset_all.edge_labels, | |||
node_attrs=dataset_all.node_attrs, edge_attrs=dataset_all.edge_attrs) | |||
fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||
saveGXL(rpg.preimage, fn_preimage + '.gxl', method='default', | |||
node_labels=dataset_all.node_labels, edge_labels=dataset_all.edge_labels, | |||
node_attrs=dataset_all.node_attrs, edge_attrs=dataset_all.edge_attrs) | |||
# draw graphs. | |||
__draw_graph(rpg.best_from_dataset, fn_best_dataset) | |||
__draw_graph(rpg.preimage, fn_preimage) | |||
# save distances. | |||
np.savez(dir_save + 'distances.' + ds_name + '.' + kernel_options['name'], k_dis_datasets=k_dis_datasets, k_dis_preimages=k_dis_preimages) | |||
# plot results figure. | |||
__plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save) | |||
print('\ncomplete.\n') | |||
return k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages | |||
def __draw_graph(graph, file_prefix): | |||
# import matplotlib | |||
# matplotlib.use('agg') | |||
import matplotlib.pyplot as plt | |||
plt.figure() | |||
pos = nx.spring_layout(graph) | |||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'label_0'), font_color='w', width=3, with_labels=True) | |||
plt.savefig(file_prefix + '.eps', format='eps', dpi=300) | |||
# plt.show() | |||
plt.clf() | |||
plt.close() | |||
def __plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save): | |||
import matplotlib.pyplot as plt | |||
fig, ax = plt.subplots(1, 1, figsize=(7, 4.5)) | |||
ind = np.arange(len(alpha1_list)) # the x locations for the groups | |||
width = 0.35 # the width of the bars: can also be len(x) sequence | |||
ax.bar(ind, k_dis_preimages, width, label='Reconstructed pre-image', zorder=3, color='#133AAC') | |||
ax.set_xlabel(r'$\alpha \in [0,1]$') | |||
ax.set_ylabel(r'$d(g_i,g^\star(\alpha))$') | |||
#ax.set_title('Runtime of the shortest path kernel on all datasets') | |||
plt.xticks(ind, [str(i)[0:3] for i in alpha1_list]) | |||
#ax.set_yticks(np.logspace(-16, -3, num=20, base=10)) | |||
#ax.set_ylim(bottom=1e-15) | |||
ax.grid(axis='y', zorder=0) | |||
ax.spines['top'].set_visible(False) | |||
ax.spines['bottom'].set_visible(False) | |||
ax.spines['left'].set_visible(False) | |||
ax.spines['right'].set_visible(False) | |||
ax.xaxis.set_ticks_position('none') | |||
ax.plot(ind, k_dis_datasets, 'b.-', label=r'Nearest neighbor in $D_N$', color='orange', zorder=4) | |||
ax.yaxis.set_ticks_position('none') | |||
fig.subplots_adjust(bottom=.2) | |||
fig.legend(loc='lower center', ncol=2, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) | |||
plt.savefig(dir_save + 'distances in kernel space.eps', format='eps', dpi=300, | |||
transparent=True, bbox_inches='tight') | |||
plt.show() | |||
plt.clf() | |||
plt.close() | |||
if __name__ == '__main__': | |||
# kernel_name = 'PathUpToH' | |||
kernel_name = 'Marginalized' | |||
k_dis_datasets, k_dis_preimages, bests_from_dataset, preimages = xp_random_preimage_generation(kernel_name) | |||
# # save graphs. | |||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||
# for i, alpha1 in enumerate(np.linspace(0, 1, 11)): | |||
# fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||
# saveGXL(bests_from_dataset[i], fn_best_dataset + '.gxl', method='default', | |||
# node_labels=['label_0'], edge_labels=[], | |||
# node_attrs=[], edge_attrs=[]) | |||
# fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||
# saveGXL(preimages[i], fn_preimage + '.gxl', method='default', | |||
# node_labels=['label_0'], edge_labels=[], | |||
# node_attrs=[], edge_attrs=[]) | |||
# # draw graphs. | |||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||
# for i, alpha1 in enumerate(np.linspace(0, 1, 11)): | |||
# fn_best_dataset = dir_save + 'g_best_dataset.' + 'alpha1_' + str(alpha1)[0:3] | |||
# __draw_graph(bests_from_dataset[i], fn_best_dataset) | |||
# fn_preimage = dir_save + 'g_preimage.' + 'alpha1_' + str(alpha1)[0:3] | |||
# __draw_graph(preimages[i], fn_preimage) | |||
# # plot results figure. | |||
# alpha1_list = np.linspace(0, 1, 11) | |||
# dir_save = dir_root + 'MUTAG.PathUpToH/' | |||
# __plot_results(alpha1_list, k_dis_datasets, k_dis_preimages, dir_save) | |||
# k_dis_datasets = [0.0, | |||
# 0.08882515554098754, | |||
# 0.17765031108197632, | |||
# 0.2664754666229643, | |||
# 0.35530062216395264, | |||
# 0.44412577770494066, | |||
# 0.35530062216395236, | |||
# 0.2664754666229643, | |||
# 0.17765031108197632, | |||
# 0.08882515554098878, | |||
# 0.0] | |||
# k_dis_preimages = [0.0, | |||
# 0.08882515554098754, | |||
# 0.17765031108197632, | |||
# 0.2664754666229643, | |||
# 0.35530062216395264, | |||
# 0.44412577770494066, | |||
# 0.35530062216395236, | |||
# 0.2664754666229643, | |||
# 0.17765031108197632, | |||
# 0.08882515554098878, | |||
# 0.0] |
@@ -0,0 +1,176 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri Jun 12 10:30:17 2020 | |||
@author: ljia | |||
This script constructs simple preimages to test preimage methods and find bugs and shortcomings in them. | |||
""" | |||
def xp_simple_preimage(): | |||
import numpy as np | |||
"""**1. Get dataset.**""" | |||
from gklearn.utils import Dataset, split_dataset_by_target | |||
# Predefined dataset name, use dataset "MAO". | |||
ds_name = 'MAO' | |||
# The node/edge labels that will not be used in the computation. | |||
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} | |||
# Initialize a Dataset. | |||
dataset_all = Dataset() | |||
# Load predefined dataset "MAO". | |||
dataset_all.load_predefined_dataset(ds_name) | |||
# Remove irrelevant labels. | |||
dataset_all.remove_labels(**irrelevant_labels) | |||
# Split the whole dataset according to the classification targets. | |||
datasets = split_dataset_by_target(dataset_all) | |||
# Get the first class of graphs, whose median preimage will be computed. | |||
dataset = datasets[0] | |||
len(dataset.graphs) | |||
"""**2. Set parameters.**""" | |||
import multiprocessing | |||
# Parameters for MedianPreimageGenerator (our method). | |||
mpg_options = {'fit_method': 'k-graphs', # how to fit edit costs. "k-graphs" means using all graphs in the median set when fitting.
'init_ecc': [4, 4, 2, 1, 1, 1], # initial edit costs. | |||
'ds_name': ds_name, # name of the dataset. | |||
'parallel': True, # whether the parallel scheme is to be used. | |||
'time_limit_in_sec': 0, # maximum time limit to compute the preimage. If set to 0 then no limit. | |||
'max_itrs': 10, # maximum iteration limit to optimize edit costs. If set to 0 then no limit. | |||
'max_itrs_without_update': 3, # If the edit costs are not updated for more than this number of iterations, the optimization stops.
'epsilon_residual': 0.01, # In optimization, the residual is only considered changed if the change is bigger than this number. | |||
'epsilon_ec': 0.1, # In optimization, the edit costs are only considered changed if the changes are bigger than this number. | |||
'verbose': 2 # whether to print out results. | |||
} | |||
# Parameters for graph kernel computation. | |||
kernel_options = {'name': 'PathUpToH', # use path kernel up to length h. | |||
'depth': 9, | |||
'k_func': 'MinMax', | |||
'compute_method': 'trie', | |||
'parallel': 'imap_unordered', # or None | |||
'n_jobs': multiprocessing.cpu_count(), | |||
'normalize': True, # whether to use normalized Gram matrix to optimize edit costs. | |||
'verbose': 2 # whether to print out results. | |||
} | |||
# Parameters for GED computation. | |||
ged_options = {'method': 'IPFP', # use the IPFP heuristic.
'initialization_method': 'RANDOM', # or 'NODE', etc. | |||
'initial_solutions': 10, # when greater than 1, the method is considered mIPFP.
'edit_cost': 'CONSTANT', # use CONSTANT cost. | |||
'attr_distance': 'euclidean', # the distance between non-symbolic node/edge labels is computed by euclidean distance. | |||
'ratio_runs_from_initial_solutions': 1, | |||
'threads': multiprocessing.cpu_count(), # number of parallel threads. Has no effect if mpg_options['parallel'] = False.
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES' | |||
} | |||
# Parameters for MedianGraphEstimator (Boria's method). | |||
mge_options = {'init_type': 'MEDOID', # how to initialize the median (i.e., compute the set-median). "MEDOID" uses the graph with the smallest SOD.
'random_inits': 10, # number of random initializations when 'init_type' = 'RANDOM'.
'time_limit': 600, # maximum time limit to compute the generalized median. If set to 0 then no limit. | |||
'verbose': 2, # whether to print out results. | |||
'refine': False # whether to refine the final SODs or not. | |||
} | |||
print('done.') | |||
"""**3. Compute the Gram matrix and distance matrix.**""" | |||
from gklearn.utils.utils import get_graph_kernel_by_name | |||
# Get a graph kernel instance. | |||
graph_kernel = get_graph_kernel_by_name(kernel_options['name'], | |||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs, | |||
ds_infos=dataset.get_dataset_infos(keys=['directed']), | |||
kernel_options=kernel_options) | |||
# Compute Gram matrix. | |||
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options) | |||
# Compute distance matrix. | |||
from gklearn.utils import compute_distance_matrix | |||
dis_mat, _, _, _ = compute_distance_matrix(gram_matrix) | |||
print('done.') | |||
"""**4. Find the candidate graph.**""" | |||
from gklearn.preimage.utils import compute_k_dis | |||
# Number of the nearest neighbors. | |||
k_neighbors = 10 | |||
# For each graph G in the dataset, compute the distance between its image \Phi(G) and the mean of its neighbors' images.
dis_min = np.inf # the minimum distance between possible \Phi(G) and the mean of its neighbors. | |||
for idx, G in enumerate(dataset.graphs): | |||
# Find the k nearest neighbors of G. | |||
dis_list = dis_mat[idx] # distances between \Phi(G) and the image of each graph.
idx_sort = np.argsort(dis_list) # sort distances and get the sorted indices. | |||
idx_nearest = idx_sort[1:k_neighbors+1] # indices of the k-nearest neighbors. | |||
dis_k_nearest = [dis_list[i] for i in idx_nearest] # k-nearest distances, excluding the zero distance to G itself.
G_k_nearest = [dataset.graphs[i] for i in idx_nearest] # k-nearest neighbors. | |||
# Compute the distance between \Phi(G) and the mean of its neighbors. | |||
dis_tmp = compute_k_dis(idx, # the index of G in Gram matrix. | |||
idx_nearest, # the indices of the neighbors | |||
[1 / k_neighbors] * k_neighbors, # coefficients for neighbors. | |||
gram_matrix, | |||
withterm3=False) | |||
# Check if the new distance is smaller.
if dis_tmp < dis_min: | |||
dis_min = dis_tmp | |||
G_cand = G | |||
G_neighbors = G_k_nearest | |||
print('The minimum distance is', dis_min) | |||
"""**5. Run median preimage generator.**""" | |||
from gklearn.preimage import MedianPreimageGenerator | |||
# Set the dataset as the k-nearest neighbors. | |||
dataset.load_graphs(G_neighbors) | |||
# Create median preimage generator instance. | |||
mpg = MedianPreimageGenerator() | |||
# Add dataset. | |||
mpg.dataset = dataset | |||
# Set parameters. | |||
mpg.set_options(**mpg_options.copy()) | |||
mpg.kernel_options = kernel_options.copy() | |||
mpg.ged_options = ged_options.copy() | |||
mpg.mge_options = mge_options.copy() | |||
# Run. | |||
mpg.run() | |||
"""**4. Get results.**""" | |||
# Get results. | |||
import pprint | |||
pp = pprint.PrettyPrinter(indent=4) # pretty print | |||
results = mpg.get_results() | |||
pp.pprint(results) | |||
draw_graph(mpg.set_median) | |||
draw_graph(mpg.gen_median) | |||
draw_graph(G_cand) | |||
# Draw generated graphs. | |||
def draw_graph(graph): | |||
import matplotlib.pyplot as plt | |||
import networkx as nx | |||
plt.figure() | |||
pos = nx.spring_layout(graph) | |||
nx.draw(graph, pos, node_size=500, labels=nx.get_node_attributes(graph, 'atom_symbol'), font_color='w', width=3, with_labels=True) | |||
plt.show() | |||
plt.clf() | |||
plt.close() | |||
if __name__ == '__main__': | |||
xp_simple_preimage() |
@@ -0,0 +1,188 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Mon Jun 1 17:02:51 2020 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
from gklearn.utils import Dataset | |||
import csv | |||
import os | |||
import os.path | |||
from gklearn.preimage import RandomPreimageGenerator | |||
from gklearn.utils import split_dataset_by_target | |||
from gklearn.utils.graphfiles import saveGXL | |||
def generate_random_preimages_by_class(ds_name, rpg_options, kernel_options, save_results=True, save_preimages=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False, cut_range=None): | |||
# 1. get dataset. | |||
print('1. getting dataset...') | |||
dataset_all = Dataset() | |||
dataset_all.load_predefined_dataset(ds_name) | |||
dataset_all.trim_dataset(edge_required=edge_required) | |||
if irrelevant_labels is not None: | |||
dataset_all.remove_labels(**irrelevant_labels) | |||
if cut_range is not None: | |||
dataset_all.cut_graphs(cut_range) | |||
datasets = split_dataset_by_target(dataset_all) | |||
if save_results: | |||
# create result files. | |||
print('creating output files...') | |||
fn_output_detail, fn_output_summary = __init_output_file_preimage(ds_name, kernel_options['name'], dir_save) | |||
dis_k_dataset_list = [] | |||
dis_k_preimage_list = [] | |||
time_precompute_gm_list = [] | |||
time_generate_list = [] | |||
time_total_list = [] | |||
itrs_list = [] | |||
num_updates_list = [] | |||
if load_gm == 'auto': | |||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) | |||
if gmfile_exist: | |||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']] | |||
time_precompute_gm_list = gmfile['run_time_list'].tolist() | |||
else: | |||
gram_matrix_unnorm_list = [] | |||
time_precompute_gm_list = [] | |||
elif not load_gm: | |||
gram_matrix_unnorm_list = [] | |||
time_precompute_gm_list = [] | |||
else: | |||
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' | |||
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe. | |||
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']] | |||
time_precompute_gm_list = gmfile['run_time_list'].tolist() | |||
print('start generating preimages for each class of targets...')
idx_offset = 0 | |||
for idx, dataset in enumerate(datasets): | |||
target = dataset.targets[0] | |||
print('\ntarget =', target, '\n') | |||
# if target != 1: | |||
# continue | |||
num_graphs = len(dataset.graphs) | |||
if num_graphs < 2: | |||
print('\nnumber of graphs = ', num_graphs, ', skip.\n') | |||
idx_offset += 1 | |||
continue | |||
# 2. set parameters. | |||
print('2. initializing rpg and setting parameters...')
if load_gm: | |||
if gmfile_exist: | |||
rpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx - idx_offset] | |||
rpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx - idx_offset] | |||
rpg = RandomPreimageGenerator() | |||
rpg.dataset = dataset | |||
rpg.set_options(**rpg_options.copy()) | |||
rpg.kernel_options = kernel_options.copy() | |||
# 3. compute preimage. | |||
print('3. computing preimage...') | |||
rpg.run() | |||
results = rpg.get_results() | |||
# 4. save results (and median graphs). | |||
print('4. saving results (and preimages)...') | |||
# write result detail. | |||
if save_results: | |||
print('writing results to files...') | |||
f_detail = open(dir_save + fn_output_detail, 'a') | |||
csv.writer(f_detail).writerow([ds_name, kernel_options['name'], | |||
num_graphs, target, 1, | |||
results['k_dis_dataset'], results['k_dis_preimage'], | |||
results['runtime_precompute_gm'], | |||
results['runtime_generate_preimage'], results['runtime_total'], | |||
results['itrs'], results['num_updates']]) | |||
f_detail.close() | |||
# compute result summary. | |||
dis_k_dataset_list.append(results['k_dis_dataset']) | |||
dis_k_preimage_list.append(results['k_dis_preimage']) | |||
time_precompute_gm_list.append(results['runtime_precompute_gm']) | |||
time_generate_list.append(results['runtime_generate_preimage']) | |||
time_total_list.append(results['runtime_total']) | |||
itrs_list.append(results['itrs']) | |||
num_updates_list.append(results['num_updates']) | |||
# write result summary for this target.
f_summary = open(dir_save + fn_output_summary, 'a') | |||
csv.writer(f_summary).writerow([ds_name, kernel_options['name'], | |||
num_graphs, target, | |||
results['k_dis_dataset'], results['k_dis_preimage'], | |||
results['runtime_precompute_gm'], | |||
results['runtime_generate_preimage'], results['runtime_total'], | |||
results['itrs'], results['num_updates']]) | |||
f_summary.close() | |||
# save preimage graphs.
if save_preimages: | |||
if not os.path.exists(dir_save + 'preimages/'): | |||
os.makedirs(dir_save + 'preimages/') | |||
print('Saving preimages to files...') | |||
fn_best_dataset = dir_save + 'preimages/g_best_dataset.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
saveGXL(rpg.best_from_dataset, fn_best_dataset + '.gxl', method='default', | |||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) | |||
fn_preimage = dir_save + 'preimages/g_preimage.' + 'nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1) | |||
saveGXL(rpg.preimage, fn_preimage + '.gxl', method='default', | |||
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels, | |||
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs) | |||
if (load_gm == 'auto' and not gmfile_exist) or not load_gm: | |||
gram_matrix_unnorm_list.append(rpg.gram_matrix_unnorm) | |||
# write result summary for each class. | |||
if save_results: | |||
dis_k_dataset_mean = np.mean(dis_k_dataset_list) | |||
dis_k_preimage_mean = np.mean(dis_k_preimage_list) | |||
time_precompute_gm_mean = np.mean(time_precompute_gm_list) | |||
time_generate_mean = np.mean(time_generate_list) | |||
time_total_mean = np.mean(time_total_list) | |||
itrs_mean = np.mean(itrs_list) | |||
num_updates_mean = np.mean(num_updates_list) | |||
f_summary = open(dir_save + fn_output_summary, 'a') | |||
csv.writer(f_summary).writerow([ds_name, kernel_options['name'], | |||
num_graphs, 'all', | |||
dis_k_dataset_mean, dis_k_preimage_mean, | |||
time_precompute_gm_mean, | |||
time_generate_mean, time_total_mean, itrs_mean, | |||
num_updates_mean]) | |||
f_summary.close() | |||
# write Gram matrices to file. | |||
if (load_gm == 'auto' and not gmfile_exist) or not load_gm: | |||
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list) | |||
print('\ncomplete.\n') | |||
def __init_output_file_preimage(ds_name, gkernel, dir_output): | |||
if not os.path.exists(dir_output): | |||
os.makedirs(dir_output) | |||
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv' | |||
f_detail = open(dir_output + fn_output_detail, 'a') | |||
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'num graphs', | |||
'target', 'repeat', 'dis_k best from dataset', 'dis_k preimage', | |||
'time precompute gm', 'time generate preimage', 'time total', | |||
'itrs', 'num updates']) | |||
f_detail.close() | |||
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv' | |||
f_summary = open(dir_output + fn_output_summary, 'a') | |||
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'num graphs', | |||
'target', 'dis_k best from dataset', 'dis_k preimage', | |||
'time precompute gm', 'time generate preimage', 'time total', | |||
'itrs', 'num updates']) | |||
f_summary.close() | |||
return fn_output_detail, fn_output_summary |
@@ -19,7 +19,7 @@ from gklearn.ged.median import constant_node_costs,mge_options_to_string | |||
from gklearn.gedlib import librariesImport, gedlibpy | |||
from gklearn.utils import Timer | |||
from gklearn.utils.utils import get_graph_kernel_by_name | |||
# from gklearn.utils.dataset import Dataset | |||
class MedianPreimageGenerator(PreimageGenerator): | |||
@@ -127,8 +127,7 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
# 3. compute set median and gen median using optimized edit costs. | |||
if self._verbose >= 2: | |||
print('\nstart computing set median and gen median using optimized edit costs...\n') | |||
# group_fnames = [Gn[g].graph['filename'] for g in group_min] | |||
self.__generate_preimage_iam() | |||
self.__gmg_bcu() | |||
end_generate_preimage = time.time() | |||
self.__runtime_generate_preimage = end_generate_preimage - end_optimize_ec | |||
self.__runtime_total = end_generate_preimage - start | |||
@@ -140,19 +139,13 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
# 4. compute kernel distances to the true median. | |||
if self._verbose >= 2: | |||
print('\nstart computing distances to true median....\n') | |||
# Gn_median = [Gn[g].copy() for g in group_min] | |||
self.__compute_distances_to_true_median() | |||
# dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min = | |||
# idx_dis_k_gi_min = group_min[idx_dis_k_gi_min] | |||
# print('index min dis_k_gi:', idx_dis_k_gi_min) | |||
# print('sod_sm:', sod_sm) | |||
# print('sod_gm:', sod_gm) | |||
# 5. print out results. | |||
if self._verbose: | |||
print() | |||
print('================================================================================') | |||
print('Finished generalization of preimages.') | |||
print('Finished generation of preimages.') | |||
print('--------------------------------------------------------------------------------') | |||
print('The optimized edit cost constants:', self.__edit_cost_constants) | |||
print('SOD of the set median:', self.__sod_set_median) | |||
@@ -169,11 +162,6 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
print('Is optimization of edit costs converged:', self.__converged) | |||
print('================================================================================') | |||
print() | |||
# collect return values. | |||
# return (sod_sm, sod_gm), \ | |||
# (dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, idx_dis_k_gi_min), \ | |||
# (time_fitting, time_generating) | |||
def get_results(self): | |||
@@ -203,20 +191,22 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
""" | |||
if self.__fit_method == 'random': # random | |||
if self.__ged_options['edit_cost'] == 'LETTER': | |||
self.__edit_cost_constants = random.sample(range(1, 10), 3) | |||
self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants] | |||
self.__edit_cost_constants = random.sample(range(1, 1000), 3) | |||
self.__edit_cost_constants = [item * 0.001 for item in self.__edit_cost_constants] | |||
elif self.__ged_options['edit_cost'] == 'LETTER2': | |||
random.seed(time.time()) | |||
self.__edit_cost_constants = random.sample(range(1, 10), 5) | |||
# self.__edit_cost_constants = [item * 0.1 for item in self.__edit_cost_constants] | |||
self.__edit_cost_constants = random.sample(range(1, 1000), 5) | |||
self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] | |||
elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': | |||
self.__edit_cost_constants = random.sample(range(1, 10), 6) | |||
self.__edit_cost_constants = random.sample(range(1, 1000), 6) | |||
self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] | |||
if self._dataset.node_attrs == []: | |||
self.__edit_cost_constants[2] = 0 | |||
if self._dataset.edge_attrs == []: | |||
self.__edit_cost_constants[5] = 0 | |||
else: | |||
self.__edit_cost_constants = random.sample(range(1, 10), 6) | |||
self.__edit_cost_constants = random.sample(range(1, 1000), 6) | |||
self.__edit_cost_constants = [item * 0.01 for item in self.__edit_cost_constants] | |||
if self._verbose >= 2: | |||
print('edit cost constants used:', self.__edit_cost_constants) | |||
elif self.__fit_method == 'expert': # expert | |||
@@ -861,7 +851,15 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
print() | |||
def __generate_preimage_iam(self): | |||
def __gmg_bcu(self): | |||
""" | |||
The local search algorithm based on block coordinate update (BCU) for estimating a generalized median graph (GMG). | |||
Returns | |||
------- | |||
None. | |||
""" | |||
# Set up the ged environment. | |||
ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible. | |||
# gedlibpy.restart_env() | |||
@@ -910,24 +908,24 @@ class MedianPreimageGenerator(PreimageGenerator): | |||
# compute distance in kernel space for set median. | |||
kernels_to_sm, _ = self._graph_kernel.compute(self.__set_median, self._dataset.graphs, **self._kernel_options) | |||
kernel_sm, _ = self._graph_kernel.compute(self.__set_median, self.__set_median, **self._kernel_options) | |||
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize | |||
if self._kernel_options['normalize']: | |||
kernels_to_sm = [kernels_to_sm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_sm) for i in range(len(kernels_to_sm))] # normalize | |||
kernel_sm = 1 | |||
# @todo: not correct kernel value | |||
gram_with_sm = np.concatenate((np.array([kernels_to_sm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) | |||
gram_with_sm = np.concatenate((np.array([[1] + kernels_to_sm]).T, gram_with_sm), axis=1) | |||
gram_with_sm = np.concatenate((np.array([[kernel_sm] + kernels_to_sm]).T, gram_with_sm), axis=1) | |||
self.__k_dis_set_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), | |||
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs), | |||
gram_with_sm, withterm3=False) | |||
# print(gen_median.nodes(data=True)) | |||
# print(gen_median.edges(data=True)) | |||
# print(set_median.nodes(data=True)) | |||
# print(set_median.edges(data=True)) | |||
# compute distance in kernel space for generalized median. | |||
kernels_to_gm, _ = self._graph_kernel.compute(self.__gen_median, self._dataset.graphs, **self._kernel_options) | |||
kernel_gm, _ = self._graph_kernel.compute(self.__gen_median, self.__gen_median, **self._kernel_options) | |||
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize | |||
if self._kernel_options['normalize']: | |||
kernels_to_gm = [kernels_to_gm[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gm) for i in range(len(kernels_to_gm))] # normalize | |||
kernel_gm = 1 | |||
gram_with_gm = np.concatenate((np.array([kernels_to_gm]), np.copy(self._graph_kernel.gram_matrix)), axis=0) | |||
gram_with_gm = np.concatenate((np.array([[1] + kernels_to_gm]).T, gram_with_gm), axis=1) | |||
gram_with_gm = np.concatenate((np.array([[kernel_gm] + kernels_to_gm]).T, gram_with_gm), axis=1) | |||
self.__k_dis_gen_median = compute_k_dis(0, range(1, 1+len(self._dataset.graphs)), | |||
[1 / len(self._dataset.graphs)] * len(self._dataset.graphs), | |||
gram_with_gm, withterm3=False) | |||
@@ -0,0 +1,389 @@ | |||
#!/usr/bin/env python3 | |||
# -*- coding: utf-8 -*- | |||
""" | |||
Created on Fri May 29 14:29:52 2020 | |||
@author: ljia | |||
""" | |||
import numpy as np | |||
import time | |||
import sys | |||
from tqdm import tqdm | |||
import multiprocessing | |||
import networkx as nx | |||
from multiprocessing import Pool | |||
from functools import partial | |||
from gklearn.preimage import PreimageGenerator | |||
from gklearn.preimage.utils import compute_k_dis | |||
from gklearn.utils import Timer | |||
from gklearn.utils.utils import get_graph_kernel_by_name | |||
# from gklearn.utils.dataset import Dataset | |||
class RandomPreimageGenerator(PreimageGenerator): | |||
def __init__(self, dataset=None): | |||
PreimageGenerator.__init__(self, dataset=dataset) | |||
# arguments to set. | |||
self.__k = 5 # number of nearest neighbors of phi in D_N. | |||
self.__r_max = 10 # maximum number of iterations. | |||
self.__l = 500 # number of candidate graphs generated for each graph in D_k U {g_i_hat}.
self.__alphas = None # weights of linear combinations of points in kernel space. | |||
self.__parallel = True | |||
self.__n_jobs = multiprocessing.cpu_count() | |||
self.__time_limit_in_sec = 0 | |||
self.__max_itrs = 20 | |||
# values to compute. | |||
self.__runtime_generate_preimage = None | |||
self.__runtime_total = None | |||
self.__preimage = None | |||
self.__best_from_dataset = None | |||
self.__k_dis_preimage = None | |||
self.__k_dis_dataset = None | |||
self.__itrs = 0 | |||
self.__converged = False # @todo | |||
self.__num_updates = 0 | |||
# values that can be set or to be computed. | |||
self.__gram_matrix_unnorm = None | |||
self.__runtime_precompute_gm = None | |||
def set_options(self, **kwargs): | |||
self._kernel_options = kwargs.get('kernel_options', {}) | |||
self._graph_kernel = kwargs.get('graph_kernel', None) | |||
self._verbose = kwargs.get('verbose', 2) | |||
self.__k = kwargs.get('k', 5) | |||
self.__r_max = kwargs.get('r_max', 10) | |||
self.__l = kwargs.get('l', 500) | |||
self.__alphas = kwargs.get('alphas', None) | |||
self.__parallel = kwargs.get('parallel', True) | |||
self.__n_jobs = kwargs.get('n_jobs', multiprocessing.cpu_count()) | |||
self.__time_limit_in_sec = kwargs.get('time_limit_in_sec', 0) | |||
self.__max_itrs = kwargs.get('max_itrs', 20) | |||
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) | |||
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) | |||
def run(self): | |||
self._graph_kernel = get_graph_kernel_by_name(self._kernel_options['name'], | |||
node_labels=self._dataset.node_labels, | |||
edge_labels=self._dataset.edge_labels, | |||
node_attrs=self._dataset.node_attrs, | |||
edge_attrs=self._dataset.edge_attrs, | |||
ds_infos=self._dataset.get_dataset_infos(keys=['directed']), | |||
kernel_options=self._kernel_options) | |||
# record start time. | |||
start = time.time() | |||
# 1. precompute gram matrix. | |||
if self.__gram_matrix_unnorm is None: | |||
gram_matrix, run_time = self._graph_kernel.compute(self._dataset.graphs, **self._kernel_options) | |||
self.__gram_matrix_unnorm = self._graph_kernel.gram_matrix_unnorm | |||
end_precompute_gm = time.time() | |||
self.__runtime_precompute_gm = end_precompute_gm - start | |||
else: | |||
if self.__runtime_precompute_gm is None: | |||
raise Exception('Parameter "runtime_precompute_gm" must be given when using a pre-computed Gram matrix.')
self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm | |||
if self._kernel_options['normalize']: | |||
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm)) | |||
else: | |||
self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm) | |||
end_precompute_gm = time.time() | |||
start -= self.__runtime_precompute_gm | |||
# 2. compute k nearest neighbors of phi in D_N. | |||
if self._verbose >= 2: | |||
print('\nstart computing k nearest neighbors of phi in D_N...\n') | |||
D_N = self._dataset.graphs | |||
if self.__alphas is None: | |||
self.__alphas = [1 / len(D_N)] * len(D_N) | |||
k_dis_list = [] # distance between g_star and each graph. | |||
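# d(g, phi) = sqrt(k(g, g) - 2 * sum_i alpha_i * k(g, g_i) + sum_{i,j} alpha_i * alpha_j * k(g_i, g_j)),
# where phi is the weighted mean of the images of the graphs in D_N. The last
# term does not depend on g, so it is computed once (term3) and reused below.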
term3 = 0 | |||
for i1, a1 in enumerate(self.__alphas): | |||
for i2, a2 in enumerate(self.__alphas): | |||
term3 += a1 * a2 * self._graph_kernel.gram_matrix[i1, i2] | |||
for idx in range(len(D_N)): | |||
k_dis_list.append(compute_k_dis(idx, range(0, len(D_N)), self.__alphas, self._graph_kernel.gram_matrix, term3=term3, withterm3=True)) | |||
# sort. | |||
sort_idx = np.argsort(k_dis_list) | |||
dis_gs = [k_dis_list[idis] for idis in sort_idx[0:self.__k]] # the k shortest distances. | |||
nb_best = len(np.argwhere(np.array(dis_gs) == dis_gs[0]).flatten().tolist()) # number of graphs tied at the smallest distance.
g0hat_list = [D_N[idx].copy() for idx in sort_idx[0:nb_best]] # the nearest neighbors of phi in D_N | |||
self.__best_from_dataset = g0hat_list[0] # get the first best graph if there are multiple.
self.__k_dis_dataset = dis_gs[0] | |||
if self.__k_dis_dataset == 0: # get the exact pre-image. | |||
end_generate_preimage = time.time() | |||
self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm | |||
self.__runtime_total = end_generate_preimage - start | |||
self.__preimage = self.__best_from_dataset.copy() | |||
self.__k_dis_preimage = self.__k_dis_dataset | |||
if self._verbose: | |||
print() | |||
print('=============================================================================') | |||
print('An exact pre-image was found in the input dataset.')
print('-----------------------------------------------------------------------------') | |||
print('Distance in kernel space for the best graph from dataset and for preimage:', self.__k_dis_dataset) | |||
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) | |||
print('Time to generate pre-images:', self.__runtime_generate_preimage) | |||
print('Total time:', self.__runtime_total) | |||
print('=============================================================================') | |||
print() | |||
return | |||
dhat = dis_gs[0] # the nearest distance | |||
Gk = [D_N[ig].copy() for ig in sort_idx[0:self.__k]] # the k nearest neighbors | |||
Gs_nearest = [nx.convert_node_labels_to_integers(g) for g in Gk] # [g.copy() for g in Gk] | |||
# 3. start iterations. | |||
if self._verbose >= 2: | |||
print('starting iterations...') | |||
gihat_list = [] | |||
dihat_list = [] | |||
r = 0 | |||
dis_of_each_itr = [dhat] | |||
if self.__parallel: | |||
self._kernel_options['parallel'] = None | |||
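# Disable the kernel's own parallelism when candidates are generated in parallel,
# so that kernel computations inside the worker processes do not spawn nested pools.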
self.__itrs = 0 | |||
self.__num_updates = 0 | |||
timer = Timer(self.__time_limit_in_sec) | |||
while not self.__termination_criterion_met(timer, self.__itrs, r): | |||
print('\n- r =', r) | |||
found = False | |||
dis_bests = dis_gs + dihat_list | |||
# compute numbers of edges to be inserted/deleted. | |||
# @todo what if the log is negative? how to choose alpha (scalar)?
fdgs_list = np.array(dis_bests)
if np.min(fdgs_list) < 1: # in case the log is negative.
fdgs_list /= np.min(fdgs_list)
fdgs_list = np.array([int(item) for item in np.ceil(np.log(fdgs_list))]) # keep as an array so that "fdgs_list += 1" below broadcasts.
if np.min(fdgs_list) < 1: # in case the log is smaller than 1.
fdgs_list = fdgs_list + 1
# expand the number of modifications to increase the possibility.
nb_vpairs_list = [nx.number_of_nodes(g) * (nx.number_of_nodes(g) - 1) for g in (Gs_nearest + gihat_list)] | |||
nb_vpairs_min = np.min(nb_vpairs_list) | |||
idx_fdgs_max = np.argmax(fdgs_list) | |||
fdgs_max_old = fdgs_list[idx_fdgs_max] | |||
fdgs_max = fdgs_max_old | |||
nb_modif = 1 | |||
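# nb_modif accumulates the binomial coefficient C(nb_vpairs_min, fdgs_max), i.e.
# the number of distinct sets of node pairs that can be toggled; fdgs_max is then
# enlarged until at least __l distinct candidate graphs are possible.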
for idx, nb in enumerate(range(nb_vpairs_min, nb_vpairs_min - fdgs_max, -1)): | |||
nb_modif *= nb / (fdgs_max - idx) | |||
while fdgs_max < nb_vpairs_min and nb_modif < self.__l: | |||
fdgs_max += 1 | |||
nb_modif *= (nb_vpairs_min - fdgs_max + 1) / fdgs_max | |||
nb_increase = int(fdgs_max - fdgs_max_old) | |||
if nb_increase > 0: | |||
fdgs_list += 1 | |||
for ig, gs in enumerate(Gs_nearest + gihat_list): | |||
if self._verbose >= 2: | |||
print('-- computing', ig + 1, 'graphs out of', len(Gs_nearest) + len(gihat_list)) | |||
gnew, dhat, found = self.__generate_l_graphs(gs, fdgs_list[ig], dhat, ig, found, term3) | |||
if found: | |||
r = 0 | |||
gihat_list = [gnew] | |||
dihat_list = [dhat] | |||
else: | |||
r += 1 | |||
dis_of_each_itr.append(dhat) | |||
self.__itrs += 1 | |||
if self._verbose >= 2: | |||
print('Total number of iterations is', self.__itrs, '.') | |||
print('The preimage is updated', self.__num_updates, 'times.') | |||
print('The shortest distances for previous iterations are', dis_of_each_itr, '.') | |||
# get results and print. | |||
end_generate_preimage = time.time() | |||
self.__runtime_generate_preimage = end_generate_preimage - end_precompute_gm | |||
self.__runtime_total = end_generate_preimage - start | |||
self.__preimage = (g0hat_list[0] if len(gihat_list) == 0 else gihat_list[0]) | |||
self.__k_dis_preimage = dhat | |||
if self._verbose: | |||
print() | |||
print('=============================================================================') | |||
print('Finished generation of preimages.') | |||
print('-----------------------------------------------------------------------------') | |||
print('Distance in kernel space for the best graph from dataset:', self.__k_dis_dataset) | |||
print('Distance in kernel space for the preimage:', self.__k_dis_preimage) | |||
print('Total number of iterations for optimizing:', self.__itrs) | |||
print('Total number of preimage updates:', self.__num_updates)
print('Time to pre-compute Gram matrix:', self.__runtime_precompute_gm) | |||
print('Time to generate pre-images:', self.__runtime_generate_preimage) | |||
print('Total time:', self.__runtime_total) | |||
print('=============================================================================') | |||
print() | |||
def __generate_l_graphs(self, g_init, fdgs, dhat, ig, found, term3): | |||
if self.__parallel: | |||
gnew, dhat, found = self.__generate_l_graphs_parallel(g_init, fdgs, dhat, ig, found, term3) | |||
else: | |||
gnew, dhat, found = self.__generate_l_graphs_series(g_init, fdgs, dhat, ig, found, term3) | |||
return gnew, dhat, found | |||
def __generate_l_graphs_series(self, g_init, fdgs, dhat, ig, found, term3): | |||
gnew = None | |||
updated = False | |||
for trial in range(0, self.__l): | |||
if self._verbose >= 2: | |||
print('---', trial + 1, 'trial out of', self.__l) | |||
gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial) | |||
# get the better graph preimage. | |||
if dnew <= dhat: # @todo: the new distance is smaller or also equal? | |||
if dhat - dnew > 1e-6: | |||
if self._verbose >= 2: | |||
print('trial =', str(trial)) | |||
print('\nI am smaller!') | |||
print('index (as in D_k U {gihat}) =', str(ig))
print('distance:', dhat, '->', dnew) | |||
updated = True | |||
else: | |||
if self._verbose >= 2: | |||
print('I am equal!') | |||
dhat = dnew | |||
gnew = gtemp.copy() | |||
found = True # found better or equally good graph. | |||
if updated: | |||
self.__num_updates += 1 | |||
return gnew, dhat, found | |||
def __generate_l_graphs_parallel(self, g_init, fdgs, dhat, ig, found, term3): | |||
gnew = None | |||
len_itr = self.__l | |||
gnew_list = [None] * len_itr | |||
dnew_list = [None] * len_itr | |||
itr = range(0, len_itr) | |||
n_jobs = multiprocessing.cpu_count() | |||
if len_itr < 100 * n_jobs: | |||
chunksize = int(len_itr / n_jobs) + 1 | |||
else: | |||
chunksize = 100 | |||
do_fun = partial(self._generate_graph_parallel, g_init, fdgs, term3) | |||
pool = Pool(processes=n_jobs) | |||
if self._verbose >= 2: | |||
iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize), | |||
desc='Generating l graphs', file=sys.stdout) | |||
else: | |||
iterator = pool.imap_unordered(do_fun, itr, chunksize) | |||
for idx, gnew, dnew in iterator: | |||
gnew_list[idx] = gnew | |||
dnew_list[idx] = dnew | |||
pool.close() | |||
pool.join() | |||
# check if get the better graph preimage. | |||
idx_min = np.argmin(dnew_list) | |||
dnew = dnew_list[idx_min] | |||
if dnew <= dhat: # @todo: the new distance is smaller or also equal? | |||
if dhat - dnew > 1e-6: # @todo: use a proportion and watch out for 0. | |||
if self._verbose >= 2: | |||
print('I am smaller!') | |||
print('index (as in D_k U {gihat}) =', str(ig)) | |||
print('distance:', dhat, '->', dnew, '\n') | |||
self.__num_updates += 1 | |||
else: | |||
if self._verbose >= 2: | |||
print('I am equal!') | |||
dhat = dnew | |||
gnew = gnew_list[idx_min] | |||
found = True # found better graph. | |||
return gnew, dhat, found | |||
def _generate_graph_parallel(self, g_init, fdgs, term3, itr): | |||
trial = itr | |||
gtemp, dnew = self.__do_trial(g_init, fdgs, term3, trial) | |||
return trial, gtemp, dnew | |||
def __do_trial(self, g_init, fdgs, term3, trial): | |||
# add and delete edges. | |||
gtemp = g_init.copy() | |||
seed = (trial + int(time.time())) % (2 ** 32 - 1) | |||
rdm_state = np.random.RandomState(seed=seed) | |||
# which edges to change. | |||
# @todo: should we use just half of the adjacency matrix for undirected graphs? | |||
nb_vpairs = nx.number_of_nodes(g_init) * (nx.number_of_nodes(g_init) - 1) | |||
# @todo: what if fdgs is bigger than nb_vpairs? | |||
idx_change = rdm_state.randint(0, high=nb_vpairs, size=(fdgs if fdgs < nb_vpairs else nb_vpairs))
# print(idx_change) | |||
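# Each index in [0, n * (n - 1)) encodes an ordered node pair without self-loops:
# node1 = idx // (n - 1), node2 = idx % (n - 1), with node2 shifted up by one when
# node2 >= node1 to skip the diagonal. The corresponding edge is then toggled
# (added if absent, removed if present).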
for item in idx_change: | |||
node1 = int(item / (nx.number_of_nodes(g_init) - 1)) | |||
node2 = (item - node1 * (nx.number_of_nodes(g_init) - 1)) | |||
if node2 >= node1: # skip the self pair. | |||
node2 += 1 | |||
# @todo: is the randomness correct? | |||
if not gtemp.has_edge(node1, node2): | |||
gtemp.add_edge(node1, node2) | |||
else: | |||
gtemp.remove_edge(node1, node2) | |||
# compute new distances. | |||
kernels_to_gtmp, _ = self._graph_kernel.compute(gtemp, self._dataset.graphs, **self._kernel_options) | |||
kernel_gtmp, _ = self._graph_kernel.compute(gtemp, gtemp, **self._kernel_options) | |||
if self._kernel_options['normalize']: | |||
kernels_to_gtmp = [kernels_to_gtmp[i] / np.sqrt(self.__gram_matrix_unnorm[i, i] * kernel_gtmp) for i in range(len(kernels_to_gtmp))] # normalize | |||
kernel_gtmp = 1 | |||
# @todo: not correct kernel value | |||
gram_with_gtmp = np.concatenate((np.array([kernels_to_gtmp]), np.copy(self._graph_kernel.gram_matrix)), axis=0) | |||
gram_with_gtmp = np.concatenate((np.array([[kernel_gtmp] + kernels_to_gtmp]).T, gram_with_gtmp), axis=1) | |||
dnew = compute_k_dis(0, range(1, 1 + len(self._dataset.graphs)), self.__alphas, gram_with_gtmp, term3=term3, withterm3=True) | |||
return gtemp, dnew | |||
def get_results(self): | |||
results = {} | |||
results['runtime_precompute_gm'] = self.__runtime_precompute_gm | |||
results['runtime_generate_preimage'] = self.__runtime_generate_preimage | |||
results['runtime_total'] = self.__runtime_total | |||
results['k_dis_dataset'] = self.__k_dis_dataset | |||
results['k_dis_preimage'] = self.__k_dis_preimage | |||
results['itrs'] = self.__itrs | |||
results['num_updates'] = self.__num_updates | |||
return results | |||
def __termination_criterion_met(self, timer, itr, r): | |||
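# Stop when the time limit has expired, when `itr` reaches `max_itrs` (if
# `max_itrs` >= 0), or when `r` consecutive iterations have passed without an
# improvement (if `r_max` >= 0).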
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): | |||
# if self.__state == AlgorithmState.TERMINATED: | |||
# self.__state = AlgorithmState.INITIALIZED | |||
return True | |||
return (r >= self.__r_max if self.__r_max >= 0 else False) | |||
# return converged or (itrs_without_update > self.__max_itrs_without_update if self.__max_itrs_without_update >= 0 else False) | |||
@property | |||
def preimage(self): | |||
return self.__preimage | |||
@property | |||
def best_from_dataset(self): | |||
return self.__best_from_dataset | |||
@property | |||
def gram_matrix_unnorm(self): | |||
return self.__gram_matrix_unnorm | |||
@gram_matrix_unnorm.setter | |||
def gram_matrix_unnorm(self, value): | |||
self.__gram_matrix_unnorm = value |
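A minimal usage sketch of RandomPreimageGenerator (an illustration under assumptions, not part of the original sources): it assumes `dataset` is a gklearn Dataset that has already been loaded (e.g. the predefined "MUTAG" set) and that the kernel named in `kernel_options` can be resolved by get_graph_kernel_by_name; all parameter values are placeholders.
import multiprocessing
from gklearn.preimage import RandomPreimageGenerator
# Options forwarded to the graph kernel; 'name' and 'normalize' are read in run().
kernel_options = {'name': 'PathUpToH',
                  'depth': 3,
                  'k_func': 'MinMax',
                  'compute_method': 'trie',
                  'parallel': 'imap_unordered',
                  'n_jobs': multiprocessing.cpu_count(),
                  'normalize': True,
                  'verbose': 0}
rpg = RandomPreimageGenerator(dataset=dataset) # `dataset` loaded beforehand.
rpg.set_options(kernel_options=kernel_options,
                k=5, # number of nearest neighbors of phi in D_N.
                r_max=10, # consecutive non-improving iterations allowed.
                l=500, # candidate graphs generated per seed graph.
                alphas=None, # None -> uniform weights, i.e. the mean in kernel space.
                parallel=True,
                verbose=2)
rpg.run()
print(rpg.get_results()) # runtimes, k_dis_dataset, k_dis_preimage, itrs, num_updates.
print(rpg.preimage) # the generated pre-image, a networkx graph.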
@@ -256,7 +256,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged | |||
if (load_gm == 'auto' and not gmfile_exist) or not load_gm: | |||
gram_matrix_unnorm_list.append(mpg.gram_matrix_unnorm) | |||
# write result summary for each class.
if save_results: | |||
sod_sm_mean = np.mean(sod_sm_list) | |||
sod_gm_mean = np.mean(sod_gm_list) | |||
@@ -387,15 +387,15 @@ def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True): | |||
return np.sqrt(term1 - term2 + term3) | |||
def compute_k_dis(idx_g, idx_gi, alphas, Kmatrix, term3=0, withterm3=True):
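"""Compute the distance in kernel space between the image of graph `idx_g` and the weighted sum of the images of the graphs `idx_gi`: sqrt(K[g, g] - 2 * sum_i alphas[i] * K[g, g_i] + sum_{i, j} alphas[i] * alphas[j] * K[g_i, g_j]). When `withterm3` is True, the precomputed third term is taken from `term3`; otherwise it is computed from `Kmatrix`."""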
term1 = Kmatrix[idx_g, idx_g] | |||
term2 = 0 | |||
for i, a in enumerate(alphas):
term2 += a * Kmatrix[idx_g, idx_gi[i]] | |||
term2 *= 2 | |||
if withterm3 == False: | |||
for i1, a1 in enumerate(alphas):
for i2, a2 in enumerate(alphas):
term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]] | |||
return np.sqrt(term1 - term2 + term3) | |||
@@ -300,7 +300,13 @@ def get_edge_labels(Gn, edge_label): | |||
def get_graph_kernel_by_name(name, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None, ds_infos=None, kernel_options={}): | |||
if name == 'Marginalized':
from gklearn.kernels import Marginalized | |||
graph_kernel = Marginalized(node_labels=node_labels, | |||
edge_labels=edge_labels, | |||
ds_infos=ds_infos, | |||
**kernel_options) | |||
elif name == 'ShortestPath': | |||
from gklearn.kernels import ShortestPath | |||
graph_kernel = ShortestPath(node_labels=node_labels, | |||
node_attrs=node_attrs, | |||