
Merge pull request #18 from jajupmochi/v0.2 (V0.2)

Tag: v0.2.0
Author: linlin · committed 5 years ago
Commit: 55aef2be49 (GPG Key ID: 4AEE18F83AFDEB23; no known key found for this signature in database)

31 changed files with 10105 additions and 1715 deletions
  1. .gitignore (+1 -0)
  2. README.md (+1 -1)
  3. gklearn/__init__.py (+1 -1)
  4. gklearn/ged/env/common_types.py (+6 -5)
  5. gklearn/ged/env/node_map.py (+19 -9)
  6. gklearn/ged/median/median_graph_estimator.py (+342 -125)
  7. gklearn/ged/median/test_median_graph_estimator.py (+4 -4)
  8. gklearn/ged/median/utils.py (+4 -0)
  9. gklearn/ged/util/util.py (+26 -11)
  10. gklearn/gedlib/gedlibpy.cpp (+1468 -1466)
  11. gklearn/gedlib/gedlibpy.cpython-36m-x86_64-linux-gnu.so (BIN)
  12. gklearn/gedlib/gedlibpy.pyx (+9 -1)
  13. gklearn/kernels/graph_kernel.py (+3 -0)
  14. gklearn/preimage/__init__.py (+1 -0)
  15. gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py (+91 -0)
  16. gklearn/preimage/experiments/tools/preimage_results_to_latex_tables.py (+228 -0)
  17. gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py (+3382 -0)
  18. gklearn/preimage/experiments/xp_remove_best_graph_init10.py (+3085 -0)
  19. gklearn/preimage/kernel_knn_cv.py (+418 -0)
  20. gklearn/preimage/median_preimage_generator.py (+256 -41)
  21. gklearn/preimage/remove_best_graph.py (+423 -0)
  22. gklearn/preimage/utils.py (+28 -26)
  23. gklearn/tests/test_median_preimage_generator.py (+71 -0)
  24. gklearn/utils/__init__.py (+2 -0)
  25. gklearn/utils/dataset.py (+19 -1)
  26. gklearn/utils/graph_files.py (+24 -13)
  27. gklearn/utils/knn.py (+141 -0)
  28. gklearn/utils/utils.py (+33 -4)
  29. requirements.txt (+5 -4)
  30. requirements_pypi.txt (+11 -0)
  31. setup.py (+3 -3)

.gitignore (+1 -0)

@@ -32,6 +32,7 @@ gklearn/kernels/*_sym.py
 gklearn/preimage/*
 !gklearn/preimage/*.py
 !gklearn/preimage/experiments/*.py
+!gklearn/preimage/experiments/tools/*.py
 
 __pycache__
 ##*#


README.md (+1 -1)

@@ -10,7 +10,7 @@ A Python package for graph kernels, graph edit distances and graph pre-image pro
 ## Requirements
 
 * python>=3.5
-* numpy>=1.15.2
+* numpy>=1.16.2
 * scipy>=1.1.0
 * matplotlib>=3.0.0
 * networkx>=2.2


gklearn/__init__.py (+1 -1)

@@ -18,4 +18,4 @@ __date__ = "November 2017"
 # import sub modules
 # from gklearn import c_ext
 # from gklearn import ged
-from gklearn import utils
+# from gklearn import utils

gklearn/ged/env/common_types.py (+6 -5)

@@ -6,12 +6,13 @@ Created on Thu Mar 19 18:17:38 2020
 @author: ljia
 """
 
-from enum import Enum, auto
+from enum import Enum, unique
 
 
+@unique
 class AlgorithmState(Enum):
     """can be used to specify the state of an algorithm.
     """
-    CALLED = auto # The algorithm has been called.
-    INITIALIZED = auto # The algorithm has been initialized.
-    CONVERGED = auto # The algorithm has converged.
-    TERMINATED = auto # The algorithm has terminated.
+    CALLED = 1 # The algorithm has been called.
+    INITIALIZED = 2 # The algorithm has been initialized.
+    CONVERGED = 3 # The algorithm has converged.
+    TERMINATED = 4 # The algorithm has terminated.
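Why this change was needed: with "CALLED = auto" (note the missing parentheses), every member is assigned the same function object as its value, so all four names collapse into aliases of a single member and every state compares equal to every other. A minimal sketch of the pitfall, independent of gklearn:

from enum import Enum, auto, unique

class Broken(Enum):
    A = auto   # missing (): assigns the auto object itself, not a fresh value
    B = auto   # same value as A, so B becomes a mere alias of A

print(Broken.A is Broken.B)  # True: the two members are one

@unique          # raises ValueError at class creation if any values repeat
class Fixed(Enum):
    A = 1
    B = 2

Explicit integer values plus @unique, as in the committed code, make the duplicate-value bug impossible to reintroduce silently.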

gklearn/ged/env/node_map.py (+19 -9)

@@ -39,14 +39,6 @@ class NodeMap(object):
         return np.inf
 
-    def get_forward_map(self):
-        return self.__forward_map
-
-
-    def get_backward_map(self):
-        return self.__backward_map
-
-
     def as_relation(self, relation):
         relation.clear()
         for i in range(0, len(self.__forward_map)):

@@ -77,4 +69,22 @@ class NodeMap(object):
     def induced_cost(self):
         return self.__induced_cost
 
+
+    @property
+    def forward_map(self):
+        return self.__forward_map
+
+    @forward_map.setter
+    def forward_map(self, value):
+        self.__forward_map = value
+
+    @property
+    def backward_map(self):
+        return self.__backward_map
+
+    @backward_map.setter
+    def backward_map(self, value):
+        self.__backward_map = value
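The getter methods become read/write properties, which is what lets the estimator swap the two maps in place after running GED in the reversed (smaller-graph-first) direction. A minimal stand-in sketch of the pattern (not the gklearn class itself):

class _MiniNodeMap:
    """Minimal stand-in for gklearn.ged.env.NodeMap (illustration only)."""
    def __init__(self, forward, backward):
        self.__forward_map = forward
        self.__backward_map = backward

    @property
    def forward_map(self):
        return self.__forward_map

    @forward_map.setter
    def forward_map(self, value):
        self.__forward_map = value

    @property
    def backward_map(self):
        return self.__backward_map

    @backward_map.setter
    def backward_map(self, value):
        self.__backward_map = value

nm = _MiniNodeMap([1, 0], [1, 0])
# The setters enable the direction-swap idiom used throughout this PR:
nm.forward_map, nm.backward_map = nm.backward_map, nm.forward_map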

gklearn/ged/median/median_graph_estimator.py (+342 -125)

@@ -13,6 +13,9 @@ import time
 from tqdm import tqdm
 import sys
 import networkx as nx
+import multiprocessing
+from multiprocessing import Pool
+from functools import partial
 
 
 class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined node?
@@ -47,7 +50,9 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
self.__desired_num_random_inits = 10 self.__desired_num_random_inits = 10
self.__use_real_randomness = True self.__use_real_randomness = True
self.__seed = 0 self.__seed = 0
self.__parallel = True
self.__update_order = True self.__update_order = True
self.__sort_graphs = True # sort graphs by size when computing GEDs.
self.__refine = True self.__refine = True
self.__time_limit_in_sec = 0 self.__time_limit_in_sec = 0
self.__epsilon = 0.0001 self.__epsilon = 0.0001
@@ -125,6 +130,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
else: else:
raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"') raise Exception('Invalid argument "' + opt_val + '" for option stdout. Usage: options = "[--stdout 0|1|2] [...]"')

elif opt_name == 'parallel':
if opt_val == 'TRUE':
self.__parallel = True
elif opt_val == 'FALSE':
self.__parallel = False
else:
raise Exception('Invalid argument "' + opt_val + '" for option parallel. Usage: options = "[--parallel TRUE|FALSE] [...]"')
elif opt_name == 'update-order': elif opt_name == 'update-order':
if opt_val == 'TRUE': if opt_val == 'TRUE':
@@ -136,6 +151,16 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
else: else:
raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"') raise Exception('Invalid argument "' + opt_val + '" for option update-order. Usage: options = "[--update-order TRUE|FALSE] [...]"')
elif opt_name == 'sort-graphs':
if opt_val == 'TRUE':
self.__sort_graphs = True
elif opt_val == 'FALSE':
self.__sort_graphs = False
else:
raise Exception('Invalid argument "' + opt_val + '" for option sort-graphs. Usage: options = "[--sort-graphs TRUE|FALSE] [...]"')
elif opt_name == 'refine': elif opt_name == 'refine':
if opt_val == 'TRUE': if opt_val == 'TRUE':
self.__refine = True self.__refine = True
@@ -302,7 +327,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
self.__median_id = gen_median_id self.__median_id = gen_median_id
self.__state = AlgorithmState.TERMINATED self.__state = AlgorithmState.TERMINATED
# Get ExchangeGraph representations of the input graphs.
# Get NetworkX graph representations of the input graphs.
graphs = {} graphs = {}
for graph_id in graph_ids: for graph_id in graph_ids:
# @todo: get_nx_graph() function may need to be modified according to the coming code. # @todo: get_nx_graph() function may need to be modified according to the coming code.
@@ -312,7 +337,6 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
# print(graphs[0].nodes(data=True)) # print(graphs[0].nodes(data=True))
# print(graphs[0].edges(data=True)) # print(graphs[0].edges(data=True))
# print(nx.adjacency_matrix(graphs[0])) # print(nx.adjacency_matrix(graphs[0]))

# Construct initial medians. # Construct initial medians.
medians = [] medians = []
@@ -356,30 +380,14 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
self.__ged_env.load_nx_graph(median, gen_median_id) self.__ged_env.load_nx_graph(median, gen_median_id)
self.__ged_env.init(self.__ged_env.get_init_type()) self.__ged_env.init(self.__ged_env.get_init_type())
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
# Compute node maps and sum of distances for initial median. # Compute node maps and sum of distances for initial median.
self.__sum_of_distances = 0
self.__node_maps_from_median.clear()
for graph_id in graph_ids:
self.__ged_env.run_method(gen_median_id, graph_id)
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id)
# print(self.__node_maps_from_median[graph_id])
self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost()
# print(self.__sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
# xxx = self.__node_maps_from_median
self.__compute_init_node_maps(graph_ids, gen_median_id)
# yyy = self.__node_maps_from_median
self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances) self.__best_init_sum_of_distances = min(self.__best_init_sum_of_distances, self.__sum_of_distances)
self.__ged_env.load_nx_graph(median, set_median_id) self.__ged_env.load_nx_graph(median, set_median_id)
# print(self.__best_init_sum_of_distances) # print(self.__best_init_sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
# Run block gradient descent from initial median. # Run block gradient descent from initial median.
converged = False converged = False
@@ -434,7 +442,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
# print(self.__node_maps_from_median[graph_id].induced_cost()) # print(self.__node_maps_from_median[graph_id].induced_cost())
# xxx = self.__node_maps_from_median[graph_id] # xxx = self.__node_maps_from_median[graph_id]
self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id]) self.__ged_env.compute_induced_cost(gen_median_id, graph_id, self.__node_maps_from_median[graph_id])
# print('---------------------------------------')
# print('---------------------------------------')
# print(self.__node_maps_from_median[graph_id].induced_cost()) # print(self.__node_maps_from_median[graph_id].induced_cost())
# @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully! # @todo:!!!!!!!!!!!!!!!!!!!!!!!!!!!!This value is a slight different from the c++ program, which might be a bug! Use it very carefully!
@@ -540,18 +548,31 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
progress.update(1) progress.update(1)
# Improving the node maps. # Improving the node maps.
nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__gen_median_id)
for graph_id, node_map in self.__node_maps_from_median.items(): for graph_id, node_map in self.__node_maps_from_median.items():
if time.expired(): if time.expired():
if self.__state == AlgorithmState.TERMINATED: if self.__state == AlgorithmState.TERMINATED:
self.__state = AlgorithmState.CONVERGED self.__state = AlgorithmState.CONVERGED
break break
self.__ged_env.run_method(self.__gen_median_id, graph_id)
if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost():
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id)
self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost()

nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id)
if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs:
self.__ged_env.run_method(self.__gen_median_id, graph_id)
if self.__ged_env.get_upper_bound(self.__gen_median_id, graph_id) < node_map.induced_cost():
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__gen_median_id, graph_id)
else:
self.__ged_env.run_method(graph_id, self.__gen_median_id)
if self.__ged_env.get_upper_bound(graph_id, self.__gen_median_id) < node_map.induced_cost():
node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__gen_median_id)
node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map
self.__node_maps_from_median[graph_id] = node_map_tmp
self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost()

# Print information. # Print information.
if self.__print_to_stdout == 2: if self.__print_to_stdout == 2:
progress.update(1) progress.update(1)

self.__sum_of_distances = 0.0 self.__sum_of_distances = 0.0
for key, val in self.__node_maps_from_median.items(): for key, val in self.__node_maps_from_median.items():
self.__sum_of_distances += val.induced_cost() self.__sum_of_distances += val.induced_cost()
@@ -562,7 +583,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
def __median_available(self): def __median_available(self):
return self.__gen_median_id != np.inf
return self.__median_id != np.inf
def get_state(self): def get_state(self):
@@ -637,7 +658,9 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
self.__desired_num_random_inits = 10 self.__desired_num_random_inits = 10
self.__use_real_randomness = True self.__use_real_randomness = True
self.__seed = 0 self.__seed = 0
self.__parallel = True
self.__update_order = True self.__update_order = True
self.__sort_graphs = True
self.__refine = True self.__refine = True
self.__time_limit_in_sec = 0 self.__time_limit_in_sec = 0
self.__epsilon = 0.0001 self.__epsilon = 0.0001
@@ -682,35 +705,138 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
def __compute_medoid(self, graph_ids, timer, initial_medians): def __compute_medoid(self, graph_ids, timer, initial_medians):
# Use method selected for initialization phase. # Use method selected for initialization phase.
self.__ged_env.set_method(self.__init_method, self.__init_options) self.__ged_env.set_method(self.__init_method, self.__init_options)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
# Compute the medoid. # Compute the medoid.
medoid_id = graph_ids[0]
best_sum_of_distances = np.inf
for g_id in graph_ids:
if timer.expired():
self.__state = AlgorithmState.CALLED
break
sum_of_distances = 0
for h_id in graph_ids:
self.__ged_env.run_method(g_id, h_id)
sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
if sum_of_distances < best_sum_of_distances:
best_sum_of_distances = sum_of_distances
medoid_id = g_id
if self.__parallel:
# @todo: notice when parallel self.__ged_env is not modified.
sum_of_distances_list = [np.inf] * len(graph_ids)
len_itr = len(graph_ids)
itr = zip(graph_ids, range(0, len(graph_ids)))
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(ged_env_toshare):
global G_ged_env
G_ged_env = ged_env_toshare
do_fun = partial(_compute_medoid_parallel, graph_ids, self.__sort_graphs)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,))
if self.__print_to_stdout == 2:
iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize),
desc='Computing medoid', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_fun, itr, chunksize)
for i, dis in iterator:
sum_of_distances_list[i] = dis
pool.close()
pool.join()
medoid_id = np.argmin(sum_of_distances_list)
best_sum_of_distances = sum_of_distances_list[medoid_id]
initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo

else:
# Print information about current iteration. # Print information about current iteration.
if self.__print_to_stdout == 2: if self.__print_to_stdout == 2:
progress.update(1)
initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
progress = tqdm(desc='Computing medoid', total=len(graph_ids), file=sys.stdout)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
medoid_id = graph_ids[0]
best_sum_of_distances = np.inf
for g_id in graph_ids:
if timer.expired():
self.__state = AlgorithmState.CALLED
break
nb_nodes_g = self.__ged_env.get_graph_num_nodes(g_id)
sum_of_distances = 0
for h_id in graph_ids:
nb_nodes_h = self.__ged_env.get_graph_num_nodes(h_id)
if nb_nodes_g <= nb_nodes_h or not self.__sort_graphs:
self.__ged_env.run_method(g_id, h_id)
sum_of_distances += self.__ged_env.get_upper_bound(g_id, h_id)
else:
self.__ged_env.run_method(h_id, g_id)
sum_of_distances += self.__ged_env.get_upper_bound(h_id, g_id)
if sum_of_distances < best_sum_of_distances:
best_sum_of_distances = sum_of_distances
medoid_id = g_id
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
initial_medians.append(self.__ged_env.get_nx_graph(medoid_id, True, True, False)) # @todo
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')
def __compute_init_node_maps(self, graph_ids, gen_median_id):
# Compute node maps and sum of distances for initial median.
if self.__parallel:
# @todo: notice when parallel self.__ged_env is not modified.
self.__sum_of_distances = 0
self.__node_maps_from_median.clear()
sum_of_distances_list = [0] * len(graph_ids)
len_itr = len(graph_ids)
itr = graph_ids
n_jobs = multiprocessing.cpu_count()
if len_itr < 100 * n_jobs:
chunksize = int(len_itr / n_jobs) + 1
else:
chunksize = 100
def init_worker(ged_env_toshare):
global G_ged_env
G_ged_env = ged_env_toshare
nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id)
do_fun = partial(_compute_init_node_maps_parallel, gen_median_id, self.__sort_graphs, nb_nodes_median)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,))
if self.__print_to_stdout == 2:
iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize),
desc='Computing initial node maps', file=sys.stdout)
else:
iterator = pool.imap_unordered(do_fun, itr, chunksize)
for g_id, sod, node_maps in iterator:
sum_of_distances_list[g_id] = sod
self.__node_maps_from_median[g_id] = node_maps
pool.close()
pool.join()
self.__sum_of_distances = np.sum(sum_of_distances_list)
# xxx = self.__node_maps_from_median
else:
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress = tqdm(desc='Computing initial node maps', total=len(graph_ids), file=sys.stdout)
self.__sum_of_distances = 0
self.__node_maps_from_median.clear()
nb_nodes_median = self.__ged_env.get_graph_num_nodes(gen_median_id)
for graph_id in graph_ids:
nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id)
if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs:
self.__ged_env.run_method(gen_median_id, graph_id)
self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(gen_median_id, graph_id)
else:
self.__ged_env.run_method(graph_id, gen_median_id)
node_map_tmp = self.__ged_env.get_node_map(graph_id, gen_median_id)
node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map
self.__node_maps_from_median[graph_id] = node_map_tmp
# print(self.__node_maps_from_median[graph_id])
self.__sum_of_distances += self.__node_maps_from_median[graph_id].induced_cost()
# print(self.__sum_of_distances)
# Print information about current iteration.
if self.__print_to_stdout == 2:
progress.update(1)
# Print information about current iteration.
if self.__print_to_stdout == 2:
print('\n')

def __termination_criterion_met(self, converged, timer, itr, itrs_without_update): def __termination_criterion_met(self, converged, timer, itr, itrs_without_update):
if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False): if timer.expired() or (itr >= self.__max_itrs if self.__max_itrs >= 0 else False):
@@ -743,6 +869,7 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
 
     def __update_node_labels(self, graphs, median):
+#        print('----------------------------')
 
         # Print information about current iteration.
         if self.__print_to_stdout == 2:
@@ -750,14 +877,15 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
# Iterate through all nodes of the median. # Iterate through all nodes of the median.
for i in range(0, nx.number_of_nodes(median)): for i in range(0, nx.number_of_nodes(median)):
# print('i: ', i)
# print('i: ', i)
# Collect the labels of the substituted nodes. # Collect the labels of the substituted nodes.
node_labels = [] node_labels = []
for graph_id, graph in graphs.items(): for graph_id, graph in graphs.items():
# print('graph_id: ', graph_id)
# print(self.__node_maps_from_median[graph_id])
# print('graph_id: ', graph_id)
# print(self.__node_maps_from_median[graph_id])
# print(self.__node_maps_from_median[graph_id].forward_map, self.__node_maps_from_median[graph_id].backward_map)
k = self.__node_maps_from_median[graph_id].image(i) k = self.__node_maps_from_median[graph_id].image(i)
# print('k: ', k)
# print('k: ', k)
if k != np.inf: if k != np.inf:
node_labels.append(graph.nodes[k]) node_labels.append(graph.nodes[k])
@@ -816,26 +944,70 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
 
 
     def __update_node_maps(self):
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
-
         # Update the node maps.
-        node_maps_were_modified = False
-        for graph_id, node_map in self.__node_maps_from_median.items():
-            self.__ged_env.run_method(self.__median_id, graph_id)
-            if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon:
-#                xxx = self.__node_maps_from_median[graph_id]
-                self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id)
-#                yyy = self.__node_maps_from_median[graph_id]
-                node_maps_were_modified = True
-
-            # Print information about current iteration.
-            if self.__print_to_stdout == 2:
-                progress.update(1)
-
-        # Print information about current iteration.
-        if self.__print_to_stdout == 2:
-            print('\n')
+        if self.__parallel:
+            # @todo: notice when parallel self.__ged_env is not modified.
+            node_maps_were_modified = False
+#            xxx = self.__node_maps_from_median.copy()
+            len_itr = len(self.__node_maps_from_median)
+            itr = [item for item in self.__node_maps_from_median.items()]
+            n_jobs = multiprocessing.cpu_count()
+            if len_itr < 100 * n_jobs:
+                chunksize = int(len_itr / n_jobs) + 1
+            else:
+                chunksize = 100
+            def init_worker(ged_env_toshare):
+                global G_ged_env
+                G_ged_env = ged_env_toshare
+            nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id)
+            do_fun = partial(_update_node_maps_parallel, self.__median_id, self.__epsilon, self.__sort_graphs, nb_nodes_median)
+            pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(self.__ged_env,))
+            if self.__print_to_stdout == 2:
+                iterator = tqdm(pool.imap_unordered(do_fun, itr, chunksize),
+                                desc='Updating node maps', file=sys.stdout)
+            else:
+                iterator = pool.imap_unordered(do_fun, itr, chunksize)
+            for g_id, node_map, nm_modified in iterator:
+                self.__node_maps_from_median[g_id] = node_map
+                if nm_modified:
+                    node_maps_were_modified = True
+            pool.close()
+            pool.join()
+#            yyy = self.__node_maps_from_median.copy()
+
+        else:
+            # Print information about current iteration.
+            if self.__print_to_stdout == 2:
+                progress = tqdm(desc='Updating node maps', total=len(self.__node_maps_from_median), file=sys.stdout)
+
+            node_maps_were_modified = False
+            nb_nodes_median = self.__ged_env.get_graph_num_nodes(self.__median_id)
+            for graph_id, node_map in self.__node_maps_from_median.items():
+                nb_nodes_g = self.__ged_env.get_graph_num_nodes(graph_id)
+                if nb_nodes_median <= nb_nodes_g or not self.__sort_graphs:
+                    self.__ged_env.run_method(self.__median_id, graph_id)
+                    if self.__ged_env.get_upper_bound(self.__median_id, graph_id) < node_map.induced_cost() - self.__epsilon:
+#                        xxx = self.__node_maps_from_median[graph_id]
+                        self.__node_maps_from_median[graph_id] = self.__ged_env.get_node_map(self.__median_id, graph_id)
+                        node_maps_were_modified = True
+                else:
+                    self.__ged_env.run_method(graph_id, self.__median_id)
+                    if self.__ged_env.get_upper_bound(graph_id, self.__median_id) < node_map.induced_cost() - self.__epsilon:
+                        node_map_tmp = self.__ged_env.get_node_map(graph_id, self.__median_id)
+                        node_map_tmp.forward_map, node_map_tmp.backward_map = node_map_tmp.backward_map, node_map_tmp.forward_map
+                        self.__node_maps_from_median[graph_id] = node_map_tmp
+                        node_maps_were_modified = True
+
+                # Print information about current iteration.
+                if self.__print_to_stdout == 2:
+                    progress.update(1)
+
+            # Print information about current iteration.
+            if self.__print_to_stdout == 2:
+                print('\n')
 
         # Return true if the node maps were modified.
         return node_maps_were_modified
@@ -846,6 +1018,11 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
if self.__print_to_stdout == 2: if self.__print_to_stdout == 2:
print('Trying to decrease order: ... ', end='') print('Trying to decrease order: ... ', end='')
if nx.number_of_nodes(median) <= 1:
if self.__print_to_stdout == 2:
print('median graph has only 1 node, skip decrease.')
return False
# Initialize ID of the node that is to be deleted. # Initialize ID of the node that is to be deleted.
id_deleted_node = [None] # @todo: or np.inf id_deleted_node = [None] # @todo: or np.inf
decreased_order = False decreased_order = False
@@ -853,7 +1030,11 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
# Decrease the order as long as the best deletion delta is negative. # Decrease the order as long as the best deletion delta is negative.
while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon: while self.__compute_best_deletion_delta(graphs, median, id_deleted_node) < -self.__epsilon:
decreased_order = True decreased_order = True
median = self.__delete_node_from_median(id_deleted_node[0], median)
self.__delete_node_from_median(id_deleted_node[0], median)
if nx.number_of_nodes(median) <= 1:
if self.__print_to_stdout == 2:
print('decrease stopped because median graph remains only 1 node. ', end='')
break
# Print information about current iteration. # Print information about current iteration.
if self.__print_to_stdout == 2: if self.__print_to_stdout == 2:
@@ -896,16 +1077,22 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
     def __delete_node_from_median(self, id_deleted_node, median):
         # Update the median.
+        mapping = {}
+        for i in range(0, nx.number_of_nodes(median)):
+            if i != id_deleted_node:
+                new_i = (i if i < id_deleted_node else (i - 1))
+                mapping[i] = new_i
         median.remove_node(id_deleted_node)
-        median = nx.convert_node_labels_to_integers(median, first_label=0, ordering='default', label_attribute=None) # @todo: This doesn't guarantee that the order is the same as in G.
+        nx.relabel_nodes(median, mapping, copy=False)
 
         # Update the node maps.
+#        xxx = self.__node_maps_from_median
         for key, node_map in self.__node_maps_from_median.items():
             new_node_map = NodeMap(nx.number_of_nodes(median), node_map.num_target_nodes())
             is_unassigned_target_node = [True] * node_map.num_target_nodes()
             for i in range(0, nx.number_of_nodes(median) + 1):
                 if i != id_deleted_node:
-                    new_i = (i if i < id_deleted_node else i - 1)
+                    new_i = (i if i < id_deleted_node else (i - 1))
                     k = node_map.image(i)
                     new_node_map.add_assignment(new_i, k)
                     if k != np.inf:

@@ -913,13 +1100,12 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
             for k in range(0, node_map.num_target_nodes()):
                 if is_unassigned_target_node[k]:
                     new_node_map.add_assignment(np.inf, k)
-#            print(new_node_map.get_forward_map(), new_node_map.get_backward_map())
+#            print(self.__node_maps_from_median[key].forward_map, self.__node_maps_from_median[key].backward_map)
+#            print(new_node_map.forward_map, new_node_map.backward_map
             self.__node_maps_from_median[key] = new_node_map
 
         # Increase overall number of decreases.
         self.__num_decrease_order += 1
-
-        return median
 
 
     def __increase_order(self, graphs, median):
@@ -1115,10 +1301,22 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
continue continue
for label in median_labels: for label in median_labels:
weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id]))) weights[label_id] = min(weights[label_id], self.__ged_env.get_node_rel_cost(dict(label), dict(node_labels[label_id])))
selected_label_id = urng.choice(range(0, len(weights)), size=1, p=np.array(weights) / np.sum(weights))[0] # for c++ test: xxx[iii]
# get non-zero weights.
weights_p, idx_p = [], []
for i, w in enumerate(weights):
if w != 0:
weights_p.append(w)
idx_p.append(i)
if len(weights_p) > 0:
p = np.array(weights_p) / np.sum(weights_p)
selected_label_id = urng.choice(range(0, len(weights_p)), size=1, p=p)[0] # for c++ test: xxx[iii]
selected_label_id = idx_p[selected_label_id]
# iii += 1 for c++ test # iii += 1 for c++ test
median_labels.append(node_labels[selected_label_id])
already_selected[selected_label_id] = True
median_labels.append(node_labels[selected_label_id])
already_selected[selected_label_id] = True
else: # skip the loop when all node_labels are selected. This happens when len(node_labels) <= self.__num_inits_increase_order.
break
else: else:
# Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size. # Compute the initial node medians as the medians of randomly generated clusters of (roughly) equal size.
# @todo: go through and test. # @todo: go through and test.
@@ -1195,6 +1393,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
def __update_node_label(self, node_labels, node_label): def __update_node_label(self, node_labels, node_label):
if len(node_labels) == 0: # @todo: check if this is the correct solution. Especially after calling __update_config().
return False
new_node_label = self.__get_median_node_label(node_labels) new_node_label = self.__get_median_node_label(node_labels)
if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon: if self.__ged_env.get_node_rel_cost(new_node_label, node_label) > self.__epsilon:
node_label.clear() node_label.clear()
@@ -1225,7 +1425,8 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
def __add_node_to_median(self, best_config, best_label, median): def __add_node_to_median(self, best_config, best_label, median):
# Update the median. # Update the median.
median.add_node(nx.number_of_nodes(median), **best_label)
nb_nodes_median = nx.number_of_nodes(median)
median.add_node(nb_nodes_median, **best_label)
# Update the node maps. # Update the node maps.
for graph_id, node_map in self.__node_maps_from_median.items(): for graph_id, node_map in self.__node_maps_from_median.items():
@@ -1239,47 +1440,6 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
# Increase overall number of increases. # Increase overall number of increases.
self.__num_increase_order += 1 self.__num_increase_order += 1
def __improve_sum_of_distances(self, timer):
pass
def __median_available(self):
return self.__median_id != np.inf
# def __get_node_image_from_map(self, node_map, node):
# """
# Return ID of the node mapping of `node` in `node_map`.

# Parameters
# ----------
# node_map : list[tuple(int, int)]
# List of node maps where the mapping node is found.
#
# node : int
# The mapping node of this node is returned

# Raises
# ------
# Exception
# If the node with ID `node` is not contained in the source nodes of the node map.

# Returns
# -------
# int
# ID of the mapping of `node`.
#
# Notes
# -----
# This function is not implemented in the `ged::MedianGraphEstimator` class of the `GEDLIB` library. Instead it is a Python implementation of the `ged::NodeMap::image` function.
# """
# if node < len(node_map):
# return node_map[node][1] if node_map[node][1] < len(node_map) else np.inf
# else:
# raise Exception('The node with ID ', str(node), ' is not contained in the source nodes of the node map.')
# return np.inf
def __are_graphs_equal(self, g1, g2): def __are_graphs_equal(self, g1, g2):
@@ -1489,4 +1649,61 @@ class MedianGraphEstimator(object): # @todo: differ dummy_node from undifined no
 #        median_label = {}
 #        for key, val in median.items():
 #            median_label[key] = str(val)
 #        return median_label
+
+
+def _compute_medoid_parallel(graph_ids, sort, itr):
+    g_id = itr[0]
+    i = itr[1]
+    # @todo: timer not considered here.
+#    if timer.expired():
+#        self.__state = AlgorithmState.CALLED
+#        break
+    nb_nodes_g = G_ged_env.get_graph_num_nodes(g_id)
+    sum_of_distances = 0
+    for h_id in graph_ids:
+        nb_nodes_h = G_ged_env.get_graph_num_nodes(h_id)
+        if nb_nodes_g <= nb_nodes_h or not sort:
+            G_ged_env.run_method(g_id, h_id)
+            sum_of_distances += G_ged_env.get_upper_bound(g_id, h_id)
+        else:
+            G_ged_env.run_method(h_id, g_id)
+            sum_of_distances += G_ged_env.get_upper_bound(h_id, g_id)
+    return i, sum_of_distances
+
+
+def _compute_init_node_maps_parallel(gen_median_id, sort, nb_nodes_median, itr):
+    graph_id = itr
+    nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id)
+    if nb_nodes_median <= nb_nodes_g or not sort:
+        G_ged_env.run_method(gen_median_id, graph_id)
+        node_map = G_ged_env.get_node_map(gen_median_id, graph_id)
+#        print(self.__node_maps_from_median[graph_id])
+    else:
+        G_ged_env.run_method(graph_id, gen_median_id)
+        node_map = G_ged_env.get_node_map(graph_id, gen_median_id)
+        node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map
+    sum_of_distance = node_map.induced_cost()
+#    print(self.__sum_of_distances)
+    return graph_id, sum_of_distance, node_map
+
+
+def _update_node_maps_parallel(median_id, epsilon, sort, nb_nodes_median, itr):
+    graph_id = itr[0]
+    node_map = itr[1]
+
+    node_maps_were_modified = False
+    nb_nodes_g = G_ged_env.get_graph_num_nodes(graph_id)
+    if nb_nodes_median <= nb_nodes_g or not sort:
+        G_ged_env.run_method(median_id, graph_id)
+        if G_ged_env.get_upper_bound(median_id, graph_id) < node_map.induced_cost() - epsilon:
+            node_map = G_ged_env.get_node_map(median_id, graph_id)
+            node_maps_were_modified = True
+    else:
+        G_ged_env.run_method(graph_id, median_id)
+        if G_ged_env.get_upper_bound(graph_id, median_id) < node_map.induced_cost() - epsilon:
+            node_map = G_ged_env.get_node_map(graph_id, median_id)
+            node_map.forward_map, node_map.backward_map = node_map.backward_map, node_map.forward_map
+            node_maps_were_modified = True
+
+    return graph_id, node_map, node_maps_were_modified
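These module-level helpers rely on a worker-global G_ged_env that is installed through the Pool initializer, because the Cython GEDEnv cannot be pickled and shipped with each task. A self-contained sketch of that pattern, with a plain dict standing in for the GED environment (the chunksize heuristic mirrors the one used above):

import multiprocessing
from functools import partial
from multiprocessing import Pool

def init_worker(env_toshare):
    # Runs once per worker process; stores the heavy object in a module-level
    # global so each task can reach it without pickling it per item.
    global G_env
    G_env = env_toshare

def _task(offset, item):
    return item, G_env['base'] + item + offset

if __name__ == '__main__':
    env = {'base': 100}  # stand-in for the shared (in gklearn: unpicklable) environment
    items = list(range(10))
    n_jobs = multiprocessing.cpu_count()
    # Same heuristic as the estimator: split small workloads evenly across
    # workers, cap the chunk at 100 for large ones.
    if len(items) < 100 * n_jobs:
        chunksize = int(len(items) / n_jobs) + 1
    else:
        chunksize = 100
    do_fun = partial(_task, 1)
    pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(env,))
    for item, result in pool.imap_unordered(do_fun, items, chunksize):
        print(item, result)
    pool.close()
    pool.join()

Note the caveat flagged in the diff's own @todo comments: workers get a copy of the environment, so anything a worker mutates in G_ged_env is not reflected in the parent process.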

gklearn/ged/median/test_median_graph_estimator.py (+4 -4)

@@ -53,7 +53,7 @@ def test_median_graph_estimator():
     mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
 
     mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
-    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE --randomness PSEUDO '# @todo: std::to_string(rng())
+    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE --randomness PSEUDO --parallel TRUE '# @todo: std::to_string(rng())
 
     # Select the GED algorithm.
     algo_options = '--threads ' + str(threads) + algo_options_suffix

@@ -127,7 +127,7 @@ def test_median_graph_estimator_symb():
     mge.set_refine_method(algo, '--threads ' + str(threads) + ' --initial-solutions ' + str(initial_solutions) + ' --ratio-runs-from-initial-solutions 1')
 
     mge_options = '--time-limit ' + str(time_limit) + ' --stdout 2 --init-type ' + init_type
-    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE'# @todo: std::to_string(rng())
+    mge_options += ' --random-inits ' + str(num_inits) + ' --seed ' + '1' + ' --update-order TRUE --refine FALSE --randomness PSEUDO --parallel TRUE '# @todo: std::to_string(rng())
 
     # Select the GED algorithm.
     algo_options = '--threads ' + str(threads) + algo_options_suffix

@@ -155,5 +155,5 @@ def test_median_graph_estimator_symb():
 
 
 if __name__ == '__main__':
-    set_median, gen_median = test_median_graph_estimator()
-#    set_median, gen_median = test_median_graph_estimator_symb()
+#    set_median, gen_median = test_median_graph_estimator()
+    set_median, gen_median = test_median_graph_estimator_symb()

gklearn/ged/median/utils.py (+4 -0)

@@ -30,8 +30,12 @@ def mge_options_to_string(options):
opt_str += '--randomness ' + str(val) + ' ' opt_str += '--randomness ' + str(val) + ' '
elif key == 'verbose': elif key == 'verbose':
opt_str += '--stdout ' + str(val) + ' ' opt_str += '--stdout ' + str(val) + ' '
elif key == 'parallel':
opt_str += '--parallel ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'update_order': elif key == 'update_order':
opt_str += '--update-order ' + ('TRUE' if val else 'FALSE') + ' ' opt_str += '--update-order ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'sort_graphs':
opt_str += '--sort-graphs ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'refine': elif key == 'refine':
opt_str += '--refine ' + ('TRUE' if val else 'FALSE') + ' ' opt_str += '--refine ' + ('TRUE' if val else 'FALSE') + ' '
elif key == 'time_limit': elif key == 'time_limit':
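A sketch of how the two new keys surface when building an options string. Only keys visible in this diff are shown; the printed string assumes Python's insertion-ordered dicts:

from gklearn.ged.median.utils import mge_options_to_string

mge_options = {
    'verbose': 2,           # emitted as '--stdout 2'
    'parallel': True,       # new in v0.2: '--parallel TRUE'
    'update_order': True,
    'sort_graphs': True,    # new in v0.2: '--sort-graphs TRUE'
    'refine': False,
}
print(mge_options_to_string(mge_options))
# e.g. '--stdout 2 --parallel TRUE --update-order TRUE --sort-graphs TRUE --refine FALSE '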


gklearn/ged/util/util.py (+26 -11)

@@ -46,7 +46,7 @@ def compute_ged(g1, g2, options):
     return dis, pi_forward, pi_backward
 
 
-def compute_geds(graphs, options={}, parallel=False):
+def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True):
     # initialize ged env.
     ged_env = gedlibpy.GEDEnv()
     ged_env.set_edit_cost(options['edit_cost'], edit_cost_constant=options['edit_cost_constants'])

@@ -54,6 +54,8 @@ def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True):
         ged_env.add_nx_graph(g, '')
     listID = ged_env.get_all_graph_ids()
     ged_env.init()
+    if parallel:
+        options['threads'] = 1
     ged_env.set_method(options['method'], ged_options_to_string(options))
     ged_env.init_method()

@@ -77,10 +79,13 @@ def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True):
             G_graphs = graphs_toshare
             G_ged_env = ged_env_toshare
             G_listID = listID_toshare
-        do_partial = partial(_wrapper_compute_ged_parallel, neo_options)
+        do_partial = partial(_wrapper_compute_ged_parallel, neo_options, sort)
         pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID))
-        iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
-                        desc='computing GEDs', file=sys.stdout)
+        if verbose:
+            iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
+                            desc='computing GEDs', file=sys.stdout)
+        else:
+            iterator = pool.imap_unordered(do_partial, itr, chunksize)
 #        iterator = pool.imap_unordered(do_partial, itr, chunksize)
         for i, j, dis, n_eo_tmp in iterator:
             idx_itr = int(len(graphs) * i + j - (i + 1) * (i + 2) / 2)

@@ -96,28 +101,38 @@ def compute_geds(graphs, options={}, sort=True, parallel=False, verbose=True):
     else:
         ged_vec = []
         n_edit_operations = []
-        for i in tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout):
+        if verbose:
+            iterator = tqdm(range(len(graphs)), desc='computing GEDs', file=sys.stdout)
+        else:
+            iterator = range(len(graphs))
+        for i in iterator:
 #        for i in range(len(graphs)):
             for j in range(i + 1, len(graphs)):
-                dis, pi_forward, pi_backward = _compute_ged(ged_env, listID[i], listID[j], graphs[i], graphs[j])
+                if nx.number_of_nodes(graphs[i]) <= nx.number_of_nodes(graphs[j]) or not sort:
+                    dis, pi_forward, pi_backward = _compute_ged(ged_env, listID[i], listID[j], graphs[i], graphs[j])
+                else:
+                    dis, pi_backward, pi_forward = _compute_ged(ged_env, listID[j], listID[i], graphs[j], graphs[i])
                 ged_vec.append(dis)
                 ged_mat[i][j] = dis
                 ged_mat[j][i] = dis
                 n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options)
                 n_edit_operations.append(n_eo_tmp)
 
     return ged_vec, ged_mat, n_edit_operations
 
 
-def _wrapper_compute_ged_parallel(options, itr):
+def _wrapper_compute_ged_parallel(options, sort, itr):
     i = itr[0]
     j = itr[1]
-    dis, n_eo_tmp = _compute_ged_parallel(G_ged_env, G_listID[i], G_listID[j], G_graphs[i], G_graphs[j], options)
+    dis, n_eo_tmp = _compute_ged_parallel(G_ged_env, G_listID[i], G_listID[j], G_graphs[i], G_graphs[j], options, sort)
     return i, j, dis, n_eo_tmp
 
 
-def _compute_ged_parallel(env, gid1, gid2, g1, g2, options):
-    dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2)
+def _compute_ged_parallel(env, gid1, gid2, g1, g2, options, sort):
+    if nx.number_of_nodes(g1) <= nx.number_of_nodes(g2) or not sort:
+        dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2)
+    else:
+        dis, pi_backward, pi_forward = _compute_ged(env, gid2, gid1, g2, g1)
    n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward, **options) # [0,0,0,0,0,0]
     return dis, n_eo_tmp
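A hedged usage sketch of the extended compute_geds() signature. The option keys are the ones this diff actually reads ('edit_cost', 'edit_cost_constants', 'method', 'threads'); the cost-model and method names below are assumptions, to be replaced by whatever your GEDLIB build provides:

import networkx as nx
from gklearn.ged.util.util import compute_geds

graphs = [nx.path_graph(3), nx.path_graph(5), nx.cycle_graph(4)]
options = {
    'edit_cost': 'CONSTANT',                 # assumed cost-model name
    'edit_cost_constants': [1, 1, 1, 1, 1, 1],
    'method': 'BIPARTITE',                   # assumed GED heuristic name
    'threads': 1,
}
# sort=True (new in this commit) runs each pair with the smaller graph first
# and swaps the returned node maps back; verbose=False suppresses the tqdm bar.
ged_vec, ged_mat, n_edit_ops = compute_geds(graphs, options=options,
                                            sort=True, parallel=False, verbose=False)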




gklearn/gedlib/gedlibpy.cpp (+1468 -1466)

File diff suppressed because it is too large.

gklearn/gedlib/gedlibpy.cpython-36m-x86_64-linux-gnu.so (BIN)


gklearn/gedlib/gedlibpy.pyx (+9 -1)

@@ -112,6 +112,7 @@ cdef extern from "src/GedLibBind.hpp" namespace "pyged":
 ##CYTHON WRAPPER INTERFACES##
 #############################
 
+# import cython
 import numpy as np
 import networkx as nx
 from gklearn.ged.env import NodeMap

@@ -177,14 +178,16 @@ def get_dummy_node() :
     return getDummyNode()
 
 
+# @cython.auto_pickle(True)
 cdef class GEDEnv:
     """Cython wrapper class for C++ class PyGEDEnv
     """
 #    cdef PyGEDEnv c_env  # Hold a C++ instance which we're wrapping
     cdef PyGEDEnv* c_env  # hold a pointer to the C++ instance which we're wrapping
 
 
     def __cinit__(self):
+#        self.c_env = PyGEDEnv()
         self.c_env = new PyGEDEnv()

@@ -192,6 +195,11 @@ cdef class GEDEnv:
         del self.c_env
 
 
+#    def __reduce__(self):
+#        # return GEDEnv, (self.c_env,)
+#        return GEDEnv, tuple()
+
+
     def is_initialized(self) :
         """
             Checks and returns if the computation environment is initialized or not.
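The commented-out auto_pickle/__reduce__ lines point at the underlying constraint: a cdef class holding a raw C++ pointer cannot be pickled, which is why the median estimator shares the environment with worker processes via a Pool initializer instead of passing it in task arguments. For reference, a minimal single-process sketch of the GEDEnv call sequence this PR uses elsewhere; the import path mirrors the file location, and 'CONSTANT'/'BIPARTITE' are assumed names:

import networkx as nx
from gklearn.gedlib import gedlibpy  # assumes gedlib's shared libraries are importable

g1, g2 = nx.path_graph(3), nx.cycle_graph(4)

env = gedlibpy.GEDEnv()
env.set_edit_cost('CONSTANT', edit_cost_constant=[1, 1, 1, 1, 1, 1])  # assumed cost model
env.add_nx_graph(g1, '')
env.add_nx_graph(g2, '')
gid1, gid2 = env.get_all_graph_ids()
env.init()
env.set_method('BIPARTITE', '--threads 1')  # assumed method name
env.init_method()
env.run_method(gid1, gid2)
print(env.get_upper_bound(gid1, gid2))  # an upper bound on GED(g1, g2)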


gklearn/kernels/graph_kernel.py (+3 -0)

@@ -67,6 +67,9 @@ class GraphKernel(object):
 
 
     def normalize_gm(self, gram_matrix):
+        import warnings
+        warnings.warn('gklearn.kernels.graph_kernel.normalize_gm will be deprecated, use gklearn.utils.normalize_gram_matrix instead', DeprecationWarning)
+
         diag = gram_matrix.diagonal().copy()
         for i in range(len(gram_matrix)):
             for j in range(i, len(gram_matrix)):
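The deprecation points to gklearn.utils.normalize_gram_matrix. The normalization itself is the standard cosine one, K'[i][j] = K[i][j] / sqrt(K[i][i] * K[j][j]). A NumPy sketch, equivalent up to numerical details (not gklearn's exact implementation):

import numpy as np

def normalize_gram(K):
    d = np.sqrt(np.diag(K))       # per-graph self-similarities
    return K / np.outer(d, d)     # unit self-similarity on the diagonal

K = np.array([[4.0, 2.0], [2.0, 9.0]])
print(normalize_gram(K))          # diagonal entries become 1.0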


gklearn/preimage/__init__.py (+1 -0)

@@ -12,3 +12,4 @@ __date__ = "March 2020"
 
 from gklearn.preimage.preimage_generator import PreimageGenerator
 from gklearn.preimage.median_preimage_generator import MedianPreimageGenerator
+from gklearn.preimage.kernel_knn_cv import kernel_knn_cv

gklearn/preimage/experiments/tools/analyze_results_of_random_edit_costs.py (+91 -0)

@@ -0,0 +1,91 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 16:57:18 2020

@author: ljia
"""
import pandas as pd
import numpy as np
import os
import math


def summarize_results_of_random_edit_costs(data_dir, ds_name, gkernel):
    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_min_gi = []
    time_total_list = []
    mge_dec_order_list = []
    mge_inc_order_list = []

    # get results from .csv.
    file_name = data_dir + 'results_summary.' + ds_name + '.' + gkernel + '.csv'
    try:
        df = pd.read_csv(file_name)
    except FileNotFoundError:
        return
    for index, row in df.iterrows():
        if row['target'] == 'all' and row['fit method'] == 'random':
            if not math.isnan(float(row['SOD SM'])):
                sod_sm_list.append(float(row['SOD SM']))
            if not math.isnan(float(row['SOD GM'])):
                sod_gm_list.append(float(row['SOD GM']))
            if not math.isnan(float(row['dis_k SM'])):
                dis_k_sm_list.append(float(row['dis_k SM']))
            if not math.isnan(float(row['dis_k GM'])):
                dis_k_gm_list.append(float(row['dis_k GM']))
            if not math.isnan(float(row['min dis_k gi'])):
                dis_k_min_gi.append(float(row['min dis_k gi']))
            if not math.isnan(float(row['time total'])):
                time_total_list.append(float(row['time total']))
            if 'mge num decrease order' in row:
                mge_dec_order_list.append(int(row['mge num decrease order']))
            if 'mge num increase order' in row:
                mge_inc_order_list.append(int(row['mge num increase order']))

    # return if no results.
    if len(sod_sm_list) == 0:
        return

    # construct output results.
    op = {}
    op['measure'] = ['max', 'min', 'mean']
    op['SOD SM'] = [np.max(sod_sm_list), np.min(sod_sm_list), np.mean(sod_sm_list)]
    op['SOD GM'] = [np.max(sod_gm_list), np.min(sod_gm_list), np.mean(sod_gm_list)]
    op['dis_k SM'] = [np.max(dis_k_sm_list), np.min(dis_k_sm_list), np.mean(dis_k_sm_list)]
    op['dis_k GM'] = [np.max(dis_k_gm_list), np.min(dis_k_gm_list), np.mean(dis_k_gm_list)]
    op['min dis_k gi'] = [np.max(dis_k_min_gi), np.min(dis_k_min_gi), np.mean(dis_k_min_gi)]
    op['time total'] = [np.max(time_total_list), np.min(time_total_list), np.mean(time_total_list)]
    if len(mge_dec_order_list) > 0:
        op['mge num decrease order'] = [np.max(mge_dec_order_list), np.min(mge_dec_order_list), np.mean(mge_dec_order_list)]
    if len(mge_inc_order_list) > 0:
        op['mge num increase order'] = [np.max(mge_inc_order_list), np.min(mge_inc_order_list), np.mean(mge_inc_order_list)]
    df = pd.DataFrame(data=op)

    # write results to .csv
    df.to_csv(data_dir + 'summary_for_random_edit_costs.csv', index=False, header=True)


def compute_for_all_experiments(data_dir):
    dir_list = [i for i in os.listdir(data_dir) if os.path.isdir(data_dir + i)]
    for dir_name in dir_list:
        sp_tmp = dir_name.split('.')
        ds_name = sp_tmp[0].strip('[error]')
        gkernel = sp_tmp[1]
        summarize_results_of_random_edit_costs(data_dir + dir_name + '/',
                                               ds_name, gkernel)
        if os.path.exists(data_dir + dir_name + '/update_order/'):
            summarize_results_of_random_edit_costs(data_dir + dir_name + '/update_order/',
                                                   ds_name, gkernel)


if __name__ == '__main__':
#    data_dir = '../results/xp_median_preimage.update_order/'
    root_dir_tnz = '../../results/CRIANN/xp_median_preimage.init10/'
    root_dir_ntnz = '../../results/CRIANN/xp_median_preimage.init10.no_triangle_rule/'
    root_dir_tz = '../../results/CRIANN/xp_median_preimage.init10.triangle_rule.allow_zeros/'
    root_dir_ntz = '../../results/CRIANN/xp_median_preimage.init10.no_triangle_rule.allow_zeros/'
    data_dirs = [root_dir_tnz, root_dir_ntnz, root_dir_tz, root_dir_ntz]
    for data_dir in data_dirs:
        compute_for_all_experiments(data_dir)

gklearn/preimage/experiments/tools/preimage_results_to_latex_tables.py (+228 -0)

@@ -0,0 +1,228 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 30 10:16:33 2020

@author: ljia
"""
import pandas as pd
import numpy as np
import os


DS_SYMB = ['MUTAG', 'Monoterpenoides', 'MAO_symb']
DS_NON_SYMB = ['Letter-high', 'Letter-med', 'Letter-low', 'COIL-RAG', 'PAH']
DS_UNLABELED = ['PAH_unlabeled']


def rounder(x, decimals):
x_strs = str(x).split('.')
if len(x_strs) == 2:
before = x_strs[0]
after = x_strs[1]
if len(after) > decimals:
if int(after[decimals]) >= 5:
after0s = ''
for c in after:
if c == '0':
after0s += '0'
elif c != '0':
break
after = after0s + str(int(after[0:decimals]) + 1)[-decimals:]
else:
after = after[0:decimals]
elif len(after) < decimals:
after += '0' * (decimals - len(after))
return before + '.' + after

elif len(x_strs) == 1:
return x_strs[0]

def replace_nth(string, sub, wanted, n):
import re
where = [m.start() for m in re.finditer(sub, string)][n-1]
before = string[:where]
after = string[where:]
after = after.replace(sub, wanted, 1)
newString = before + after
return newString


def df_to_latex_table(df):
ltx = df.to_latex(index=True, escape=False, multirow=True)
# modify middle lines.
ltx = ltx.replace('\\cline{1-9}\n\\cline{2-9}', '\\toprule')
ltx = ltx.replace('\\cline{2-9}', '\\cmidrule(l){2-9}')
# modify first row.
i_start = ltx.find('\n\\toprule\n')
i_end = ltx.find('\\\\\n\\midrule\n')
ltx = ltx.replace(ltx[i_start:i_end+12], '\n\\toprule\nDatasets & Graph Kernels & Algorithms & $d_\\mathcal{F}$ SM & $d_\\mathcal{F}$ SM (UO) & $d_\\mathcal{F}$ GM & $d_\\mathcal{F}$ GM (UO) & Runtime & Runtime (UO) \\\\\n\\midrule\n', 1)
# add row numbers.
ltx = ltx.replace('lllllllll', 'lllllllll|@{\\makebox[2em][r]{\\textit{\\rownumber\\space}}}', 1)
ltx = replace_nth(ltx, '\\\\\n', '\\gdef\\rownumber{\\stepcounter{magicrownumbers}\\arabic{magicrownumbers}} \\\\\n', 1)
return ltx


def beautify_df(df):
df = df.sort_values(by=['Datasets', 'Graph Kernels'])
df = df.set_index(['Datasets', 'Graph Kernels', 'Algorithms'])
# index = pd.MultiIndex.from_frame(df[['Datasets', 'Graph Kernels', 'Algorithms']])

# bold the best results.
for ds in df.index.get_level_values('Datasets').unique():
for gk in df.loc[ds].index.get_level_values('Graph Kernels').unique():
min_val = np.inf
min_indices = []
min_labels = []
for index, row in df.loc[(ds, gk)].iterrows():
for label in ['$d_\mathcal{F}$ SM', '$d_\mathcal{F}$ GM', '$d_\mathcal{F}$ GM (UO)']:
value = row[label]
if value != '-':
value = float(value.strip('/same'))
if value < min_val:
min_val = value
min_indices = [index]
min_labels = [label]
elif value == min_val:
min_indices.append(index)
min_labels.append(label)
for idx, index in enumerate(min_indices):
df.loc[(ds, gk, index), min_labels[idx]] = '\\textbf{' + df.loc[(ds, gk, index), min_labels[idx]] + '}'
return df


def get_results(data_dir, ds_name, gkernel):
# get results from .csv.
file_name = data_dir + 'results_summary.' + ds_name + '.' + gkernel + '.csv'
try:
df_summary = pd.read_csv(file_name)
except FileNotFoundError:
return None

df_results = pd.DataFrame(index=None, columns=['d_F SM', 'd_F GM', 'runtime'])
for index, row in df_summary.iterrows():
if row['target'] == 'all' and row['fit method'] == 'k-graphs':
df_results.loc['From median set'] = ['-', rounder(row['min dis_k gi'], 3), '-']
if_uo = (int(row['mge num decrease order']) > 0 or int(row['mge num increase order']) > 0)
df_results.loc['Optimized'] = [rounder(row['dis_k SM'], 3),
rounder(row['dis_k GM'], 3) if if_uo else (rounder(row['dis_k GM'], 3) + '/same'),
rounder(row['time total'], 2)]
if row['target'] == 'all' and row['fit method'] == 'expert':
if_uo = (int(row['mge num decrease order']) > 0 or int(row['mge num increase order']) > 0)
df_results.loc['IAM: expert costs'] = [rounder(row['dis_k SM'], 3),
rounder(row['dis_k GM'], 3) if if_uo else (rounder(row['dis_k GM'], 3) + '/same'),
rounder(row['time total'], 2)]
# get results from random summary .csv.
random_fini = True
file_name = data_dir + 'summary_for_random_edit_costs.csv'
try:
df_random = pd.read_csv(file_name)
except FileNotFoundError:
random_fini = False

if random_fini:
for index, row in df_random.iterrows():
if row['measure'] == 'mean':
if_uo = (float(row['mge num decrease order']) > 0 or float(row['mge num increase order']) > 0)
df_results.loc['IAM: random costs'] = [rounder(row['dis_k SM'], 3),
rounder(row['dis_k GM'], 3) if if_uo else (rounder(row['dis_k GM'], 3) + '/same'),
rounder(row['time total'], 2)]
# sort index.
df_results = df_results.reindex([item for item in ['From median set', 'IAM: random costs', 'IAM: expert costs', 'Optimized'] if item in df_results.index])
return df_results

def get_results_of_one_xp(data_dir, ds_name, gkernel):
df_results = pd.DataFrame()
df_tmp_uo = None
if not os.path.isfile(data_dir + 'update_order/error.txt'):
df_tmp_uo = get_results(data_dir + 'update_order/', ds_name, gkernel)

df_tmp = None
if not os.path.isfile(data_dir + 'error.txt'):
df_tmp = get_results(data_dir, ds_name, gkernel)

if (df_tmp_uo is not None and not df_tmp_uo.empty) or (df_tmp is not None and not df_tmp.empty):
df_results = pd.DataFrame(index=['From median set', 'IAM: random costs', 'IAM: expert costs', 'Optimized'], columns=['$d_\mathcal{F}$ SM', '$d_\mathcal{F}$ SM (UO)', '$d_\mathcal{F}$ GM', '$d_\mathcal{F}$ GM (UO)', 'Runtime', 'Runtime (UO)'])
if df_tmp_uo is not None and not df_tmp_uo.empty:
for index, row in df_tmp_uo.iterrows():
for algo in df_results.index:
if index == algo:
df_results.at[algo, '$d_\mathcal{F}$ SM (UO)'] = row['d_F SM']
df_results.at[algo, '$d_\mathcal{F}$ GM (UO)'] = row['d_F GM']
df_results.at[algo, 'Runtime (UO)'] = row['runtime']
if df_tmp is not None and not df_tmp.empty:
for index, row in df_tmp.iterrows():
for algo in df_results.index:
if index == algo:
df_results.at[algo, '$d_\mathcal{F}$ SM'] = row['d_F SM']
df_results.at[algo, '$d_\mathcal{F}$ GM'] = row['d_F GM'].strip('/same')
df_results.at[algo, 'Runtime'] = row['runtime']
df_results = df_results.dropna(axis=0, how='all')
df_results = df_results.fillna(value='-')
df_results = df_results.reset_index().rename(columns={'index': 'Algorithms'})
return df_results


def get_results_for_all_experiments(root_dir):
columns=['Datasets', 'Graph Kernels', 'Algorithms', '$d_\mathcal{F}$ SM', '$d_\mathcal{F}$ SM (UO)', '$d_\mathcal{F}$ GM', '$d_\mathcal{F}$ GM (UO)', 'Runtime', 'Runtime (UO)']
df_symb = pd.DataFrame(columns=columns)
df_nonsymb = pd.DataFrame(columns=columns)
df_unlabeled = pd.DataFrame(columns=columns)
dir_list = [i for i in os.listdir(root_dir) if os.path.isdir(root_dir + i)]
for dir_name in dir_list:
sp_tmp = dir_name.split('.')
gkernel = sp_tmp[1]
		ds_name = sp_tmp[0].replace('[error]', '')  # drop the '[error]' marker; str.strip would remove characters, not the substring.
suffix = ''
if sp_tmp[-1] == 'unlabeled':
suffix = '_unlabeled'
elif sp_tmp[-1] == 'symb':
suffix = '_symb'
df_results = get_results_of_one_xp(root_dir + dir_name + '/', ds_name, gkernel)
if not df_results.empty:
ds_name += suffix
if ds_name in DS_SYMB:
for index, row in df_results.iterrows():
					df_symb.loc[len(df_symb)] = [ds_name.replace('_', '\\_'), gkernel] + row.tolist()
elif ds_name in DS_NON_SYMB:
for index, row in df_results.iterrows():
					df_nonsymb.loc[len(df_nonsymb)] = [ds_name.replace('_', '\\_'), gkernel] + row.tolist()
elif ds_name in DS_UNLABELED:
for index, row in df_results.iterrows():
					df_unlabeled.loc[len(df_unlabeled)] = [ds_name.replace('_', '\\_'), gkernel] + row.tolist()
else:
				raise Exception('Dataset "' + ds_name + '" is not pre-defined.')
# sort.
df_symb = beautify_df(df_symb)
df_nonsymb = beautify_df(df_nonsymb)
df_unlabeled = beautify_df(df_unlabeled)
# convert dfs to latex strings.
ltx_symb = df_to_latex_table(df_symb)
ltx_nonsymb = df_to_latex_table(df_nonsymb)
ltx_unlabeled = df_to_latex_table(df_unlabeled)
return ltx_symb, ltx_nonsymb, ltx_unlabeled


if __name__ == '__main__':
# root_dir = '../results/xp_median_preimage.init20/'
root_dir = '../../results/CRIANN/xp_median_preimage.init10/'
ltx_symb, ltx_nonsymb, ltx_unlabeled = get_results_for_all_experiments(root_dir)
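	# A minimal sketch of consuming the three returned LaTeX strings; the
	# output file names are hypothetical.
	for name, ltx in zip(['symb', 'nonsymb', 'unlabeled'],
	                     [ltx_symb, ltx_nonsymb, ltx_unlabeled]):
		with open(root_dir + 'tables_' + name + '.tex', 'w') as f:
			f.write(ltx)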

+ 3382
- 0
gklearn/preimage/experiments/xp_1nn_init10_trianglerule.py
File diff suppressed because it is too large
View File


+ 3085
- 0
gklearn/preimage/experiments/xp_remove_best_graph_init10.py
File diff suppressed because it is too large
View File


+ 418
- 0
gklearn/preimage/kernel_knn_cv.py View File

@@ -0,0 +1,418 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue May 12 12:52:15 2020

@author: ljia
"""
import numpy as np
import csv
import os
import os.path
from gklearn.utils import Dataset
from sklearn.model_selection import ShuffleSplit
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import normalize_gram_matrix, compute_distance_matrix
from gklearn.preimage.utils import get_same_item_indices, compute_k_dis
from gklearn.utils.knn import knn_classification

def kernel_knn_cv(ds_name, train_examples, knn_options, mpg_options, kernel_options, ged_options, mge_options, save_results=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False, cut_range=None):
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
dataset_all.trim_dataset(edge_required=edge_required)
if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels)
if cut_range is not None:
dataset_all.cut_graphs(cut_range)

if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = __init_output_file_knn(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
else:
fn_output_detail, fn_output_summary = None, None
# 2. compute/load Gram matrix a priori.
print('2. computing/loading Gram matrix...')
gram_matrix_unnorm, time_precompute_gm = __get_gram_matrix(load_gm, dir_save, ds_name, kernel_options, dataset_all)
# 3. perform k-nn CV.
print('3. performing k-nn CV...')
	if train_examples in ('k-graphs', 'expert', 'random'):
__kernel_knn_cv_median(dataset_all, ds_name, knn_options, mpg_options, kernel_options, mge_options, ged_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary)
elif train_examples == 'best-dataset':
__kernel_knn_cv_best_ds(dataset_all, ds_name, knn_options, kernel_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary)
elif train_examples == 'trainset':
__kernel_knn_cv_trainset(dataset_all, ds_name, knn_options, kernel_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary)

	print('\ncomplete.\n')


def __kernel_knn_cv_median(dataset_all, ds_name, knn_options, mpg_options, kernel_options, mge_options, ged_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary):
Gn = dataset_all.graphs
y_all = dataset_all.targets
n_neighbors, n_splits, test_size = knn_options['n_neighbors'], knn_options['n_splits'], knn_options['test_size']

# get shuffles.
train_indices, test_indices, train_nums, y_app = __get_shuffles(y_all, n_splits, test_size)
accuracies = [[], [], []]
for trial in range(len(train_indices)):
print('\ntrial =', trial)
train_index = train_indices[trial]
test_index = test_indices[trial]
G_app = [Gn[i] for i in train_index]
G_test = [Gn[i] for i in test_index]
y_test = [y_all[i] for i in test_index]
gm_unnorm_trial = gram_matrix_unnorm[train_index,:][:,train_index].copy()
# compute pre-images for each class.
medians = [[], [], []]
train_nums_tmp = [0] + train_nums
print('\ncomputing pre-image for each class...\n')
for i_class in range(len(train_nums_tmp) - 1):
print(i_class + 1, 'of', len(train_nums_tmp) - 1, 'classes:')
i_start = int(np.sum(train_nums_tmp[0:i_class + 1]))
i_end = i_start + train_nums_tmp[i_class + 1]
median_set = G_app[i_start:i_end]
dataset = dataset_all.copy()
dataset.load_graphs([g.copy() for g in median_set], targets=None)
mge_options['update_order'] = True
mpg_options['gram_matrix_unnorm'] = gm_unnorm_trial[i_start:i_end,i_start:i_end].copy()
mpg_options['runtime_precompute_gm'] = 0
set_median, gen_median_uo = __generate_median_preimages(dataset, mpg_options, kernel_options, ged_options, mge_options)
mge_options['update_order'] = False
mpg_options['gram_matrix_unnorm'] = gm_unnorm_trial[i_start:i_end,i_start:i_end].copy()
mpg_options['runtime_precompute_gm'] = 0
_, gen_median = __generate_median_preimages(dataset, mpg_options, kernel_options, ged_options, mge_options)
medians[0].append(set_median)
medians[1].append(gen_median)
medians[2].append(gen_median_uo)
# for each set of medians.
print('\nperforming k-nn...')
		for i_app, G_medians in enumerate(medians):  # 'G_medians' avoids shadowing the training graphs 'G_app' above.
			# compute dis_mat between medians.
			dataset = dataset_all.copy()
			dataset.load_graphs([g.copy() for g in G_medians], targets=None)
			gm_app_unnorm, _ = __compute_gram_matrix_unnorm(dataset, kernel_options.copy())
			# compute the entire Gram matrix.
			graph_kernel = __get_graph_kernel(dataset.copy(), kernel_options.copy())
			kernels_to_medians = []
			for g in G_medians:
				kernels_to_median, _ = graph_kernel.compute(g, G_test, **kernel_options.copy())
				kernels_to_medians.append(kernels_to_median)
			kernels_to_medians = np.array(kernels_to_medians)
			gm_all = np.concatenate((gm_app_unnorm, kernels_to_medians), axis=1)
			gm_all = np.concatenate((gm_all, np.concatenate((kernels_to_medians.T, gram_matrix_unnorm[test_index,:][:,test_index].copy()), axis=1)), axis=0)
			gm_all = normalize_gram_matrix(gm_all.copy())
			dis_mat, _, _, _ = compute_distance_matrix(gm_all)
			N = len(G_medians)
d_app = dis_mat[range(N),:][:,range(N)].copy()
d_test = np.zeros((N, len(test_index)))
for i in range(N):
for j in range(len(test_index)):
					d_test[i, j] = dis_mat[i, N + j]  # test graphs occupy the columns from N onwards in gm_all.
accuracies[i_app].append(knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=True, text=train_examples))
# write result detail.
if save_results:
f_detail = open(dir_save + fn_output_detail, 'a')
print('writing results to files...')
for i, median_type in enumerate(['set-median', 'gen median', 'gen median uo']):
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
train_examples + ': ' + median_type, trial,
knn_options['n_neighbors'],
len(gm_all), knn_options['test_size'],
accuracies[i][-1][0], accuracies[i][-1][1]])
f_detail.close()
results = {}
results['ave_perf_train'] = [np.mean([i[0] for i in j], axis=0) for j in accuracies]
results['std_perf_train'] = [np.std([i[0] for i in j], axis=0, ddof=1) for j in accuracies]
results['ave_perf_test'] = [np.mean([i[1] for i in j], axis=0) for j in accuracies]
results['std_perf_test'] = [np.std([i[1] for i in j], axis=0, ddof=1) for j in accuracies]

	# write result summary.
if save_results:
f_summary = open(dir_save + fn_output_summary, 'a')
for i, median_type in enumerate(['set-median', 'gen median', 'gen median uo']):
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
train_examples + ': ' + median_type,
knn_options['n_neighbors'],
knn_options['test_size'], results['ave_perf_train'][i],
results['ave_perf_test'][i], results['std_perf_train'][i],
results['std_perf_test'][i], time_precompute_gm])
		f_summary.close()


def __kernel_knn_cv_best_ds(dataset_all, ds_name, knn_options, kernel_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary):
Gn = dataset_all.graphs
y_all = dataset_all.targets
n_neighbors, n_splits, test_size = knn_options['n_neighbors'], knn_options['n_splits'], knn_options['test_size']

# get shuffles.
train_indices, test_indices, train_nums, y_app = __get_shuffles(y_all, n_splits, test_size)
accuracies = []
for trial in range(len(train_indices)):
print('\ntrial =', trial)
train_index = train_indices[trial]
test_index = test_indices[trial]
G_app = [Gn[i] for i in train_index]
G_test = [Gn[i] for i in test_index]
y_test = [y_all[i] for i in test_index]
gm_unnorm_trial = gram_matrix_unnorm[train_index,:][:,train_index].copy()
# get best graph from trainset according to distance in kernel space for each class.
best_graphs = []
train_nums_tmp = [0] + train_nums
print('\ngetting best graph from trainset for each class...')
for i_class in range(len(train_nums_tmp) - 1):
print(i_class + 1, 'of', len(train_nums_tmp) - 1, 'classes.')
i_start = int(np.sum(train_nums_tmp[0:i_class + 1]))
i_end = i_start + train_nums_tmp[i_class + 1]
G_class = G_app[i_start:i_end]
gm_unnorm_class = gm_unnorm_trial[i_start:i_end,i_start:i_end]
gm_class = normalize_gram_matrix(gm_unnorm_class.copy())
k_dis_list = []
for idx in range(len(G_class)):
k_dis_list.append(compute_k_dis(idx, range(0, len(G_class)), [1 / len(G_class)] * len(G_class), gm_class, withterm3=False))
idx_k_dis_min = np.argmin(k_dis_list)
best_graphs.append(G_class[idx_k_dis_min].copy())
# perform k-nn.
print('\nperforming k-nn...')
# compute dis_mat between medians.
dataset = dataset_all.copy()
dataset.load_graphs([g.copy() for g in best_graphs], targets=None)
gm_app_unnorm, _ = __compute_gram_matrix_unnorm(dataset, kernel_options.copy())
# compute the entire Gram matrix.
graph_kernel = __get_graph_kernel(dataset.copy(), kernel_options.copy())
kernels_to_best_graphs = []
for g in best_graphs:
kernels_to_best_graph, _ = graph_kernel.compute(g, G_test, **kernel_options.copy())
kernels_to_best_graphs.append(kernels_to_best_graph)
kernels_to_best_graphs = np.array(kernels_to_best_graphs)
gm_all = np.concatenate((gm_app_unnorm, kernels_to_best_graphs), axis=1)
gm_all = np.concatenate((gm_all, np.concatenate((kernels_to_best_graphs.T, gram_matrix_unnorm[test_index,:][:,test_index].copy()), axis=1)), axis=0)
gm_all = normalize_gram_matrix(gm_all.copy())
dis_mat, _, _, _ = compute_distance_matrix(gm_all)
N = len(best_graphs)
d_app = dis_mat[range(N),:][:,range(N)].copy()
d_test = np.zeros((N, len(test_index)))
for i in range(N):
for j in range(len(test_index)):
				d_test[i, j] = dis_mat[i, N + j]  # test graphs occupy the columns from N onwards in gm_all.
accuracies.append(knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=True, text=train_examples))
# write result detail.
if save_results:
f_detail = open(dir_save + fn_output_detail, 'a')
print('writing results to files...')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
train_examples, trial,
knn_options['n_neighbors'],
len(gm_all), knn_options['test_size'],
accuracies[-1][0], accuracies[-1][1]])
f_detail.close()
results = {}
results['ave_perf_train'] = np.mean([i[0] for i in accuracies], axis=0)
results['std_perf_train'] = np.std([i[0] for i in accuracies], axis=0, ddof=1)
results['ave_perf_test'] = np.mean([i[1] for i in accuracies], axis=0)
results['std_perf_test'] = np.std([i[1] for i in accuracies], axis=0, ddof=1)
	# write result summary.
if save_results:
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
train_examples,
knn_options['n_neighbors'],
knn_options['test_size'], results['ave_perf_train'],
results['ave_perf_test'], results['std_perf_train'],
results['std_perf_test'], time_precompute_gm])
		f_summary.close()


def __kernel_knn_cv_trainset(dataset_all, ds_name, knn_options, kernel_options, gram_matrix_unnorm, time_precompute_gm, train_examples, save_results, dir_save, fn_output_detail, fn_output_summary):
y_all = dataset_all.targets
n_neighbors, n_splits, test_size = knn_options['n_neighbors'], knn_options['n_splits'], knn_options['test_size']
# compute distance matrix.
gram_matrix = normalize_gram_matrix(gram_matrix_unnorm.copy())
dis_mat, _, _, _ = compute_distance_matrix(gram_matrix)

# get shuffles.
train_indices, test_indices, _, _ = __get_shuffles(y_all, n_splits, test_size)
accuracies = []
for trial in range(len(train_indices)):
print('\ntrial =', trial)
train_index = train_indices[trial]
test_index = test_indices[trial]
y_app = [y_all[i] for i in train_index]
y_test = [y_all[i] for i in test_index]
N = len(train_index)
d_app = dis_mat[train_index,:][:,train_index].copy()
d_test = np.zeros((N, len(test_index)))
for i in range(N):
for j in range(len(test_index)):
d_test[i, j] = dis_mat[train_index[i], test_index[j]]
accuracies.append(knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=True, text=train_examples))
# write result detail.
if save_results:
print('writing results to files...')
f_detail = open(dir_save + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
train_examples, trial, knn_options['n_neighbors'],
len(gram_matrix), knn_options['test_size'],
accuracies[-1][0], accuracies[-1][1]])
f_detail.close()
results = {}
results['ave_perf_train'] = np.mean([i[0] for i in accuracies], axis=0)
results['std_perf_train'] = np.std([i[0] for i in accuracies], axis=0, ddof=1)
results['ave_perf_test'] = np.mean([i[1] for i in accuracies], axis=0)
results['std_perf_test'] = np.std([i[1] for i in accuracies], axis=0, ddof=1)

	# write result summary.
if save_results:
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
train_examples, knn_options['n_neighbors'],
knn_options['test_size'], results['ave_perf_train'],
results['ave_perf_test'], results['std_perf_train'],
results['std_perf_test'], time_precompute_gm])
		f_summary.close()


def __get_shuffles(y_all, n_splits, test_size):
rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
train_indices = [[] for _ in range(n_splits)]
test_indices = [[] for _ in range(n_splits)]
idx_targets = get_same_item_indices(y_all)
train_nums = []
keys = []
for key, item in idx_targets.items():
i = 0
for train_i, test_i in rs.split(item): # @todo: careful when parallel.
train_indices[i] += [item[idx] for idx in train_i]
test_indices[i] += [item[idx] for idx in test_i]
i += 1
train_nums.append(len(train_i))
keys.append(key)
return train_indices, test_indices, train_nums, keys
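

# A minimal self-check of the stratified behaviour above, using synthetic
# targets: each class is split separately by the same ShuffleSplit, so
# every shuffle preserves the class proportions of y_all. This helper is
# hypothetical and purely illustrative.
def _demo_get_shuffles():
	y_demo = [0, 0, 0, 0, 1, 1, 1, 1]
	train_idx, test_idx, train_nums, keys = __get_shuffles(y_demo, n_splits=2, test_size=0.25)
	assert train_nums == [3, 3] and sorted(keys) == [0, 1]
	for tr, te in zip(train_idx, test_idx):
		assert len(tr) == 6 and len(te) == 2
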
def __generate_median_preimages(dataset, mpg_options, kernel_options, ged_options, mge_options):
mpg = MedianPreimageGenerator()
mpg.dataset = dataset.copy()
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()
mpg.run()
return mpg.set_median, mpg.gen_median


def __get_gram_matrix(load_gm, dir_save, ds_name, kernel_options, dataset_all):
if load_gm == 'auto':
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
if gmfile_exist:
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
gram_matrix_unnorm = gmfile['gram_matrix_unnorm']
time_precompute_gm = float(gmfile['run_time'])
else:
gram_matrix_unnorm, time_precompute_gm = __compute_gram_matrix_unnorm(dataset_all, kernel_options)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm=gram_matrix_unnorm, run_time=time_precompute_gm)
elif not load_gm:
gram_matrix_unnorm, time_precompute_gm = __compute_gram_matrix_unnorm(dataset_all, kernel_options)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm=gram_matrix_unnorm, run_time=time_precompute_gm)
else:
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile = np.load(gm_fname, allow_pickle=True)
gram_matrix_unnorm = gmfile['gram_matrix_unnorm']
time_precompute_gm = float(gmfile['run_time'])
return gram_matrix_unnorm, time_precompute_gm
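
# The caching contract above, in brief: load_gm='auto' reuses
# dir_save + 'gram_matrix_unnorm.<ds_name>.<kernel>.gm.npz' when the file
# exists and recomputes (then saves) otherwise; load_gm=False always
# recomputes and overwrites; any other truthy value loads unconditionally.
# Reading such a cache by hand (the file name is hypothetical):
#
#	cache = np.load('gram_matrix_unnorm.MUTAG.ShortestPath.gm.npz', allow_pickle=True)
#	gm, t = cache['gram_matrix_unnorm'], float(cache['run_time'])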


def __get_graph_kernel(dataset, kernel_options):
from gklearn.utils.utils import get_graph_kernel_by_name
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
	return graph_kernel


def __compute_gram_matrix_unnorm(dataset, kernel_options):
from gklearn.utils.utils import get_graph_kernel_by_name
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
	return gram_matrix_unnorm, run_time


def __init_output_file_knn(ds_name, gkernel, fit_method, dir_output):
if not os.path.exists(dir_output):
os.makedirs(dir_output)
fn_output_detail = 'results_detail_knn.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel',
'train examples', 'trial', 'num neighbors', 'num graphs', 'test size',
'perf train', 'perf test'])
f_detail.close()
fn_output_summary = 'results_summary_knn.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel',
'train examples', 'num neighbors', 'test size',
'ave perf train', 'ave perf test',
'std perf train', 'std perf test', 'time precompute gm'])
f_summary.close()
return fn_output_detail, fn_output_summary
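

# A hypothetical invocation sketch: every option value below is an
# illustrative assumption (dataset and kernel names included), not a
# configuration tested in this commit; extra kernel-specific options may
# be required in kernel_options in practice.
if __name__ == '__main__':
	kernel_knn_cv('MUTAG', 'trainset',
	              knn_options={'n_neighbors': 1, 'n_splits': 10, 'test_size': 0.1},
	              mpg_options={'fit_method': 'k-graphs'},
	              kernel_options={'name': 'ShortestPath'},
	              ged_options={}, mge_options={},
	              save_results=False, load_gm=False, dir_save='')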

+ 256
- 41
gklearn/preimage/median_preimage_generator.py View File

@@ -39,6 +39,8 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__max_itrs_without_update = 3 self.__max_itrs_without_update = 3
self.__epsilon_residual = 0.01 self.__epsilon_residual = 0.01
self.__epsilon_ec = 0.1 self.__epsilon_ec = 0.1
self.__allow_zeros = False
self.__triangle_rule = True
# values to compute. # values to compute.
self.__runtime_optimize_ec = None self.__runtime_optimize_ec = None
self.__runtime_generate_preimage = None self.__runtime_generate_preimage = None
@@ -79,6 +81,8 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1) self.__epsilon_ec = kwargs.get('epsilon_ec', 0.1)
self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None) self.__gram_matrix_unnorm = kwargs.get('gram_matrix_unnorm', None)
self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None) self.__runtime_precompute_gm = kwargs.get('runtime_precompute_gm', None)
self.__allow_zeros = kwargs.get('allow_zeros', False)
self.__triangle_rule = kwargs.get('triangle_rule', True)
def run(self): def run(self):
@@ -277,7 +281,7 @@ class MedianPreimageGenerator(PreimageGenerator):
options['edge_labels'] = self._dataset.edge_labels options['edge_labels'] = self._dataset.edge_labels
options['node_attrs'] = self._dataset.node_attrs options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs options['edge_attrs'] = self._dataset.edge_attrs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0] time_list = [time.time() - time0]
edit_cost_list = [self.__init_ecc] edit_cost_list = [self.__init_ecc]
@@ -319,7 +323,7 @@ class MedianPreimageGenerator(PreimageGenerator):
options['edge_labels'] = self._dataset.edge_labels options['edge_labels'] = self._dataset.edge_labels
options['node_attrs'] = self._dataset.node_attrs options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs options['edge_attrs'] = self._dataset.edge_attrs
ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel, verbose=(self._verbose > 1))
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0) time_list.append(time.time() - time0)
edit_cost_list.append(self.__edit_cost_constants) edit_cost_list.append(self.__edit_cost_constants)
@@ -382,7 +386,8 @@ class MedianPreimageGenerator(PreimageGenerator):


def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'): def __update_ecc(self, nb_cost_mat, dis_k_vec, rw_constraints='inequality'):
# if self.__ds_name == 'Letter-high': # if self.__ds_name == 'Letter-high':
if self.__ged_options['edit_cost'] == 'LETTER':
if self.__ged_options['edit_cost'] == 'LETTER':
raise Exception('Cannot compute for cost "LETTER".')
pass pass
# # method 1: set alpha automatically, just tune c_vir and c_eir by # # method 1: set alpha automatically, just tune c_vir and c_eir by
# # LMS using cvxpy. # # LMS using cvxpy.
@@ -438,7 +443,7 @@ class MedianPreimageGenerator(PreimageGenerator):
# # 1. if c_vi != c_vr, c_ei != c_er. # # 1. if c_vi != c_vr, c_ei != c_er.
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
# x = cp.Variable(nb_cost_mat_new.shape[1]) # x = cp.Variable(nb_cost_mat_new.shape[1])
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
# cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
## # 1.1 no constraints. ## # 1.1 no constraints.
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
# # 1.2 c_vs <= c_vi + c_vr. # # 1.2 c_vs <= c_vi + c_vr.
@@ -449,7 +454,7 @@ class MedianPreimageGenerator(PreimageGenerator):
## nb_cost_mat_new[:,0] += nb_cost_mat[:,1] ## nb_cost_mat_new[:,0] += nb_cost_mat[:,1]
## nb_cost_mat_new[:,2] += nb_cost_mat[:,5] ## nb_cost_mat_new[:,2] += nb_cost_mat[:,5]
## x = cp.Variable(nb_cost_mat_new.shape[1]) ## x = cp.Variable(nb_cost_mat_new.shape[1])
## cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
## cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
## # 2.1 no constraints. ## # 2.1 no constraints.
## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]] ## constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])]]
### # 2.2 c_vs <= c_vi + c_vr. ### # 2.2 c_vs <= c_vi + c_vr.
@@ -461,35 +466,37 @@ class MedianPreimageGenerator(PreimageGenerator):
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
# edit_costs_new = np.array(edit_costs_new) # edit_costs_new = np.array(edit_costs_new)
# residual = np.sqrt(prob.value) # residual = np.sqrt(prob.value)
if rw_constraints == 'inequality':
# c_vs <= c_vi + c_vr.
if not self.__triangle_rule and self.__allow_zeros:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob) self.__execute_cvx(prob)
edit_costs_new = x.value edit_costs_new = x.value
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif rw_constraints == '2constraints':
# c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
elif self.__triangle_rule and self.__allow_zeros:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01,
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
self.__execute_cvx(prob)
edit_costs_new = x.value edit_costs_new = x.value
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif rw_constraints == 'no-constraint':
# no constraint.
elif not self.__triangle_rule and not self.__allow_zeros:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve() prob.solve()
@@ -499,7 +506,7 @@ class MedianPreimageGenerator(PreimageGenerator):
# # c_vs <= c_vi + c_vr. # # c_vs <= c_vi + c_vr.
# nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] # nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
# x = cp.Variable(nb_cost_mat_new.shape[1]) # x = cp.Variable(nb_cost_mat_new.shape[1])
# cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
# cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
# prob = cp.Problem(cp.Minimize(cost_fun), constraints) # prob = cp.Problem(cp.Minimize(cost_fun), constraints)
@@ -508,15 +515,40 @@ class MedianPreimageGenerator(PreimageGenerator):
# edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]] # edit_costs_new = [x.value[0], x.value[0], x.value[1], x.value[2], x.value[2]]
# edit_costs_new = np.array(edit_costs_new) # edit_costs_new = np.array(edit_costs_new)
# residual = np.sqrt(prob.value) # residual = np.sqrt(prob.value)
elif self.__triangle_rule and not self.__allow_zeros:
# c_vs <= c_vi + c_vr.
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif rw_constraints == '2constraints': # @todo: rearrange it later.
# c_vs <= c_vi + c_vr and c_vi == c_vr, c_ei == c_er.
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0,
np.array([1.0, -1.0, 0.0, 0.0, 0.0]).T@x == 0.0,
np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new = x.value
residual = np.sqrt(prob.value)

elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC': elif self.__ged_options['edit_cost'] == 'NON_SYMBOLIC':
is_n_attr = np.count_nonzero(nb_cost_mat[:,2]) is_n_attr = np.count_nonzero(nb_cost_mat[:,2])
is_e_attr = np.count_nonzero(nb_cost_mat[:,5]) is_e_attr = np.count_nonzero(nb_cost_mat[:,5])
if self.__ds_name == 'SYNTHETICnew':
			if self.__ds_name == 'SYNTHETICnew': # @todo: rearrange this later.
# nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] # nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
nb_cost_mat_new = nb_cost_mat[:,[2,3,4]] nb_cost_mat_new = nb_cost_mat[:,[2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
# constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])], # constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
# np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0] # np.array([0.0, 0.0, 0.0, 1.0, -1.0]).T@x == 0.0]
# constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]] # constraints = [x >= [0.0001 for i in range(nb_cost_mat_new.shape[1])]]
@@ -529,12 +561,154 @@ class MedianPreimageGenerator(PreimageGenerator):
np.array([0.0]))) np.array([0.0])))
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif rw_constraints == 'inequality':
elif not self.__triangle_rule and self.__allow_zeros:
if is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value)
elif not is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
residual = np.sqrt(prob.value)
else:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value)
elif self.__triangle_rule and self.__allow_zeros:
if is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0]).T@x >= 0.01,
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value)
elif not is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
residual = np.sqrt(prob.value)
else:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value)
elif not self.__triangle_rule and not self.__allow_zeros:
if is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value)
elif not is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
residual = np.sqrt(prob.value)
else:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value)
elif self.__triangle_rule and not self.__allow_zeros:
# c_vs <= c_vi + c_vr. # c_vs <= c_vi + c_vr.
if is_n_attr and is_e_attr: if is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
@@ -545,7 +719,7 @@ class MedianPreimageGenerator(PreimageGenerator):
elif is_n_attr and not is_e_attr: elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
@@ -555,7 +729,7 @@ class MedianPreimageGenerator(PreimageGenerator):
elif not is_n_attr and is_e_attr: elif not is_n_attr and is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
@@ -565,24 +739,61 @@ class MedianPreimageGenerator(PreimageGenerator):
else: else:
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat_new @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob) self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0]))) x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)

elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled. elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled.
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
if not self.__triangle_rule and self.__allow_zeros:
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif self.__triangle_rule and self.__allow_zeros:
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 0.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 1.0, 0.0, 0.0]).T@x >= 0.01,
np.array([0.0, 0.0, 0.0, 0.0, 1.0, 0.0]).T@x >= 0.01,
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif not self.__triangle_rule and not self.__allow_zeros:
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
elif self.__triangle_rule and not self.__allow_zeros:
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
else: else:
			raise Exception('The edit cost "' + self.__ged_options['edit_cost'] + '" is not supported for the update process.')
# # method 1: simple least square method. # # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
# rcond=None) # rcond=None)
@@ -607,7 +818,7 @@ class MedianPreimageGenerator(PreimageGenerator):
# G = -1 * np.identity(nb_cost_mat.shape[1]) # G = -1 * np.identity(nb_cost_mat.shape[1])
# h = np.array([0 for i in range(nb_cost_mat.shape[1])]) # h = np.array([0 for i in range(nb_cost_mat.shape[1])])
x = cp.Variable(nb_cost_mat.shape[1]) x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])], constraints = [x >= [0.0 for i in range(nb_cost_mat.shape[1])],
# np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] # np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
@@ -669,6 +880,7 @@ class MedianPreimageGenerator(PreimageGenerator):
options = self.__mge_options.copy() options = self.__mge_options.copy()
if not 'seed' in options: if not 'seed' in options:
options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage. options['seed'] = int(round(time.time() * 1000)) # @todo: may not work correctly for possible parallel usage.
options['parallel'] = self.__parallel
# Select the GED algorithm. # Select the GED algorithm.
self.__mge.set_options(mge_options_to_string(options)) self.__mge.set_options(mge_options_to_string(options))
@@ -676,8 +888,11 @@ class MedianPreimageGenerator(PreimageGenerator):
edge_labels=self._dataset.edge_labels, edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs, node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs) edge_attrs=self._dataset.edge_attrs)
self.__mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
self.__mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
ged_options = self.__ged_options.copy()
if self.__parallel:
ged_options['threads'] = 1
self.__mge.set_init_method(ged_options['method'], ged_options_to_string(ged_options))
self.__mge.set_descent_method(ged_options['method'], ged_options_to_string(ged_options))
# Run the estimator. # Run the estimator.
self.__mge.run(graph_ids, set_median_id, gen_median_id) self.__mge.run(graph_ids, set_median_id, gen_median_id)
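
# Every branch of the cost update refactored above solves the same
# constrained least-squares problem: find edit costs x minimizing
# ||nb_cost_mat @ x - dis_k_vec||_2, with lower bounds on the costs and,
# when the triangle rule is enforced, c_vs <= c_vi + c_vr (and its edge
# analogue) as linear constraints. A self-contained sketch with synthetic
# data for the six CONSTANT-style costs:
import cvxpy as cp
import numpy as np

np.random.seed(0)
nb_cost_mat = np.random.randint(0, 5, size=(20, 6)).astype(float)  # counts of each edit operation per graph pair
dis_k_vec = np.random.rand(20) * 10.0  # target kernel distances
x = cp.Variable(6)
cost_fun = cp.sum_squares(nb_cost_mat @ x - dis_k_vec)
constraints = [x >= 0.01,  # strictly positive costs (allow_zeros=False)
               np.array([1., 1., -1., 0., 0., 0.]) @ x >= 0.,  # node triangle rule
               np.array([0., 0., 0., 1., 1., -1.]) @ x >= 0.]  # edge triangle rule
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
edit_costs_new, residual = x.value, np.sqrt(prob.value)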


+ 423
- 0
gklearn/preimage/remove_best_graph.py View File

@@ -0,0 +1,423 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 27 14:27:15 2020

@author: ljia
"""
import numpy as np
import csv
import os
import os.path
from gklearn.utils import Dataset
from gklearn.preimage import MedianPreimageGenerator
from gklearn.utils import normalize_gram_matrix
from gklearn.utils import split_dataset_by_target
from gklearn.preimage.utils import compute_k_dis
from gklearn.utils.graphfiles import saveGXL
import networkx as nx

def remove_best_graph(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, load_gm='auto', dir_save='', irrelevant_labels=None, edge_required=False, cut_range=None):
"""Remove the best graph from the median set w.r.t. distance in kernel space, and to see if it is possible to generate the removed graph using the graphs left in the median set.
"""
# 1. get dataset.
print('1. getting dataset...')
dataset_all = Dataset()
dataset_all.load_predefined_dataset(ds_name)
dataset_all.trim_dataset(edge_required=edge_required)
if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels)
if cut_range is not None:
dataset_all.cut_graphs(cut_range)
datasets = split_dataset_by_target(dataset_all)

if save_results:
# create result files.
print('creating output files...')
fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
else:
fn_output_detail, fn_output_summary = None, None
# 2. compute/load Gram matrix a priori.
print('2. computing/loading Gram matrix...')
gram_matrix_unnorm_list, time_precompute_gm_list = __get_gram_matrix(load_gm, dir_save, ds_name, kernel_options, datasets)
sod_sm_list = []
sod_gm_list = []
dis_k_sm_list = []
dis_k_gm_list = []
dis_k_gi_min_list = []
time_optimize_ec_list = []
time_generate_list = []
time_total_list = []
itrs_list = []
converged_list = []
num_updates_ecc_list = []
mge_decrease_order_list = []
mge_increase_order_list = []
mge_converged_order_list = []
nb_sod_sm2gm = [0, 0, 0]
nb_dis_k_sm2gm = [0, 0, 0]
nb_dis_k_gi2sm = [0, 0, 0]
nb_dis_k_gi2gm = [0, 0, 0]
dis_k_max_list = []
dis_k_min_list = []
dis_k_mean_list = []
best_dis_list = []
print('starting experiment for each class of target...')
idx_offset = 0
for idx, dataset in enumerate(datasets):
target = dataset.targets[0]
print('\ntarget =', target, '\n')
# if target != 1:
# continue
num_graphs = len(dataset.graphs)
if num_graphs < 2:
print('\nnumber of graphs = ', num_graphs, ', skip.\n')
idx_offset += 1
continue

# 3. get the best graph and remove it from median set.
print('3. getting and removing the best graph...')
gram_matrix_unnorm = gram_matrix_unnorm_list[idx - idx_offset]
best_index, best_dis, best_graph = __get_best_graph([g.copy() for g in dataset.graphs], normalize_gram_matrix(gram_matrix_unnorm.copy()))
median_set_new = [dataset.graphs[i] for i in range(len(dataset.graphs)) if i != best_index]
num_graphs -= 1
if num_graphs == 1:
continue
best_dis_list.append(best_dis)
dataset.load_graphs(median_set_new, targets=None)
gram_matrix_unnorm_new = np.delete(gram_matrix_unnorm, best_index, axis=0)
gram_matrix_unnorm_new = np.delete(gram_matrix_unnorm_new, best_index, axis=1)
# 4. set parameters.
print('4. initializing mpg and setting parameters...')
mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_new
mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx - idx_offset]
mpg = MedianPreimageGenerator()
mpg.dataset = dataset
mpg.set_options(**mpg_options.copy())
mpg.kernel_options = kernel_options.copy()
mpg.ged_options = ged_options.copy()
mpg.mge_options = mge_options.copy()

# 5. compute median preimage.
print('5. computing median preimage...')
mpg.run()
results = mpg.get_results()
# 6. compute pairwise kernel distances.
print('6. computing pairwise kernel distances...')
_, dis_k_max, dis_k_min, dis_k_mean = mpg.graph_kernel.compute_distance_matrix()
dis_k_max_list.append(dis_k_max)
dis_k_min_list.append(dis_k_min)
dis_k_mean_list.append(dis_k_mean)
# 7. save results (and median graphs).
print('7. saving results (and median graphs)...')
# write result detail.
if save_results:
print('writing results to files...')
sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))

f_detail = open(dir_save + fn_output_detail, 'a')
csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target, 1,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], best_dis, best_index,
sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'],
results['mge']['num_decrease_order'] > 0, # @todo: not suitable for multi-start mge
results['mge']['num_increase_order'] > 0,
results['mge']['num_converged_descents'] > 0])
f_detail.close()
# compute result summary.
sod_sm_list.append(results['sod_set_median'])
sod_gm_list.append(results['sod_gen_median'])
dis_k_sm_list.append(results['k_dis_set_median'])
dis_k_gm_list.append(results['k_dis_gen_median'])
dis_k_gi_min_list.append(results['k_dis_dataset'])
time_precompute_gm_list.append(results['runtime_precompute_gm'])
time_optimize_ec_list.append(results['runtime_optimize_ec'])
time_generate_list.append(results['runtime_generate_preimage'])
time_total_list.append(results['runtime_total'])
itrs_list.append(results['itrs'])
converged_list.append(results['converged'])
num_updates_ecc_list.append(results['num_updates_ecc'])
mge_decrease_order_list.append(results['mge']['num_decrease_order'] > 0)
mge_increase_order_list.append(results['mge']['num_increase_order'] > 0)
mge_converged_order_list.append(results['mge']['num_converged_descents'] > 0)
# # SOD SM -> GM
if results['sod_set_median'] > results['sod_gen_median']:
nb_sod_sm2gm[0] += 1
# repeats_better_sod_sm2gm.append(1)
elif results['sod_set_median'] == results['sod_gen_median']:
nb_sod_sm2gm[1] += 1
elif results['sod_set_median'] < results['sod_gen_median']:
nb_sod_sm2gm[2] += 1
# # dis_k SM -> GM
if results['k_dis_set_median'] > results['k_dis_gen_median']:
nb_dis_k_sm2gm[0] += 1
# repeats_better_dis_k_sm2gm.append(1)
elif results['k_dis_set_median'] == results['k_dis_gen_median']:
nb_dis_k_sm2gm[1] += 1
elif results['k_dis_set_median'] < results['k_dis_gen_median']:
nb_dis_k_sm2gm[2] += 1
# # dis_k gi -> SM
if results['k_dis_dataset'] > results['k_dis_set_median']:
nb_dis_k_gi2sm[0] += 1
# repeats_better_dis_k_gi2sm.append(1)
elif results['k_dis_dataset'] == results['k_dis_set_median']:
nb_dis_k_gi2sm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_set_median']:
nb_dis_k_gi2sm[2] += 1
# # dis_k gi -> GM
if results['k_dis_dataset'] > results['k_dis_gen_median']:
nb_dis_k_gi2gm[0] += 1
# repeats_better_dis_k_gi2gm.append(1)
elif results['k_dis_dataset'] == results['k_dis_gen_median']:
nb_dis_k_gi2gm[1] += 1
elif results['k_dis_dataset'] < results['k_dis_gen_median']:
nb_dis_k_gi2gm[2] += 1

			# write result summary for each target.
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, target,
results['sod_set_median'], results['sod_gen_median'],
results['k_dis_set_median'], results['k_dis_gen_median'],
results['k_dis_dataset'], best_dis, best_index,
sod_sm2gm, dis_k_sm2gm,
dis_k_gi2sm, dis_k_gi2gm,
results['runtime_precompute_gm'], results['runtime_optimize_ec'],
results['runtime_generate_preimage'], results['runtime_total'],
results['itrs'], results['converged'],
results['num_updates_ecc'],
results['mge']['num_decrease_order'] > 0, # @todo: not suitable for multi-start mge
results['mge']['num_increase_order'] > 0,
results['mge']['num_converged_descents'] > 0,
nb_sod_sm2gm,
nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
f_summary.close()
# save median graphs.
if save_medians:
if not os.path.exists(dir_save + 'medians/'):
os.makedirs(dir_save + 'medians/')
print('Saving median graphs to files...')
fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(best_graph, fn_best_dataset + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
fn_best_median_set = dir_save + 'medians/g_best_median_set.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
saveGXL(mpg.best_from_dataset, fn_best_median_set + '.gxl', method='default',
node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
# plot median graphs.
if plot_medians and save_medians:
			if ged_options['edit_cost'] in ('LETTER', 'LETTER2') or ds_name in ('Letter-high', 'Letter-med', 'Letter-low'):
draw_Letter_graph(mpg.set_median, fn_pre_sm)
draw_Letter_graph(mpg.gen_median, fn_pre_gm)
draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

	# write overall result summary.
if save_results:
sod_sm_mean = np.mean(sod_sm_list)
sod_gm_mean = np.mean(sod_gm_list)
dis_k_sm_mean = np.mean(dis_k_sm_list)
dis_k_gm_mean = np.mean(dis_k_gm_list)
dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
best_dis_mean = np.mean(best_dis_list)
time_precompute_gm_mean = np.mean(time_precompute_gm_list)
time_optimize_ec_mean = np.mean(time_optimize_ec_list)
time_generate_mean = np.mean(time_generate_list)
time_total_mean = np.mean(time_total_list)
itrs_mean = np.mean(itrs_list)
num_converged = np.sum(converged_list)
num_updates_ecc_mean = np.mean(num_updates_ecc_list)
num_mge_decrease_order = np.sum(mge_decrease_order_list)
num_mge_increase_order = np.sum(mge_increase_order_list)
num_mge_converged = np.sum(mge_converged_order_list)
sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
f_summary = open(dir_save + fn_output_summary, 'a')
csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
ged_options['edit_cost'], ged_options['method'],
ged_options['attr_distance'], mpg_options['fit_method'],
num_graphs, 'all',
sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
dis_k_gi_min_mean, best_dis_mean, '-',
sod_sm2gm_mean, dis_k_sm2gm_mean,
dis_k_gi2sm_mean, dis_k_gi2gm_mean,
time_precompute_gm_mean, time_optimize_ec_mean,
time_generate_mean, time_total_mean, itrs_mean,
num_converged, num_updates_ecc_mean,
num_mge_decrease_order, num_mge_increase_order,
num_mge_converged])
f_summary.close()
# save total pairwise kernel distances.
dis_k_max = np.max(dis_k_max_list)
dis_k_min = np.min(dis_k_min_list)
dis_k_mean = np.mean(dis_k_mean_list)
print('The maximum pairwise distance in kernel space:', dis_k_max)
print('The minimum pairwise distance in kernel space:', dis_k_min)
print('The average pairwise distance in kernel space:', dis_k_mean)

print('\ncomplete.\n')


def __get_best_graph(Gn, gram_matrix):
k_dis_list = []
for idx in range(len(Gn)):
k_dis_list.append(compute_k_dis(idx, range(0, len(Gn)), [1 / len(Gn)] * len(Gn), gram_matrix, withterm3=False))
best_index = np.argmin(k_dis_list)
best_dis = k_dis_list[best_index]
best_graph = Gn[best_index].copy()
return best_index, best_dis, best_graph
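# A hedged sketch (not the library call) of the quantity compute_k_dis
# evaluates above, assuming gram_matrix[i, j] = k(G_i, G_j): the kernel-space
# distance between graph idx and the uniform mean of all graphs. With
# withterm3=False the constant term (1/n^2) * sum_{j,k} K[j, k] is dropped,
# which leaves the argmin over idx unchanged. np is numpy, as imported at the
# top of this module.
def _k_dis_to_mean_sketch(idx, gram_matrix):
    term1 = gram_matrix[idx, idx]               # k(G_idx, G_idx)
    term2 = 2.0 * np.mean(gram_matrix[idx, :])  # (2/n) * sum_j k(G_idx, G_j)
    return np.sqrt(max(term1 - term2, 0))       # clamp round-off negatives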


def get_relations(sign):
if sign == -1:
return 'better'
elif sign == 0:
return 'same'
elif sign == 1:
return 'worse'
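# Example: get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
# yields 'better' when the generalized median improves on the set median,
# matching how the sod_sm2gm / dis_k_* summary fields above are filled.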


def __get_gram_matrix(load_gm, dir_save, ds_name, kernel_options, datasets):
if load_gm == 'auto':
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
if gmfile_exist:
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']]
time_precompute_gm_list = gmfile['run_time_list'].tolist()
else:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
for dataset in datasets:
gram_matrix_unnorm, time_precompute_gm = __compute_gram_matrix_unnorm(dataset, kernel_options)
gram_matrix_unnorm_list.append(gram_matrix_unnorm)
time_precompute_gm_list.append(time_precompute_gm)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)
elif not load_gm:
gram_matrix_unnorm_list = []
time_precompute_gm_list = []
for dataset in datasets:
gram_matrix_unnorm, time_precompute_gm = __compute_gram_matrix_unnorm(dataset, kernel_options)
gram_matrix_unnorm_list.append(gram_matrix_unnorm)
time_precompute_gm_list.append(time_precompute_gm)
np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)
else:
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']]
time_precompute_gm_list = gmfile['run_time_list'].tolist()
return gram_matrix_unnorm_list, time_precompute_gm_list
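# The .npz cache round-trip used above, as a hedged standalone sketch ('DS' and
# 'Kernel' stand in for the real path pieces; np.savez appends '.npz' itself,
# and object arrays need allow_pickle=True on load, hence the @todo above):
#
#   np.savez('gram_matrix_unnorm.DS.Kernel.gm',
#            gram_matrix_unnorm_list=gram_matrix_unnorm_list,
#            run_time_list=time_precompute_gm_list)
#   gmfile = np.load('gram_matrix_unnorm.DS.Kernel.gm.npz', allow_pickle=True)
#   gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']]
#   time_precompute_gm_list = gmfile['run_time_list'].tolist()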


def __get_graph_kernel(dataset, kernel_options):
from gklearn.utils.utils import get_graph_kernel_by_name
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
	return graph_kernel


def __compute_gram_matrix_unnorm(dataset, kernel_options):
from gklearn.utils.utils import get_graph_kernel_by_name
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)
gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm
return gram_matrix_unnorm, run_time
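# Typical use (hedged): with kernel_options['normalize'] set to True, compute()
# is expected to return the normalized Gram matrix while gram_matrix_unnorm
# keeps the raw values, so the cache written above stays normalization-agnostic:
#
#   gram_matrix_unnorm, run_time = __compute_gram_matrix_unnorm(dataset, kernel_options)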


def __init_output_file(ds_name, gkernel, fit_method, dir_output):
if not os.path.exists(dir_output):
os.makedirs(dir_output)
fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
f_detail = open(dir_output + fn_output_detail, 'a')
csv.writer(f_detail).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'best kernel dis', 'best graph index',
'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
'time optimize ec', 'time generate preimage', 'time total',
'itrs', 'converged', 'num updates ecc', 'mge decrease order',
'mge increase order', 'mge converged'])
f_detail.close()
fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
f_summary = open(dir_output + fn_output_summary, 'a')
csv.writer(f_summary).writerow(['dataset', 'graph kernel', 'edit cost',
'GED method', 'attr distance', 'fit method', 'num graphs',
'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
'min dis_k gi', 'best kernel dis', 'best graph index',
'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
'time generate preimage', 'time total', 'itrs', 'num converged',
'num updates ecc', 'mge num decrease order', 'mge num increase order',
'mge num converged', '# SOD SM -> GM', '# dis_k SM -> GM',
'# dis_k gi -> SM', '# dis_k gi -> GM'])
f_summary.close()
return fn_output_detail, fn_output_summary


# Draw the current median graph.
def draw_Letter_graph(graph, file_prefix):
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
plt.figure()
pos = {}
for n in graph.nodes:
pos[n] = np.array([float(graph.nodes[n]['x']),float(graph.nodes[n]['y'])])
nx.draw_networkx(graph, pos)
plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
# plt.show()
plt.clf()
plt.close()
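A toy call of draw_Letter_graph, assuming a NetworkX graph whose nodes carry the 'x'/'y' coordinate attributes of the Letter datasets (the graph and the output path below are illustrative only):

import networkx as nx

g = nx.Graph()
g.add_node(0, x='0.0', y='0.0')
g.add_node(1, x='1.5', y='2.0')
g.add_edge(0, 1)
draw_Letter_graph(g, '/tmp/letter_example')  # writes /tmp/letter_example.eps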

+ 28
- 26
gklearn/preimage/utils.py View File

@@ -45,7 +45,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 	if save_results:
 		# create result files.
 		print('creating output files...')
-		fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
+		fn_output_detail, fn_output_summary = __init_output_file_preimage(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)
 
 	sod_sm_list = []
 	sod_gm_list = []

@@ -82,22 +82,22 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			gram_matrix_unnorm_list = []
 			time_precompute_gm_list = []
 		else:
-			gmfile = np.load()
-			gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
-			time_precompute_gm_list = gmfile['run_time_list']
-#	repeats_better_sod_sm2gm = []
-#	repeats_better_dis_k_sm2gm = []
-#	repeats_better_dis_k_gi2sm = []
-#	repeats_better_dis_k_gi2gm = []
+			gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
+			gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
+			gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']]
+			time_precompute_gm_list = gmfile['run_time_list'].tolist()
+#	repeats_better_sod_sm2gm = []
+#	repeats_better_dis_k_sm2gm = []
+#	repeats_better_dis_k_gi2sm = []
+#	repeats_better_dis_k_gi2gm = []
 
-	print('start generating preimage for each class of target...')
+	print('starting generating preimage for each class of target...')
 	idx_offset = 0
 	for idx, dataset in enumerate(datasets):
 		target = dataset.targets[0]
 		print('\ntarget =', target, '\n')
-#		if target != 1:
-#			continue
+#		if target != 1:
+#			continue
 
 		num_graphs = len(dataset.graphs)
 		if num_graphs < 2:

@@ -148,7 +148,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 				results['sod_set_median'], results['sod_gen_median'],
 				results['k_dis_set_median'], results['k_dis_gen_median'],
 				results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
-				dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
+				dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
 				results['runtime_precompute_gm'], results['runtime_optimize_ec'],
 				results['runtime_generate_preimage'], results['runtime_total'],
 				results['itrs'], results['converged'],

@@ -177,7 +177,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			# # SOD SM -> GM
 			if results['sod_set_median'] > results['sod_gen_median']:
 				nb_sod_sm2gm[0] += 1
-#				repeats_better_sod_sm2gm.append(1)
+#				repeats_better_sod_sm2gm.append(1)
 			elif results['sod_set_median'] == results['sod_gen_median']:
 				nb_sod_sm2gm[1] += 1
 			elif results['sod_set_median'] < results['sod_gen_median']:

@@ -185,7 +185,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			# # dis_k SM -> GM
 			if results['k_dis_set_median'] > results['k_dis_gen_median']:
 				nb_dis_k_sm2gm[0] += 1
-#				repeats_better_dis_k_sm2gm.append(1)
+#				repeats_better_dis_k_sm2gm.append(1)
 			elif results['k_dis_set_median'] == results['k_dis_gen_median']:
 				nb_dis_k_sm2gm[1] += 1
 			elif results['k_dis_set_median'] < results['k_dis_gen_median']:

@@ -193,7 +193,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			# # dis_k gi -> SM
 			if results['k_dis_dataset'] > results['k_dis_set_median']:
 				nb_dis_k_gi2sm[0] += 1
-#				repeats_better_dis_k_gi2sm.append(1)
+#				repeats_better_dis_k_gi2sm.append(1)
 			elif results['k_dis_dataset'] == results['k_dis_set_median']:
 				nb_dis_k_gi2sm[1] += 1
 			elif results['k_dis_dataset'] < results['k_dis_set_median']:

@@ -201,7 +201,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			# # dis_k gi -> GM
 			if results['k_dis_dataset'] > results['k_dis_gen_median']:
 				nb_dis_k_gi2gm[0] += 1
-#				repeats_better_dis_k_gi2gm.append(1)
+#				repeats_better_dis_k_gi2gm.append(1)
 			elif results['k_dis_dataset'] == results['k_dis_gen_median']:
 				nb_dis_k_gi2gm[1] += 1
 			elif results['k_dis_dataset'] < results['k_dis_gen_median']:

@@ -225,7 +225,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 				results['mge']['num_increase_order'] > 0,
 				results['mge']['num_converged_descents'] > 0,
 				nb_sod_sm2gm,
-				nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
+				nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
 			f_summary.close()
 
 		# save median graphs.

@@ -235,15 +235,15 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 			print('Saving median graphs to files...')
 			fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
 			saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
-					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
 					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
 			fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
 			saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
-					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
 					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
 			fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.nbg' + str(num_graphs) + '.y' + str(target) + '.repeat' + str(1)
 			saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
-					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
+					node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
 					node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
 		# plot median graphs.

@@ -304,10 +304,10 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
 	if (load_gm == 'auto' and not gmfile_exist) or not load_gm:
 		np.savez(dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm', gram_matrix_unnorm_list=gram_matrix_unnorm_list, run_time_list=time_precompute_gm_list)
 
-	print('\ncomplete.')
+	print('\ncomplete.\n')
 
 
-def __init_output_file(ds_name, gkernel, fit_method, dir_output):
+def __init_output_file_preimage(ds_name, gkernel, fit_method, dir_output):
 	if not os.path.exists(dir_output):
 		os.makedirs(dir_output)
 #	fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'

@@ -335,9 +335,9 @@ def __init_output_file(ds_name, gkernel, fit_method, dir_output):
 		'num updates ecc', 'mge num decrease order', 'mge num increase order',
 		'mge num converged', '# SOD SM -> GM', '# dis_k SM -> GM',
 		'# dis_k gi -> SM', '# dis_k gi -> GM'])
-#		'repeats better SOD SM -> GM',
-#		'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
-#		'repeats better dis_k gi -> GM'])
+#		'repeats better SOD SM -> GM',
+#		'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
+#		'repeats better dis_k gi -> GM'])
 	f_summary.close()
 
 	return fn_output_detail, fn_output_summary

@@ -462,6 +462,8 @@ def gram2distances(Kmatrix):
 
 def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
 						   gkernel=None, verbose=True):
+	import warnings
+	warnings.warn('gklearn.preimage.utils.kernel_distance_matrix is deprecated, use gklearn.kernels.graph_kernel.compute_distance_matrix or gklearn.utils.compute_distance_matrix instead', DeprecationWarning)
 	dis_mat = np.empty((len(Gn), len(Gn)))
 	if Kmatrix is None:
 		Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
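The deprecation added in the last hunk points callers at the Gram-matrix-based helpers instead. A minimal sketch of that replacement path, using the names this commit exports from gklearn.utils (the 2x2 matrix is toy data):

import numpy as np
from gklearn.utils import normalize_gram_matrix, compute_distance_matrix

K = np.array([[4.0, 1.0], [1.0, 9.0]])    # toy unnormalized Gram matrix
K_norm = normalize_gram_matrix(K.copy())  # normalizes in place, so pass a copy
dis_mat, dis_max, dis_min, dis_mean = compute_distance_matrix(K_norm)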


+ 71
- 0
gklearn/tests/test_median_preimage_generator.py View File

@@ -0,0 +1,71 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jan 14 15:39:29 2020

@author: ljia
"""
import multiprocessing
import functools
from gklearn.preimage.utils import generate_median_preimages_by_class

def test_median_preimage_generator():
"""MAO, Treelet, using CONSTANT, symbolic only.
"""
from gklearn.utils.kernels import polynomialkernel
# set parameters.
ds_name = 'MAO' #
mpg_options = {'fit_method': 'k-graphs',
'init_ecc': [4, 4, 2, 1, 1, 1], #
'ds_name': ds_name,
'parallel': True, # False
'time_limit_in_sec': 0,
'max_itrs': 3, #
'max_itrs_without_update': 3,
'epsilon_residual': 0.01,
'epsilon_ec': 0.1,
'verbose': 2}
pkernel = functools.partial(polynomialkernel, d=4, c=1e+7)
kernel_options = {'name': 'Treelet', #
'sub_kernel': pkernel,
'parallel': 'imap_unordered',
# 'parallel': None,
'n_jobs': multiprocessing.cpu_count(),
'normalize': True,
'verbose': 2}
ged_options = {'method': 'IPFP',
'initialization_method': 'RANDOM', # 'NODE'
'initial_solutions': 1, # 1
'edit_cost': 'CONSTANT', #
'attr_distance': 'euclidean',
'ratio_runs_from_initial_solutions': 1,
'threads': multiprocessing.cpu_count(),
'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'}
mge_options = {'init_type': 'MEDOID',
'random_inits': 10,
'time_limit': 600,
'verbose': 2,
'refine': False}
save_results = True
dir_save = ds_name + '.' + kernel_options['name'] + '.symb.pytest/'
irrelevant_labels = {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']} #
edge_required = False #
# print settings.
print('parameters:')
print('dataset name:', ds_name)
print('mpg_options:', mpg_options)
print('kernel_options:', kernel_options)
print('ged_options:', ged_options)
print('mge_options:', mge_options)
print('save_results:', save_results)
print('irrelevant_labels:', irrelevant_labels)
print()
# generate preimages.
for fit_method in ['k-graphs', 'expert', 'random']:
print('\n-------------------------------------')
print('fit method:', fit_method, '\n')
mpg_options['fit_method'] = fit_method
generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=save_results, save_medians=True, plot_medians=True, load_gm='auto', dir_save=dir_save, irrelevant_labels=irrelevant_labels, edge_required=edge_required, cut_range=range(0, 4))

+ 2
- 0
gklearn/utils/__init__.py View File

@@ -21,4 +21,6 @@ from gklearn.utils.timer import Timer
 from gklearn.utils.utils import get_graph_kernel_by_name
 from gklearn.utils.utils import compute_gram_matrices_by_class
 from gklearn.utils.utils import SpecialLabel
+from gklearn.utils.utils import normalize_gram_matrix, compute_distance_matrix
 from gklearn.utils.trie import Trie
+from gklearn.utils.knn import knn_cv, knn_classification

+ 19
- 1
gklearn/utils/dataset.py View File

@@ -522,6 +522,20 @@ class Dataset(object):
 		self.__targets = [self.__targets[i] for i in idx]
 		self.clean_labels()
 
+	def copy(self):
+		dataset = Dataset()
+		graphs = [g.copy() for g in self.__graphs] if self.__graphs is not None else None
+		target = self.__targets.copy() if self.__targets is not None else None
+		node_labels = self.__node_labels.copy() if self.__node_labels is not None else None
+		node_attrs = self.__node_attrs.copy() if self.__node_attrs is not None else None
+		edge_labels = self.__edge_labels.copy() if self.__edge_labels is not None else None
+		edge_attrs = self.__edge_attrs.copy() if self.__edge_attrs is not None else None
+		dataset.load_graphs(graphs, target)
+		dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
+		# @todo: clean_labels and add other class members?
+		return dataset
+
 	def __get_dataset_size(self):
 		return len(self.__graphs)

@@ -721,7 +735,11 @@ def split_dataset_by_target(dataset):
 		sub_graphs = [graphs[i] for i in val]
 		sub_dataset = Dataset()
 		sub_dataset.load_graphs(sub_graphs, [key] * len(val))
-		sub_dataset.set_labels(node_labels=dataset.node_labels, node_attrs=dataset.node_attrs, edge_labels=dataset.edge_labels, edge_attrs=dataset.edge_attrs)
+		node_labels = dataset.node_labels.copy() if dataset.node_labels is not None else None
+		node_attrs = dataset.node_attrs.copy() if dataset.node_attrs is not None else None
+		edge_labels = dataset.edge_labels.copy() if dataset.edge_labels is not None else None
+		edge_attrs = dataset.edge_attrs.copy() if dataset.edge_attrs is not None else None
+		sub_dataset.set_labels(node_labels=node_labels, node_attrs=node_attrs, edge_labels=edge_labels, edge_attrs=edge_attrs)
 		datasets.append(sub_dataset)
 		# @todo: clean_labels?
 	return datasets
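Both hunks replace shared references with copies: without the .copy() calls, every sub-dataset would alias one label list, and mutating it in one place would silently change all of them. A minimal illustration of that pitfall:

node_labels = ['atom', 'charge']
alias = node_labels               # both names point at the same list
independent = node_labels.copy()  # an independent list
node_labels.remove('charge')
print(alias)                      # ['atom'] -- mutated through the alias
print(independent)                # ['atom', 'charge'] -- unaffected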

+ 24
- 13
gklearn/utils/graph_files.py View File

@@ -494,7 +494,8 @@ def load_tud(filename):
 				   'edge_labels': [], 'edge_attrs': []}
 	class_label_map = None
 	class_label_map_strings = []
-	content_rm = open(frm).read().splitlines()
+	with open(frm) as rm:
+		content_rm = rm.read().splitlines()
 	i = 0
 	while i < len(content_rm):
 		line = content_rm[i].strip()

@@ -558,16 +559,20 @@ def load_tud(filename):
 	label_names = {'node_labels': [], 'node_attrs': [],
 				   'edge_labels': [], 'edge_attrs': []}
 	class_label_map = None
 
-	content_gi = open(fgi).read().splitlines() # graph indicator
-	content_am = open(fam).read().splitlines() # adjacency matrix
+	with open(fgi) as gi:
+		content_gi = gi.read().splitlines() # graph indicator
+	with open(fam) as am:
+		content_am = am.read().splitlines() # adjacency matrix
 
 	# load targets.
 	if 'fgl' in locals():
-		content_targets = open(fgl).read().splitlines() # targets (classification)
+		with open(fgl) as gl:
+			content_targets = gl.read().splitlines() # targets (classification)
 		targets = [float(i) for i in content_targets]
 	elif 'fga' in locals():
-		content_targets = open(fga).read().splitlines() # targets (regression)
+		with open(fga) as ga:
+			content_targets = ga.read().splitlines() # targets (regression)
 		targets = [int(i) for i in content_targets]
 	else:
 		raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.')

@@ -577,7 +582,8 @@ def load_tud(filename):
 	# create graphs and add nodes
 	data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))]
 	if 'fnl' in locals():
-		content_nl = open(fnl).read().splitlines() # node labels
+		with open(fnl) as nl:
+			content_nl = nl.read().splitlines() # node labels
 		for idx, line in enumerate(content_gi):
 			# transfer to int first in case of unexpected blanks
 			data[int(line) - 1].add_node(idx)

@@ -605,7 +611,8 @@ def load_tud(filename):
 
 	# add edge labels
 	if 'fel' in locals():
-		content_el = open(fel).read().splitlines()
+		with open(fel) as el:
+			content_el = el.read().splitlines()
 		for idx, line in enumerate(content_el):
 			labels = [l.strip() for l in line.split(',')]
 			n = [int(i) - 1 for i in content_am[idx].split(',')]

@@ -621,7 +628,8 @@ def load_tud(filename):
 
 	# add node attributes
 	if 'fna' in locals():
-		content_na = open(fna).read().splitlines()
+		with open(fna) as na:
+			content_na = na.read().splitlines()
 		for idx, line in enumerate(content_na):
 			attrs = [a.strip() for a in line.split(',')]
 			g = int(content_gi[idx]) - 1

@@ -636,7 +644,8 @@ def load_tud(filename):
 
 	# add edge attributes
 	if 'fea' in locals():
-		content_ea = open(fea).read().splitlines()
+		with open(fea) as ea:
+			content_ea = ea.read().splitlines()
 		for idx, line in enumerate(content_ea):
 			attrs = [a.strip() for a in line.split(',')]
 			n = [int(i) - 1 for i in content_am[idx].split(',')]

@@ -669,7 +678,8 @@ def load_from_ds(filename, filename_targets):
 	data = []
 	y = []
 	label_names = {'node_labels': [], 'edge_labels': [], 'node_attrs': [], 'edge_attrs': []}
-	content = open(filename).read().splitlines()
+	with open(filename) as fn:
+		content = fn.read().splitlines()
 	extension = splitext(content[0].split(' ')[0])[1][1:]
 	if extension == 'ct':
 		load_file_fun = load_ct

@@ -691,8 +701,9 @@ def load_from_ds(filename, filename_targets):
 			g, l_names = load_file_fun(dirname_dataset + '/' + tmp.replace('#', '', 1))
 			data.append(g)
 			__append_label_names(label_names, l_names)
-	content_y = open(filename_targets).read().splitlines()
+	with open(filename_targets) as fnt:
+		content_y = fnt.read().splitlines()
 	# assume entries in filename and filename_targets have the same order.
 	for item in content_y:
 		tmp = item.split(' ')


+ 141
- 0
gklearn/utils/knn.py View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon May 11 11:03:01 2020

@author: ljia
"""
import numpy as np
from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from gklearn.utils.utils import get_graph_kernel_by_name
# from gklearn.preimage.utils import get_same_item_indices

def sum_squares(a, b):
    """
    Return the sum of squared differences between a and b (SSE, not the mean).
    """
    return np.sum([(a[i] - b[i])**2 for i in range(len(a))])


def euclid_d(x, y):
"""
1D euclidean distance
"""
return np.sqrt((x-y)**2)


def man_d(x, y):
"""
1D manhattan distance
"""
return np.abs((x-y))


def knn_regression(D_app, D_test, y_app, y_test, n_neighbors, verbose=True, text=None):

from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor(n_neighbors=n_neighbors, metric='precomputed')
knn.fit(D_app, y_app)
y_pred = knn.predict(D_app)
y_pred_test = knn.predict(D_test.T)
perf_app = np.sqrt(sum_squares(y_pred, y_app)/len(y_app))
perf_test = np.sqrt(sum_squares(y_pred_test, y_test)/len(y_test))

if (verbose):
print("Learning error with {} train examples : {}".format(text, perf_app))
print("Test error with {} train examples : {}".format(text, perf_test))

return perf_app, perf_test


def knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=True, text=None):
knn = KNeighborsClassifier(n_neighbors=n_neighbors, metric='precomputed')
knn.fit(d_app, y_app)
y_pred = knn.predict(d_app)
y_pred_test = knn.predict(d_test.T)
perf_app = accuracy_score(y_app, y_pred)
perf_test = accuracy_score(y_test, y_pred_test)

if (verbose):
print("Learning accuracy with {} costs : {}".format(text, perf_app))
print("Test accuracy with {} costs : {}".format(text, perf_test))
return perf_app, perf_test
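# Shape note (a sketch of scikit-learn's 'precomputed' metric contract): fit()
# expects a square (n_train, n_train) distance matrix and predict() expects
# (n_queries, n_train). knn_cv below builds d_test as (n_train, n_test), which
# is why d_test.T is passed to predict() here and in knn_regression.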

def knn_cv(dataset, kernel_options, trainset=None, n_neighbors=1, n_splits=50, test_size=0.9, verbose=True):
    '''
    Perform a kNN classification cross-validation on the given dataset.
    '''
# Gn = dataset.graphs
y_all = dataset.targets
# compute kernel distances.
dis_mat = __compute_kernel_distances(dataset, kernel_options, trainset=trainset)
rs = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=0)
# train_indices = [[] for _ in range(n_splits)]
# test_indices = [[] for _ in range(n_splits)]
# idx_targets = get_same_item_indices(y_all)
# for key, item in idx_targets.items():
# i = 0
# for train_i, test_i in rs.split(item): # @todo: careful when parallel.
# train_indices[i] += [item[idx] for idx in train_i]
# test_indices[i] += [item[idx] for idx in test_i]
# i += 1
accuracies = []
# for trial in range(len(train_indices)):
# train_index = train_indices[trial]
# test_index = test_indices[trial]
for train_index, test_index in rs.split(y_all):
# print(train_index, test_index)
# G_app = [Gn[i] for i in train_index]
# G_test = [Gn[i] for i in test_index]
y_app = [y_all[i] for i in train_index]
y_test = [y_all[i] for i in test_index]
N = len(train_index)
d_app = dis_mat.copy()
d_app = d_app[train_index,:]
d_app = d_app[:,train_index]
d_test = np.zeros((N, len(test_index)))
for i in range(N):
for j in range(len(test_index)):
d_test[i, j] = dis_mat[train_index[i], test_index[j]]
accuracies.append(knn_classification(d_app, d_test, y_app, y_test, n_neighbors, verbose=verbose, text=''))
results = {}
results['ave_perf_train'] = np.mean([i[0] for i in accuracies], axis=0)
results['std_perf_train'] = np.std([i[0] for i in accuracies], axis=0, ddof=1)
results['ave_perf_test'] = np.mean([i[1] for i in accuracies], axis=0)
results['std_perf_test'] = np.std([i[1] for i in accuracies], axis=0, ddof=1)

return results


def __compute_kernel_distances(dataset, kernel_options, trainset=None):
graph_kernel = get_graph_kernel_by_name(kernel_options['name'],
node_labels=dataset.node_labels,
edge_labels=dataset.edge_labels,
node_attrs=dataset.node_attrs,
edge_attrs=dataset.edge_attrs,
ds_infos=dataset.get_dataset_infos(keys=['directed']),
kernel_options=kernel_options)
gram_matrix, run_time = graph_kernel.compute(dataset.graphs, **kernel_options)

dis_mat, _, _, _ = graph_kernel.compute_distance_matrix()
if trainset is not None:
gram_matrix_unnorm = graph_kernel.gram_matrix_unnorm

return dis_mat
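A hedged end-to-end use of knn_cv; the dataset loader call and the kernel options below are illustrative assumptions, not part of this file:

import multiprocessing
from gklearn.utils import Dataset
from gklearn.utils.knn import knn_cv

dataset = Dataset()
dataset.load_predefined_dataset('MAO')  # assumed loader; any loaded Dataset works
kernel_options = {'name': 'ShortestPath',
                  'parallel': 'imap_unordered',
                  'n_jobs': multiprocessing.cpu_count(),
                  'normalize': True,
                  'verbose': 0}
results = knn_cv(dataset, kernel_options, n_neighbors=1, n_splits=50, test_size=0.9)
print(results['ave_perf_test'], '+/-', results['std_perf_test'])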

+ 33
- 4
gklearn/utils/utils.py View File

@@ -1,7 +1,7 @@
 import networkx as nx
 import numpy as np
 from copy import deepcopy
-from enum import Enum, auto
+from enum import Enum, unique
 #from itertools import product
 
 # from tqdm import tqdm

@@ -468,7 +468,36 @@ def get_mlti_dim_edge_attrs(G, attr_names):
 	return attributes
 
 
+@unique
 class SpecialLabel(Enum):
-	"""can be used to define special labels.
-	"""
-	DUMMY = auto # The dummy label.
+	"""can be used to define special labels.
+	"""
+	DUMMY = 1 # The dummy label.
+	# DUMMY = auto # enum.auto does not exist in Python 3.5.
+
+
+def normalize_gram_matrix(gram_matrix):
+	diag = gram_matrix.diagonal().copy()
+	for i in range(len(gram_matrix)):
+		for j in range(i, len(gram_matrix)):
+			gram_matrix[i][j] /= np.sqrt(diag[i] * diag[j])
+			gram_matrix[j][i] = gram_matrix[i][j]
+	return gram_matrix
+
+
+def compute_distance_matrix(gram_matrix):
+	dis_mat = np.empty((len(gram_matrix), len(gram_matrix)))
+	for i in range(len(gram_matrix)):
+		for j in range(i, len(gram_matrix)):
+			dis = gram_matrix[i, i] + gram_matrix[j, j] - 2 * gram_matrix[i, j]
+			if dis < 0:
+				if dis > -1e-10:
+					dis = 0
+				else:
+					raise ValueError('The distance is negative.')
+			dis_mat[i, j] = np.sqrt(dis)
+			dis_mat[j, i] = dis_mat[i, j]
+	dis_max = np.max(np.max(dis_mat))
+	dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
+	dis_mean = np.mean(np.mean(dis_mat))
+	return dis_mat, dis_max, dis_min, dis_mean

+ 5
- 4
requirements.txt View File

@@ -1,10 +1,11 @@
-numpy>=1.15.2
+numpy>=1.16.2
 scipy>=1.1.0
 matplotlib>=3.0.0
 networkx>=2.2
 scikit-learn>=0.20.0
 tabulate>=0.8.2
 tqdm>=4.26.0
-# cvxpy # for preimage.
-# cvxopt # for preimage.
-# mosek # for preimage.
+cvxpy>=1.0.31 # for preimage. Does not work for "pip install graphkit-learn".
+# -e https://files.pythonhosted.org/packages/11/d0/d900870dc2d02ea74961b90c353666c6528a33ea61a10aa59a0d5574ae59/cvxpy-1.0.31.tar.gz # for preimage.
+cvxopt>=1.2.5 # for preimage.
+mosek>=9.2.5; python_version >= '3.6' # for preimage.

+ 11
- 0
requirements_pypi.txt View File

@@ -0,0 +1,11 @@
numpy>=1.16.2
scipy>=1.1.0
matplotlib>=3.0.0
networkx>=2.2
scikit-learn>=0.20.0
tabulate>=0.8.2
tqdm>=4.26.0
# cvxpy>=1.0.31 # for preimage. Does not work for "pip install graphkit-learn".
# -e https://files.pythonhosted.org/packages/11/d0/d900870dc2d02ea74961b90c353666c6528a33ea61a10aa59a0d5574ae59/cvxpy-1.0.31.tar.gz # for preimage.
cvxopt>=1.2.5 # for preimage.
mosek>=9.2.5; python_version >= '3.6' # for preimage.

+ 3
- 3
setup.py View File

@@ -3,15 +3,15 @@ import setuptools
 with open("README.md", "r") as fh:
 	long_description = fh.read()
 
-with open('requirements.txt') as fp:
+with open('requirements_pypi.txt') as fp:
 	install_requires = fp.read()
 
 setuptools.setup(
 	name="graphkit-learn",
-	version="0.2b1",
+	version="0.2b2",
 	author="Linlin Jia",
 	author_email="linlin.jia@insa-rouen.fr",
-	description="A Python library for graph kernels based on linear patterns",
+	description="A Python library for graph kernels, graph edit distances, and graph pre-images",
 	long_description=long_description,
 	long_description_content_type="text/markdown",
 	url="https://github.com/jajupmochi/graphkit-learn",

