Browse Source

Merge pull request #11 from jajupmochi/v0.2

V0.2
tags/v0.2.0
linlin GitHub 5 years ago
parent
commit
8efc673bde
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 335 additions and 88 deletions
  1. +1
    -0
      .gitignore
  2. +5
    -1
      .travis.yml
  3. +172
    -3
      gklearn/ged/median/median_graph_estimator.py
  4. +4
    -0
      gklearn/ged/median/utils.py
  5. +45
    -21
      gklearn/ged/util/util.py
  6. +62
    -35
      gklearn/preimage/median_preimage_generator.py
  7. +7
    -5
      gklearn/preimage/utils.py
  8. +23
    -15
      gklearn/utils/dataset.py
  9. +16
    -8
      gklearn/utils/graph_files.py

+ 1
- 0
.gitignore View File

@@ -29,6 +29,7 @@ gklearn/kernels/*_sym.py


gklearn/preimage/* gklearn/preimage/*
!gklearn/preimage/*.py !gklearn/preimage/*.py
!gklearn/preimage/experiments/*.py


__pycache__ __pycache__
##*# ##*#


+ 5
- 1
.travis.yml View File

@@ -1,6 +1,10 @@
language: python language: python
python: python:
- '3.6.9'
- '3.0'
- '3.1'
- '3.2'
- '3.3'
- '3.4'
- '3.5' - '3.5'
- '3.6' - '3.6'
- '3.7' - '3.7'


+ 172
- 3
gklearn/ged/median/median_graph_estimator.py View File

@@ -70,6 +70,7 @@ class MedianGraphEstimator(object):
self.__num_increase_order = 0 self.__num_increase_order = 0
self.__num_converged_descents = 0 self.__num_converged_descents = 0
self.__state = AlgorithmState.TERMINATED self.__state = AlgorithmState.TERMINATED
self.__label_names = {}
if ged_env is None: if ged_env is None:
raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.') raise Exception('The GED environment pointer passed to the constructor of MedianGraphEstimator is null.')
@@ -551,6 +552,7 @@ class MedianGraphEstimator(object):
self.__init_type_increase_order = 'K-MEANS++' self.__init_type_increase_order = 'K-MEANS++'
self.__max_itrs_increase_order = 10 self.__max_itrs_increase_order = 10
self.__print_to_stdout = 2 self.__print_to_stdout = 2
self.__label_names = {}
def __construct_initial_medians(self, graph_ids, timer, initial_medians): def __construct_initial_medians(self, graph_ids, timer, initial_medians):
@@ -666,7 +668,8 @@ class MedianGraphEstimator(object):
# Compute the median label and update the median. # Compute the median label and update the median.
if len(node_labels) > 0: if len(node_labels) > 0:
median_label = self.__ged_env.get_median_node_label(node_labels)
# median_label = self.__ged_env.get_median_node_label(node_labels)
median_label = self.__get_median_node_label(node_labels)
if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon: if self.__ged_env.get_node_rel_cost(median.nodes[i], median_label) > self.__epsilon:
nx.set_node_attributes(median, {i: median_label}) nx.set_node_attributes(median, {i: median_label})
@@ -701,7 +704,7 @@ class MedianGraphEstimator(object):
if median.has_edge(i, j): if median.has_edge(i, j):
median_label = median.edges[(i, j)] median_label = median.edges[(i, j)]
if self.__labeled_edges and len(edge_labels) > 0: if self.__labeled_edges and len(edge_labels) > 0:
new_median_label = self.__ged_env.median_edge_label(edge_labels)
new_median_label = self.__get_median_edge_label(edge_labels)
if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon: if self.__ged_env.get_edge_rel_cost(median_label, new_median_label) > self.__epsilon:
median_label = new_median_label median_label = new_median_label
for edge_label in edge_labels: for edge_label in edge_labels:
@@ -821,4 +824,170 @@ class MedianGraphEstimator(object):
def compute_my_cost(g, h, node_map): def compute_my_cost(g, h, node_map):
cost = 0.0 cost = 0.0
for node in g.nodes: for node in g.nodes:
cost += 0
cost += 0
def set_label_names(self, node_labels=None, edge_labels=None, node_attrs=None, edge_attrs=None):
    """Record the names of the labels and attributes carried by the graphs.

    The names are stored in ``self.__label_names`` under the keys
    ``'node_labels'``, ``'edge_labels'``, ``'node_attrs'`` and
    ``'edge_attrs'``.

    Parameters
    ----------
    node_labels, edge_labels, node_attrs, edge_attrs : iterable or None
        Names to record. ``None`` (the default) is stored as an empty list.
    """
    def _as_list(names):
        # Always store a fresh list: a shared mutable default (or the
        # caller's own list) must not be aliased by the estimator, otherwise
        # a later mutation would silently change every holder of that list.
        return [] if names is None else list(names)

    self.__label_names = {
        'node_labels': _as_list(node_labels),
        'edge_labels': _as_list(edge_labels),
        'node_attrs': _as_list(node_attrs),
        'edge_attrs': _as_list(edge_attrs),
    }
def __get_median_node_label(self, node_labels):
    """Compute the median of a collection of node labels.

    Dispatches on the names stored in ``self.__label_names``: when symbolic
    node label names are registered the symbolic median is used, otherwise,
    when node attribute names are registered, the non-symbolic median is
    used.

    Parameters
    ----------
    node_labels : list of dict
        The node labels to summarise.

    Returns
    -------
    dict
        The median label, as returned by the selected helper.

    Raises
    ------
    Exception
        If neither node label names nor node attribute names are registered.
    """
    # ``__label_names`` starts out as an empty dict, so look the keys up with
    # ``get``: an estimator whose names were never set must fail with the
    # explicit message below rather than with an opaque KeyError.
    label_names = self.__label_names
    if label_names.get('node_labels'):
        return self.__get_median_label_symbolic(node_labels)
    if label_names.get('node_attrs'):
        return self.__get_median_label_nonsymbolic(node_labels)
    raise Exception('Node label names are not given.')
def __get_median_edge_label(self, edge_labels):
    """Compute the median of a collection of edge labels.

    Dispatches on the names stored in ``self.__label_names``: when symbolic
    edge label names are registered the symbolic median is used, otherwise,
    when edge attribute names are registered, the non-symbolic median is
    used.

    Parameters
    ----------
    edge_labels : list of dict
        The edge labels to summarise.

    Returns
    -------
    dict
        The median label, as returned by the selected helper.

    Raises
    ------
    Exception
        If neither edge label names nor edge attribute names are registered.
    """
    # ``__label_names`` starts out as an empty dict, so look the keys up with
    # ``get``: an estimator whose names were never set must fail with the
    # explicit message below rather than with an opaque KeyError.
    label_names = self.__label_names
    if label_names.get('edge_labels'):
        return self.__get_median_label_symbolic(edge_labels)
    if label_names.get('edge_attrs'):
        return self.__get_median_label_nonsymbolic(edge_labels)
    raise Exception('Edge label names are not given.')
def __get_median_label_symbolic(self, labels):
    """Return the label that occurs most often in ``labels``.

    Parameters
    ----------
    labels : iterable of dict
        Symbolic labels, each mapping a label name to a hashable value.

    Returns
    -------
    dict
        A copy of the most frequent label. Ties are resolved in favour of
        the label encountered first. An empty dict is returned when
        ``labels`` is empty.
    """
    # Construct histogram. The key is a frozenset of the (name, value) pairs
    # so that two labels with the same content are counted together even if
    # their dicts were filled in a different key order.
    counts = {}
    representative = {}
    for label in labels:
        key = frozenset(label.items())
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
            representative[key] = label

    # Return the label that appears most frequently.
    best_count = 0
    median_label = {}
    for key, count in counts.items():
        # Strict comparison keeps the earliest label among equally frequent ones.
        if count > best_count:
            best_count = count
            median_label = dict(representative[key])
    return median_label
def __get_median_label_nonsymbolic(self, labels):
    """Compute a median of numeric labels with Weiszfeld's algorithm.

    Every label is read as a point whose coordinates are the ``float``
    values of its entries. Starting from the coordinate-wise mean, the
    estimate is refined iteratively until the summed absolute change of its
    coordinates is at most 1e-4, 100 iterations have been run, or every
    label coincides with the estimate.

    Parameters
    ----------
    labels : list of dict
        Labels mapping an attribute name to a value convertible to float.
        The attribute names are taken from the first label.

    Returns
    -------
    dict
        The median label, mapping each attribute name to the string form of
        its coordinate. An empty dict is returned when ``labels`` is empty.
    """
    if len(labels) == 0:
        return {}  # @todo

    # Transform the labels into coordinates and compute mean label as initial solution.
    labels_as_coords = []
    sums = {key: 0 for key in labels[0]}
    for label in labels:
        coords = {}
        for key, val in label.items():
            coord = float(val)
            sums[key] += coord
            coords[key] = coord
        labels_as_coords.append(coords)
    median = {key: total / len(labels) for key, total in sums.items()}

    # Run main loop of Weiszfeld's Algorithm.
    epsilon = 0.0001
    delta = 1.0
    num_itrs = 0
    all_equal = False
    while delta > epsilon and num_itrs < 100 and not all_equal:
        numerator = {key: 0 for key in sums}
        denominator = 0
        for label_as_coord in labels_as_coords:
            squared_dist = 0
            for key, val in label_as_coord.items():
                squared_dist += (val - median[key]) ** 2
            # The weight of a point is the inverse of its Euclidean distance
            # to the current estimate, so the square root must replace the
            # squared sum rather than be added on top of it.
            norm = np.sqrt(squared_dist)
            # Points lying exactly on the estimate are left out to avoid a
            # division by zero.
            if norm > 0:
                for key, val in label_as_coord.items():
                    numerator[key] += val / norm
                denominator += 1.0 / norm
        if denominator == 0:
            # Every label equals the current estimate; nothing left to refine.
            all_equal = True
        else:
            new_median = {}
            delta = 0.0
            for key, val in numerator.items():
                this_median = val / denominator
                new_median[key] = this_median
                delta += np.abs(median[key] - this_median)
            median = new_median
        num_itrs += 1

    # Transform the solution to strings and return it.
    return {key: str(val) for key, val in median.items()}

# def __get_median_edge_label_symbolic(self, edge_labels):
# pass
# def __get_median_edge_label_nonsymbolic(self, edge_labels):
# if len(edge_labels) == 0:
# return {}
# else:
# # Transform the labels into coordinates and compute mean label as initial solution.
# edge_labels_as_coords = []
# sums = {}
# for key, val in edge_labels[0].items():
# sums[key] = 0
# for edge_label in edge_labels:
# coords = {}
# for key, val in edge_label.items():
# label = float(val)
# sums[key] += label
# coords[key] = label
# edge_labels_as_coords.append(coords)
# median = {}
# for key, val in sums.items():
# median[key] = val / len(edge_labels)
#
# # Run main loop of Weiszfeld's Algorithm.
# epsilon = 0.0001
# delta = 1.0
# num_itrs = 0
# all_equal = False
# while ((delta > epsilon) and (num_itrs < 100) and (not all_equal)):
# numerator = {}
# for key, val in sums.items():
# numerator[key] = 0
# denominator = 0
# for edge_label_as_coord in edge_labels_as_coords:
# norm = 0
# for key, val in edge_label_as_coord.items():
# norm += (val - median[key]) ** 2
# norm += np.sqrt(norm)
# if norm > 0:
# for key, val in edge_label_as_coord.items():
# numerator[key] += val / norm
# denominator += 1.0 / norm
# if denominator == 0:
# all_equal = True
# else:
# new_median = {}
# delta = 0.0
# for key, val in numerator.items():
# this_median = val / denominator
# new_median[key] = this_median
# delta += np.abs(median[key] - this_median)
# median = new_median
#
# num_itrs += 1
#
# # Transform the solution to ged::GXLLabel and return it.
# median_label = {}
# for key, val in median.items():
# median_label[key] = str(val)
# return median_label

+ 4
- 0
gklearn/ged/median/utils.py View File

@@ -9,6 +9,10 @@ Created on Wed Apr 1 15:12:31 2020
def constant_node_costs(edit_cost_name): def constant_node_costs(edit_cost_name):
if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER': if edit_cost_name == 'NON_SYMBOLIC' or edit_cost_name == 'LETTER2' or edit_cost_name == 'LETTER':
return False return False
elif edit_cost_name == 'CONSTANT':
return True
else:
raise Exception('Can not recognize the given edit cost. Possible edit costs include: "NON_SYMBOLIC", "LETTER", "LETTER2", "CONSTANT".')
# elif edit_cost_name != '': # elif edit_cost_name != '':
# # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests <AIDS|Mutagenicity|Letter-high|Letter-med|Letter-low|monoterpenoides|SYNTHETICnew|Fingerprint|COIL-DEL>"); # # throw ged::Error("Invalid dataset " + dataset + ". Usage: ./median_tests <AIDS|Mutagenicity|Letter-high|Letter-med|Letter-low|monoterpenoides|SYNTHETICnew|Fingerprint|COIL-DEL>");
# return False # return False


+ 45
- 21
gklearn/ged/util/util.py View File

@@ -57,7 +57,10 @@ def compute_geds(graphs, options={}, parallel=False):
ged_env.set_method(options['method'], ged_options_to_string(options)) ged_env.set_method(options['method'], ged_options_to_string(options))
ged_env.init_method() ged_env.init_method()


# compute ged.
# compute ged.
neo_options = {'edit_cost': options['edit_cost'],
'node_labels': options['node_labels'], 'edge_labels': options['edge_labels'],
'node_attrs': options['node_attrs'], 'edge_attrs': options['edge_attrs']}
ged_mat = np.zeros((len(graphs), len(graphs))) ged_mat = np.zeros((len(graphs), len(graphs)))
if parallel: if parallel:
len_itr = int(len(graphs) * (len(graphs) - 1) / 2) len_itr = int(len(graphs) * (len(graphs) - 1) / 2)
@@ -74,7 +77,7 @@ def compute_geds(graphs, options={}, parallel=False):
G_graphs = graphs_toshare G_graphs = graphs_toshare
G_ged_env = ged_env_toshare G_ged_env = ged_env_toshare
G_listID = listID_toshare G_listID = listID_toshare
do_partial = partial(_wrapper_compute_ged_parallel, options)
do_partial = partial(_wrapper_compute_ged_parallel, neo_options)
pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID)) pool = Pool(processes=n_jobs, initializer=init_worker, initargs=(graphs, ged_env, listID))
iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize), iterator = tqdm(pool.imap_unordered(do_partial, itr, chunksize),
desc='computing GEDs', file=sys.stdout) desc='computing GEDs', file=sys.stdout)
@@ -100,7 +103,7 @@ def compute_geds(graphs, options={}, parallel=False):
ged_vec.append(dis) ged_vec.append(dis)
ged_mat[i][j] = dis ged_mat[i][j] = dis
ged_mat[j][i] = dis ged_mat[j][i] = dis
n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, edit_cost=options['edit_cost'])
n_eo_tmp = get_nb_edit_operations(graphs[i], graphs[j], pi_forward, pi_backward, **neo_options)
n_edit_operations.append(n_eo_tmp) n_edit_operations.append(n_eo_tmp)
return ged_vec, ged_mat, n_edit_operations return ged_vec, ged_mat, n_edit_operations
@@ -115,7 +118,7 @@ def _wrapper_compute_ged_parallel(options, itr):


def _compute_ged_parallel(env, gid1, gid2, g1, g2, options): def _compute_ged_parallel(env, gid1, gid2, g1, g2, options):
dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2) dis, pi_forward, pi_backward = _compute_ged(env, gid1, gid2, g1, g2)
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward, edit_cost=options['edit_cost']) # [0,0,0,0,0,0]
n_eo_tmp = get_nb_edit_operations(g1, g2, pi_forward, pi_backward, **options) # [0,0,0,0,0,0]
return dis, n_eo_tmp return dis, n_eo_tmp




@@ -137,17 +140,26 @@ def _compute_ged(env, gid1, gid2, g1, g2):
return dis, pi_forward, pi_backward return dis, pi_forward, pi_backward




def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None):
def get_nb_edit_operations(g1, g2, forward_map, backward_map, edit_cost=None, **kwargs):
if edit_cost == 'LETTER' or edit_cost == 'LETTER2': if edit_cost == 'LETTER' or edit_cost == 'LETTER2':
return get_nb_edit_operations_letter(g1, g2, forward_map, backward_map) return get_nb_edit_operations_letter(g1, g2, forward_map, backward_map)
elif edit_cost == 'NON_SYMBOLIC': elif edit_cost == 'NON_SYMBOLIC':
return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map)
node_attrs = kwargs.get('node_attrs', [])
edge_attrs = kwargs.get('edge_attrs', [])
return get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
node_attrs=node_attrs, edge_attrs=edge_attrs)
elif edit_cost == 'CONSTANT':
node_labels = kwargs.get('node_labels', [])
edge_labels = kwargs.get('edge_labels', [])
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
node_labels=node_labels, edge_labels=edge_labels)
else: else:
return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map) return get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map)


def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
"""Compute the number of each edit operations.
def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map,
node_labels=[], edge_labels=[]):
"""Compute the number of each edit operations for symbolic-labeled graphs.
""" """
n_vi = 0 n_vi = 0
n_vr = 0 n_vr = 0
@@ -160,8 +172,13 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
for i, map_i in enumerate(forward_map): for i, map_i in enumerate(forward_map):
if map_i == np.inf: if map_i == np.inf:
n_vr += 1 n_vr += 1
elif g1.node[nodes1[i]]['atom'] != g2.node[map_i]['atom']:
n_vs += 1
else:
for nl in node_labels:
label1 = g1.nodes[nodes1[i]][nl]
label2 = g2.nodes[map_i][nl]
if label1 != label2:
n_vs += 1
break
for map_i in backward_map: for map_i in backward_map:
if map_i == np.inf: if map_i == np.inf:
n_vi += 1 n_vi += 1
@@ -180,15 +197,21 @@ def get_nb_edit_operations_symbolic(g1, g2, forward_map, backward_map):
elif (forward_map[idx1], forward_map[idx2]) in g2.edges(): elif (forward_map[idx1], forward_map[idx2]) in g2.edges():
nb_edges2_cnted += 1 nb_edges2_cnted += 1
# edge labels are different. # edge labels are different.
if g2.edges[((forward_map[idx1], forward_map[idx2]))]['bond_type'] \
!= g1.edges[(n1, n2)]['bond_type']:
for el in edge_labels:
label1 = g2.edges[((forward_map[idx1], forward_map[idx2]))][el]
label2 = g1.edges[(n1, n2)][el]
if label1 != label2:
n_es += 1 n_es += 1
break
elif (forward_map[idx2], forward_map[idx1]) in g2.edges(): elif (forward_map[idx2], forward_map[idx1]) in g2.edges():
nb_edges2_cnted += 1 nb_edges2_cnted += 1
# edge labels are different. # edge labels are different.
if g2.edges[((forward_map[idx2], forward_map[idx1]))]['bond_type'] \
!= g1.edges[(n1, n2)]['bond_type']:
n_es += 1
for el in edge_labels:
label1 = g2.edges[((forward_map[idx2], forward_map[idx1]))][el]
label2 = g1.edges[(n1, n2)][el]
if label1 != label2:
n_es += 1
break
# corresponding nodes are in g2, however the edge is removed. # corresponding nodes are in g2, however the edge is removed.
else: else:
n_er += 1 n_er += 1
@@ -242,7 +265,8 @@ def get_nb_edit_operations_letter(g1, g2, forward_map, backward_map):
return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er return n_vi, n_vr, n_vs, sod_vs, n_ei, n_er




def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map,
node_attrs=[], edge_attrs=[]):
"""Compute the number of each edit operations. """Compute the number of each edit operations.
""" """
n_vi = 0 n_vi = 0
@@ -261,7 +285,7 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
else: else:
n_vs += 1 n_vs += 1
sum_squares = 0 sum_squares = 0
for a_name in g1.graph['node_attrs']:
for a_name in node_attrs:
diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name]) diff = float(g1.nodes[nodes1[i]][a_name]) - float(g2.nodes[map_i][a_name])
sum_squares += np.square(diff) sum_squares += np.square(diff)
sod_vs += np.sqrt(sum_squares) sod_vs += np.sqrt(sum_squares)
@@ -284,15 +308,15 @@ def get_nb_edit_operations_nonsymbolic(g1, g2, forward_map, backward_map):
elif (n1_g2, n2_g2) in g2.edges(): elif (n1_g2, n2_g2) in g2.edges():
n_es += 1 n_es += 1
sum_squares = 0 sum_squares = 0
for a_name in g1.graph['edge_attrs']:
diff = float(g1.edges[n1, n2][a_name]) - float(g2.nodes[n1_g2, n2_g2][a_name])
for a_name in edge_attrs:
diff = float(g1.edges[n1, n2][a_name]) - float(g2.edges[n1_g2, n2_g2][a_name])
sum_squares += np.square(diff) sum_squares += np.square(diff)
sod_es += np.sqrt(sum_squares) sod_es += np.sqrt(sum_squares)
elif (n2_g2, n1_g2) in g2.edges(): elif (n2_g2, n1_g2) in g2.edges():
n_es += 1 n_es += 1
sum_squares = 0 sum_squares = 0
for a_name in g1.graph['edge_attrs']:
diff = float(g1.edges[n2, n1][a_name]) - float(g2.nodes[n2_g2, n1_g2][a_name])
for a_name in edge_attrs:
diff = float(g1.edges[n2, n1][a_name]) - float(g2.edges[n2_g2, n1_g2][a_name])
sum_squares += np.square(diff) sum_squares += np.square(diff)
sod_es += np.sqrt(sum_squares) sod_es += np.sqrt(sum_squares)
# corresponding nodes are in g2, however the edge is removed. # corresponding nodes are in g2, however the edge is removed.


+ 62
- 35
gklearn/preimage/median_preimage_generator.py View File

@@ -96,7 +96,10 @@ class MedianPreimageGenerator(PreimageGenerator):
if self.__runtime_precompute_gm is None: if self.__runtime_precompute_gm is None:
raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.') raise Exception('Parameter "runtime_precompute_gm" must be given when using pre-computed Gram matrix.')
self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm self._graph_kernel.gram_matrix_unnorm = self.__gram_matrix_unnorm
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
if self._kernel_options['normalize']:
self._graph_kernel.gram_matrix = self._graph_kernel.normalize_gm(np.copy(self.__gram_matrix_unnorm))
else:
self._graph_kernel.gram_matrix = np.copy(self.__gram_matrix_unnorm)
end_precompute_gm = time.time() end_precompute_gm = time.time()
start -= self.__runtime_precompute_gm start -= self.__runtime_precompute_gm
@@ -259,6 +262,10 @@ class MedianPreimageGenerator(PreimageGenerator):
self.__edit_cost_constants = self.__init_ecc self.__edit_cost_constants = self.__init_ecc
options = self.__ged_options.copy() options = self.__ged_options.copy()
options['edit_cost_constants'] = self.__edit_cost_constants # @todo options['edit_cost_constants'] = self.__edit_cost_constants # @todo
options['node_labels'] = self._dataset.node_labels
options['edge_labels'] = self._dataset.edge_labels
options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs
ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) ged_vec_init, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))] residual_list = [np.sqrt(np.sum(np.square(np.array(ged_vec_init) - dis_k_vec)))]
time_list = [time.time() - time0] time_list = [time.time() - time0]
@@ -297,6 +304,10 @@ class MedianPreimageGenerator(PreimageGenerator):
# compute new GEDs and numbers of edit operations. # compute new GEDs and numbers of edit operations.
options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75]) options = self.__ged_options.copy() # np.array([self.__edit_cost_constants[0], self.__edit_cost_constants[1], 0.75])
options['edit_cost_constants'] = self.__edit_cost_constants # @todo options['edit_cost_constants'] = self.__edit_cost_constants # @todo
options['node_labels'] = self._dataset.node_labels
options['edge_labels'] = self._dataset.edge_labels
options['node_attrs'] = self._dataset.node_attrs
options['edge_attrs'] = self._dataset.edge_attrs
ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel) ged_vec, ged_mat, n_edit_operations = compute_geds(graphs, options=options, parallel=self.__parallel)
residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec)))) residual_list.append(np.sqrt(np.sum(np.square(np.array(ged_vec) - dis_k_vec))))
time_list.append(time.time() - time0) time_list.append(time.time() - time0)
@@ -444,34 +455,10 @@ class MedianPreimageGenerator(PreimageGenerator):
nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]] nb_cost_mat_new = nb_cost_mat[:,[0,1,3,4,5]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
try:
prob.solve(verbose=True)
except MemoryError as error0:
if self._verbose >= 2:
print('\nUsing solver "OSQP" caused a memory error.')
print('the original error message is\n', error0)
print('solver status: ', prob.status)
print('trying solver "CVXOPT" instead...\n')
try:
prob.solve(solver=cp.CVXOPT, verbose=True)
except Exception as error1:
if self._verbose >= 2:
print('\nAn error occured when using solver "CVXOPT".')
print('the original error message is\n', error1)
print('solver status: ', prob.status)
print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n')
prob.solve(solver=cp.MOSEK, verbose=True)
else:
if self._verbose >= 2:
print('solver status: ', prob.status)
else:
if self._verbose >= 2:
print('solver status: ', prob.status)
if self._verbose >= 2:
print()
self.__execute_cvx(prob)
edit_costs_new = x.value edit_costs_new = x.value
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif rw_constraints == '2constraints': elif rw_constraints == '2constraints':
@@ -541,19 +528,17 @@ class MedianPreimageGenerator(PreimageGenerator):
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
self.__execute_cvx(prob)
edit_costs_new = x.value edit_costs_new = x.value
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif is_n_attr and not is_e_attr: elif is_n_attr and not is_e_attr:
nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]] nb_cost_mat_new = nb_cost_mat[:,[0,1,2,3,4]]
x = cp.Variable(nb_cost_mat_new.shape[1]) x = cp.Variable(nb_cost_mat_new.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.001 for i in range(nb_cost_mat_new.shape[1])],
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0] np.array([1.0, 1.0, -1.0, 0.0, 0.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
if self._verbose >= 2:
print(x.value)
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value, np.array([0.0]))) edit_costs_new = np.concatenate((x.value, np.array([0.0])))
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif not is_n_attr and is_e_attr: elif not is_n_attr and is_e_attr:
@@ -563,7 +548,7 @@ class MedianPreimageGenerator(PreimageGenerator):
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])], constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])],
np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] np.array([0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:])) edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), x.value[2:]))
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
else: else:
@@ -572,10 +557,20 @@ class MedianPreimageGenerator(PreimageGenerator):
cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec) cost_fun = cp.sum_squares(nb_cost_mat_new * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]] constraints = [x >= [0.01 for i in range(nb_cost_mat_new.shape[1])]]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
self.__execute_cvx(prob)
edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]), edit_costs_new = np.concatenate((x.value[0:2], np.array([0.0]),
x.value[2:], np.array([0.0]))) x.value[2:], np.array([0.0])))
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
elif self.__ged_options['edit_cost'] == 'CONSTANT': # @todo: node/edge may not labeled.
x = cp.Variable(nb_cost_mat.shape[1])
cost_fun = cp.sum_squares(nb_cost_mat * x - dis_k_vec)
constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])],
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints)
self.__execute_cvx(prob)
edit_costs_new = x.value
residual = np.sqrt(prob.value)
else: else:
# # method 1: simple least square method. # # method 1: simple least square method.
# edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec, # edit_costs_new, residual, _, _ = np.linalg.lstsq(nb_cost_mat, dis_k_vec,
@@ -607,7 +602,7 @@ class MedianPreimageGenerator(PreimageGenerator):
np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0,
np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0]
prob = cp.Problem(cp.Minimize(cost_fun), constraints) prob = cp.Problem(cp.Minimize(cost_fun), constraints)
prob.solve()
self.__execute_cvx(prob)
edit_costs_new = x.value edit_costs_new = x.value
residual = np.sqrt(prob.value) residual = np.sqrt(prob.value)
@@ -616,6 +611,34 @@ class MedianPreimageGenerator(PreimageGenerator):
return edit_costs_new, residual return edit_costs_new, residual
def __execute_cvx(self, prob):
    """Solve ``prob``, switching solvers when the default one runs out of memory.

    The problem is first solved with the default solver. If that raises a
    ``MemoryError`` the solve is retried with ``cp.CVXOPT``; if that attempt
    raises any exception, ``cp.MOSEK`` is tried last and its errors are
    propagated. Progress messages are printed only when ``self._verbose``
    is at least 2, which is also the condition for verbose solver output.
    """
    chatty = self._verbose >= 2

    def report_status():
        # Shared by both successful paths that print the final status.
        if chatty:
            print('solver status: ', prob.status)

    try:
        prob.solve(verbose=chatty)
    except MemoryError as memory_error:
        if chatty:
            print('\nUsing solver "OSQP" caused a memory error.')
            print('the original error message is\n', memory_error)
            print('solver status: ', prob.status)
            print('trying solver "CVXOPT" instead...\n')
        try:
            prob.solve(solver=cp.CVXOPT, verbose=chatty)
        except Exception as cvxopt_error:
            if chatty:
                print('\nAn error occured when using solver "CVXOPT".')
                print('the original error message is\n', cvxopt_error)
                print('solver status: ', prob.status)
                print('trying solver "MOSEK" instead. Notice this solver is commercial and a lisence is required.\n')
            prob.solve(solver=cp.MOSEK, verbose=chatty)
        else:
            report_status()
    else:
        report_status()
    if chatty:
        print()

def __generate_preimage_iam(self): def __generate_preimage_iam(self):
# Set up the ged environment. # Set up the ged environment.
ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible. ged_env = gedlibpy.GEDEnv() # @todo: maybe create a ged_env as a private varible.
@@ -638,6 +661,10 @@ class MedianPreimageGenerator(PreimageGenerator):
# Select the GED algorithm. # Select the GED algorithm.
mge.set_options(mge_options_to_string(options)) mge.set_options(mge_options_to_string(options))
mge.set_label_names(node_labels=self._dataset.node_labels,
edge_labels=self._dataset.edge_labels,
node_attrs=self._dataset.node_attrs,
edge_attrs=self._dataset.edge_attrs)
mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) mge.set_init_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))
mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options)) mge.set_descent_method(self.__ged_options['method'], ged_options_to_string(self.__ged_options))


+ 7
- 5
gklearn/preimage/utils.py View File

@@ -37,7 +37,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
dataset_all.trim_dataset(edge_required=edge_required) dataset_all.trim_dataset(edge_required=edge_required)
if irrelevant_labels is not None: if irrelevant_labels is not None:
dataset_all.remove_labels(**irrelevant_labels) dataset_all.remove_labels(**irrelevant_labels)
# dataset_all.cut_graphs(range(0, 100))
# dataset_all.cut_graphs(range(0, 10))
datasets = split_dataset_by_target(dataset_all) datasets = split_dataset_by_target(dataset_all)


if save_results: if save_results:
@@ -67,8 +67,8 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz' gm_fname = dir_save + 'gram_matrix_unnorm.' + ds_name + '.' + kernel_options['name'] + '.gm.npz'
gmfile_exist = os.path.isfile(os.path.abspath(gm_fname)) gmfile_exist = os.path.isfile(os.path.abspath(gm_fname))
if gmfile_exist: if gmfile_exist:
gmfile = np.load(gm_fname)
gram_matrix_unnorm_list = gmfile['gram_matrix_unnorm_list']
gmfile = np.load(gm_fname, allow_pickle=True) # @todo: may not be safe.
gram_matrix_unnorm_list = [item for item in gmfile['gram_matrix_unnorm_list']]
time_precompute_gm_list = gmfile['run_time_list'].tolist() time_precompute_gm_list = gmfile['run_time_list'].tolist()
else: else:
gram_matrix_unnorm_list = [] gram_matrix_unnorm_list = []
@@ -87,6 +87,7 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
print('start generating preimage for each class of target...') print('start generating preimage for each class of target...')
idx_offset = 0
for idx, dataset in enumerate(datasets): for idx, dataset in enumerate(datasets):
target = dataset.targets[0] target = dataset.targets[0]
print('\ntarget =', target, '\n') print('\ntarget =', target, '\n')
@@ -96,14 +97,15 @@ def generate_median_preimages_by_class(ds_name, mpg_options, kernel_options, ged
num_graphs = len(dataset.graphs) num_graphs = len(dataset.graphs)
if num_graphs < 2: if num_graphs < 2:
print('\nnumber of graphs = ', num_graphs, ', skip.\n') print('\nnumber of graphs = ', num_graphs, ', skip.\n')
idx_offset += 1
continue continue
# 2. set parameters. # 2. set parameters.
print('2. initializing mpg and setting parameters...') print('2. initializing mpg and setting parameters...')
if load_gm: if load_gm:
if gmfile_exist: if gmfile_exist:
mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx]
mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx]
mpg_options['gram_matrix_unnorm'] = gram_matrix_unnorm_list[idx - idx_offset]
mpg_options['runtime_precompute_gm'] = time_precompute_gm_list[idx - idx_offset]
mpg = MedianPreimageGenerator() mpg = MedianPreimageGenerator()
mpg.dataset = dataset mpg.dataset = dataset
mpg.set_options(**mpg_options.copy()) mpg.set_options(**mpg_options.copy())


+ 23
- 15
gklearn/utils/dataset.py View File

@@ -67,18 +67,35 @@ class Dataset(object):
def load_predefined_dataset(self, ds_name): def load_predefined_dataset(self, ds_name):
current_path = os.path.dirname(os.path.realpath(__file__)) + '/' current_path = os.path.dirname(os.path.realpath(__file__)) + '/'
if ds_name == 'Letter-high': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
if ds_name == 'acyclic':
pass
elif ds_name == 'COIL-DEL':
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Letter-med': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
elif ds_name == 'COIL-RAG':
ds_file = current_path + '../../datasets/COIL-RAG/COIL-RAG_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Letter-low': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
elif ds_name == 'COLORS-3':
ds_file = current_path + '../../datasets/COLORS-3/COLORS-3_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Fingerprint': elif ds_name == 'Fingerprint':
ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt' ds_file = current_path + '../../datasets/Fingerprint/Fingerprint_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'FRANKENSTEIN':
ds_file = current_path + '../../datasets/FRANKENSTEIN/FRANKENSTEIN_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Letter-high': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-high_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Letter-low': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-low_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Letter-med': # node non-symb
ds_file = current_path + '../../datasets/Letter-high/Letter-med_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'MUTAG':
ds_file = current_path + '../../datasets/MUTAG/MUTAG_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'SYNTHETIC': elif ds_name == 'SYNTHETIC':
pass pass
elif ds_name == 'SYNTHETICnew': elif ds_name == 'SYNTHETICnew':
@@ -86,15 +103,6 @@ class Dataset(object):
self.__graphs, self.__targets, label_names = load_dataset(ds_file) self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'Synthie': elif ds_name == 'Synthie':
pass pass
elif ds_name == 'COIL-DEL':
ds_file = current_path + '../../datasets/COIL-DEL/COIL-DEL_A.txt'
self.__graphs, self.__targets, label_names = load_dataset(ds_file)
elif ds_name == 'COIL-RAG':
pass
elif ds_name == 'COLORS-3':
pass
elif ds_name == 'FRANKENSTEIN':
pass
self.__node_labels = label_names['node_labels'] self.__node_labels = label_names['node_labels']
self.__node_attrs = label_names['node_attrs'] self.__node_attrs = label_names['node_attrs']


+ 16
- 8
gklearn/utils/graph_files.py View File

@@ -474,6 +474,7 @@ def load_tud(filename):


label_names = {'node_labels': [], 'node_attrs': [], label_names = {'node_labels': [], 'node_attrs': [],
'edge_labels': [], 'edge_attrs': []} 'edge_labels': [], 'edge_attrs': []}
class_label_map = None
class_label_map_strings = [] class_label_map_strings = []
content_rm = open(frm).read().splitlines() content_rm = open(frm).read().splitlines()
i = 0 i = 0
@@ -538,20 +539,32 @@ def load_tud(filename):
else: else:
label_names = {'node_labels': [], 'node_attrs': [], label_names = {'node_labels': [], 'node_attrs': [],
'edge_labels': [], 'edge_attrs': []} 'edge_labels': [], 'edge_attrs': []}
class_label_map = None


content_gi = open(fgi).read().splitlines() # graph indicator content_gi = open(fgi).read().splitlines() # graph indicator
content_am = open(fam).read().splitlines() # adjacency matrix content_am = open(fam).read().splitlines() # adjacency matrix
content_gl = open(fgl).read().splitlines() # graph labels
# load targets.
if 'fgl' in locals():
content_targets = open(fgl).read().splitlines() # targets (classification)
targets = [float(i) for i in content_targets]
elif 'fga' in locals():
content_targets = open(fga).read().splitlines() # targets (regression)
targets = [int(i) for i in content_targets]
if class_label_map is not None:
targets = [class_label_map[t] for t in targets]
else:
raise Exception('Can not find targets file. Please make sure there is a "', ds_name, '_graph_labels.txt" or "', ds_name, '_graph_attributes.txt"', 'file in your dataset folder.')


# create graphs and add nodes # create graphs and add nodes
data = [nx.Graph(name=str(i)) for i in range(0, len(content_gl))]
data = [nx.Graph(name=str(i)) for i in range(0, len(content_targets))]
if 'fnl' in locals(): if 'fnl' in locals():
content_nl = open(fnl).read().splitlines() # node labels content_nl = open(fnl).read().splitlines() # node labels
for idx, line in enumerate(content_gi): for idx, line in enumerate(content_gi):
# transfer to int first in case of unexpected blanks # transfer to int first in case of unexpected blanks
data[int(line) - 1].add_node(idx) data[int(line) - 1].add_node(idx)
labels = [l.strip() for l in content_nl[idx].split(',')] labels = [l.strip() for l in content_nl[idx].split(',')]
if label_names['node_labels'] == []:
if label_names['node_labels'] == []: # @todo: need fix bug.
for i, label in enumerate(labels): for i, label in enumerate(labels):
l_name = 'label_' + str(i) l_name = 'label_' + str(i)
data[int(line) - 1].nodes[idx][l_name] = label data[int(line) - 1].nodes[idx][l_name] = label
@@ -619,11 +632,6 @@ def load_tud(filename):
for i, a_name in enumerate(label_names['edge_attrs']): for i, a_name in enumerate(label_names['edge_attrs']):
data[g].edges[n[0], n[1]][a_name] = attrs[i] data[g].edges[n[0], n[1]][a_name] = attrs[i]


# load targets.
targets = [int(i) for i in content_gl]
if 'class_label_map' in locals():
targets = [class_label_map[t] for t in targets]

return data, targets, label_names return data, targets, label_names






Loading…
Cancel
Save