|
|
from gklearn.utils.kernels import deltakernel, gaussiankernel, kernelproduct, polynomialkernel
|
|
|
from gklearn.kernels.structuralspKernel import structuralspkernel |
|
|
|
from gklearn.kernels.treeletKernel import treeletkernel |
|
|
|
from gklearn.kernels.weisfeilerLehmanKernel import weisfeilerlehmankernel |
|
|
|
from gklearn.utils import Dataset |
|
|
|
import csv |
|
|
|
import matplotlib.pyplot as plt |
|
|
|
import networkx as nx |
|
|
|
|
|
|
|
|
|
|
|
def generate_median_preimage_by_class(ds_name, mpg_options, kernel_options, ged_options, mge_options, save_results=True, save_medians=True, plot_medians=True, dir_save='', ):
    """Generate a median preimage for every class (target) of a predefined dataset.

    For each target class with at least two graphs, a MedianPreimageGenerator
    is configured and run. Per-class results are optionally appended to CSV
    detail/summary files; the set median, generated median and the closest
    graph from the dataset are optionally saved as GXL files and (for the
    Letter datasets only) plotted as EPS figures. After all classes are
    processed, averaged statistics over the classes are appended to the
    summary file.

    Parameters
    ----------
    ds_name : str
        Name of a dataset known to ``Dataset.load_predefined_dataset``.
    mpg_options, kernel_options, ged_options, mge_options : dict
        Option dicts forwarded to the MedianPreimageGenerator (copies are
        passed, so the caller's dicts are not mutated). ``kernel_options``
        must contain 'name'; ``ged_options`` must contain 'edit_cost',
        'method' and 'attr_distance'; ``mpg_options`` must contain
        'fit_method'.
    save_results : bool
        If True, append per-class rows and a final averaged row to CSV files
        in ``dir_save``.
    save_medians : bool
        If True, save the median graphs under ``dir_save + 'medians/'``.
    plot_medians : bool
        If True (and ``save_medians`` is True), plot the saved medians; only
        effective for the Letter-high/med/low datasets.
    dir_save : str
        Output directory prefix. NOTE(review): it is string-concatenated with
        file names, so it presumably must end with a path separator when
        non-empty — confirm against callers.
    """
    from gklearn.preimage import MedianPreimageGenerator
    from gklearn.utils import split_dataset_by_target
    from gklearn.utils.graphfiles import saveGXL

    # 1. get dataset.
    print('getting dataset...')
    dataset_all = Dataset()
    dataset_all.load_predefined_dataset(ds_name)
    # one sub-dataset per distinct target value.
    datasets = split_dataset_by_target(dataset_all)
    # dataset.cut_graphs(range(0, 10))

    if save_results:
        # create result files.
        print('creating output files...')
        fn_output_detail, fn_output_summary = __init_output_file(ds_name, kernel_options['name'], mpg_options['fit_method'], dir_save)

    # per-class accumulators, used for the final averaged summary row.
    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    time_precompute_gm_list = []
    time_optimize_ec_list = []
    time_generate_list = []
    time_total_list = []
    itrs_list = []
    converged_list = []
    num_updates_ecc_list = []
    # counters of [better, same, worse] outcomes across classes.
    nb_sod_sm2gm = [0, 0, 0]
    nb_dis_k_sm2gm = [0, 0, 0]
    nb_dis_k_gi2sm = [0, 0, 0]
    nb_dis_k_gi2gm = [0, 0, 0]
    # repeats_better_sod_sm2gm = []
    # repeats_better_dis_k_sm2gm = []
    # repeats_better_dis_k_gi2sm = []
    # repeats_better_dis_k_gi2gm = []

    print('start generating preimage for each class of target...')
    for dataset in datasets:
        print('\ntarget =', dataset.targets[0], '\n')
        num_graphs = len(dataset.graphs)

        # a median of fewer than two graphs is meaningless; skip such classes.
        if num_graphs < 2:
            print('\nnumber of graphs = ', num_graphs, ', skip.\n')
            continue

        # 2. set parameters.
        print('1. initializing mpg and setting parameters...')
        mpg = MedianPreimageGenerator()
        mpg.dataset = dataset
        # copies keep the caller's option dicts unmodified.
        mpg.set_options(**mpg_options.copy())
        mpg.kernel_options = kernel_options.copy()
        mpg.ged_options = ged_options.copy()
        mpg.mge_options = mge_options.copy()

        # 3. compute median preimage.
        print('2. computing median preimage...')
        mpg.run()
        results = mpg.get_results()

        # write result detail.
        if save_results:
            print('writing results to files...')
            # relation labels ('better'/'same'/'worse') between the SODs /
            # kernel distances of the set median (SM), generated median (GM)
            # and the best graph from the dataset (gi).
            sod_sm2gm = get_relations(np.sign(results['sod_gen_median'] - results['sod_set_median']))
            dis_k_sm2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_set_median']))
            dis_k_gi2sm = get_relations(np.sign(results['k_dis_set_median'] - results['k_dis_dataset']))
            dis_k_gi2gm = get_relations(np.sign(results['k_dis_gen_median'] - results['k_dis_dataset']))

            f_detail = open(dir_save + fn_output_detail, 'a')
            csv.writer(f_detail).writerow([ds_name, kernel_options['name'],
                ged_options['edit_cost'], ged_options['method'],
                ged_options['attr_distance'], mpg_options['fit_method'],
                num_graphs, dataset.targets[0], 1,
                results['sod_set_median'], results['sod_gen_median'],
                results['k_dis_set_median'], results['k_dis_gen_median'],
                results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
                dis_k_gi2sm, dis_k_gi2gm, results['edit_cost_constants'],
                results['runtime_precompute_gm'], results['runtime_optimize_ec'],
                results['runtime_generate_preimage'], results['runtime_total'],
                results['itrs'], results['converged'],
                results['num_updates_ecc']])
            f_detail.close()

            # compute result summary.
            sod_sm_list.append(results['sod_set_median'])
            sod_gm_list.append(results['sod_gen_median'])
            dis_k_sm_list.append(results['k_dis_set_median'])
            dis_k_gm_list.append(results['k_dis_gen_median'])
            dis_k_gi_min_list.append(results['k_dis_dataset'])
            time_precompute_gm_list.append(results['runtime_precompute_gm'])
            time_optimize_ec_list.append(results['runtime_optimize_ec'])
            time_generate_list.append(results['runtime_generate_preimage'])
            time_total_list.append(results['runtime_total'])
            itrs_list.append(results['itrs'])
            converged_list.append(results['converged'])
            num_updates_ecc_list.append(results['num_updates_ecc'])
            # # SOD SM -> GM
            if results['sod_set_median'] > results['sod_gen_median']:
                nb_sod_sm2gm[0] += 1
                # repeats_better_sod_sm2gm.append(1)
            elif results['sod_set_median'] == results['sod_gen_median']:
                nb_sod_sm2gm[1] += 1
            elif results['sod_set_median'] < results['sod_gen_median']:
                nb_sod_sm2gm[2] += 1
            # # dis_k SM -> GM
            if results['k_dis_set_median'] > results['k_dis_gen_median']:
                nb_dis_k_sm2gm[0] += 1
                # repeats_better_dis_k_sm2gm.append(1)
            elif results['k_dis_set_median'] == results['k_dis_gen_median']:
                nb_dis_k_sm2gm[1] += 1
            elif results['k_dis_set_median'] < results['k_dis_gen_median']:
                nb_dis_k_sm2gm[2] += 1
            # # dis_k gi -> SM
            if results['k_dis_dataset'] > results['k_dis_set_median']:
                nb_dis_k_gi2sm[0] += 1
                # repeats_better_dis_k_gi2sm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_set_median']:
                nb_dis_k_gi2sm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_set_median']:
                nb_dis_k_gi2sm[2] += 1
            # # dis_k gi -> GM
            if results['k_dis_dataset'] > results['k_dis_gen_median']:
                nb_dis_k_gi2gm[0] += 1
                # repeats_better_dis_k_gi2gm.append(1)
            elif results['k_dis_dataset'] == results['k_dis_gen_median']:
                nb_dis_k_gi2gm[1] += 1
            elif results['k_dis_dataset'] < results['k_dis_gen_median']:
                nb_dis_k_gi2gm[2] += 1

            # write result summary for each letter.
            f_summary = open(dir_save + fn_output_summary, 'a')
            csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
                ged_options['edit_cost'], ged_options['method'],
                ged_options['attr_distance'], mpg_options['fit_method'],
                num_graphs, dataset.targets[0],
                results['sod_set_median'], results['sod_gen_median'],
                results['k_dis_set_median'], results['k_dis_gen_median'],
                results['k_dis_dataset'], sod_sm2gm, dis_k_sm2gm,
                dis_k_gi2sm, dis_k_gi2gm,
                results['runtime_precompute_gm'], results['runtime_optimize_ec'],
                results['runtime_generate_preimage'], results['runtime_total'],
                results['itrs'], results['converged'],
                results['num_updates_ecc'], nb_sod_sm2gm,
                nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm])
            f_summary.close()

        # save median graphs.
        if save_medians:
            fn_pre_sm = dir_save + 'medians/set_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.set_median, fn_pre_sm + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
            fn_pre_gm = dir_save + 'medians/gen_median.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.gen_median, fn_pre_gm + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)
            fn_best_dataset = dir_save + 'medians/g_best_dataset.' + mpg_options['fit_method'] + '.k' + str(num_graphs) + '.y' + str(dataset.targets[0]) + '.repeat' + str(1)
            saveGXL(mpg.best_from_dataset, fn_best_dataset + '.gxl', method='default',
                    node_labels=dataset.node_labels, edge_labels=dataset.edge_labels,
                    node_attrs=dataset.node_attrs, edge_attrs=dataset.edge_attrs)

        # plot median graphs.
        if plot_medians and save_medians:
            # drawing relies on 'x'/'y' node attributes, which only the
            # Letter datasets are handled for here.
            if ds_name == 'Letter-high' or ds_name == 'Letter-med' or ds_name == 'Letter-low':
                draw_Letter_graph(mpg.set_median, fn_pre_sm)
                draw_Letter_graph(mpg.gen_median, fn_pre_gm)
                draw_Letter_graph(mpg.best_from_dataset, fn_best_dataset)

    # write result summary for each letter.
    if save_results:
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        time_precompute_gm_mean = np.mean(time_precompute_gm_list)
        time_optimize_ec_mean = np.mean(time_optimize_ec_list)
        time_generate_mean = np.mean(time_generate_list)
        time_total_mean = np.mean(time_total_list)
        itrs_mean = np.mean(itrs_list)
        num_converged = np.sum(converged_list)
        num_updates_ecc_mean = np.mean(num_updates_ecc_list)
        # relations between the *averaged* quantities.
        sod_sm2gm_mean = get_relations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = get_relations(np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = get_relations(np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        f_summary = open(dir_save + fn_output_summary, 'a')
        # NOTE(review): num_graphs here is the count of the *last* processed
        # class, not a total — looks intentional but worth confirming.
        csv.writer(f_summary).writerow([ds_name, kernel_options['name'],
            ged_options['edit_cost'], ged_options['method'],
            ged_options['attr_distance'], mpg_options['fit_method'],
            num_graphs, 'all',
            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
            dis_k_gi2sm_mean, dis_k_gi2gm_mean,
            time_precompute_gm_mean, time_optimize_ec_mean,
            time_generate_mean, time_total_mean, itrs_mean,
            num_converged, num_updates_ecc_mean])
        f_summary.close()

    print('\ncomplete.')
|
|
|
|
|
|
|
|
|
|
|
def __init_output_file(ds_name, gkernel, fit_method, dir_output):
    """Create the detail and summary CSV result files and write header rows.

    Both files are opened in append mode, so a header row is added even if
    the file already exists. Returns the pair of file names (without the
    directory prefix). ``fit_method`` is kept for interface compatibility
    even though it is no longer part of the file names.
    """
    detail_header = ['dataset', 'graph kernel', 'edit cost',
                     'GED method', 'attr distance', 'fit method', 'k',
                     'target', 'repeat', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                     'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
                     'dis_k gi -> GM', 'edit cost constants', 'time precompute gm',
                     'time optimize ec', 'time generate preimage', 'time total',
                     'itrs', 'converged', 'num updates ecc']
    summary_header = ['dataset', 'graph kernel', 'edit cost',
                      'GED method', 'attr distance', 'fit method', 'k',
                      'target', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
                      'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
                      'dis_k gi -> GM', 'time precompute gm', 'time optimize ec',
                      'time generate preimage', 'time total', 'itrs', 'num converged',
                      'num updates ecc', '# SOD SM -> GM', '# dis_k SM -> GM',
                      '# dis_k gi -> SM', '# dis_k gi -> GM']
    # 'repeats better SOD SM -> GM',
    # 'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
    # 'repeats better dis_k gi -> GM'

    # fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
    fn_output_detail = 'results_detail.' + ds_name + '.' + gkernel + '.csv'
    with open(dir_output + fn_output_detail, 'a') as f_detail:
        csv.writer(f_detail).writerow(detail_header)

    # fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.' + fit_method + '.csv'
    fn_output_summary = 'results_summary.' + ds_name + '.' + gkernel + '.csv'
    with open(dir_output + fn_output_summary, 'a') as f_summary:
        csv.writer(f_summary).writerow(summary_header)

    return fn_output_detail, fn_output_summary
|
|
|
|
|
|
|
|
|
|
|
def get_relations(sign):
    """Map the sign of a difference to a relation label.

    Returns 'better' for -1, 'same' for 0 and 'worse' for +1; any other
    value yields None (matching the original if-chain's fall-through).
    """
    relation_by_sign = {-1: 'better', 0: 'same', 1: 'worse'}
    return relation_by_sign.get(sign)
|
|
|
|
|
|
|
|
|
|
|
# Draw the current median graph.
|
|
|
def draw_Letter_graph(graph, file_prefix):
    """Draw a Letter graph and save it as ``file_prefix + '.eps'``.

    Node positions are taken from the 'x'/'y' node attributes of the graph.

    Parameters
    ----------
    graph : networkx.Graph
        Graph whose nodes carry string/float 'x' and 'y' attributes.
    file_prefix : str
        Output path without extension; '.eps' is appended.
    """
    plt.figure()
    pos = {}
    for n in graph.nodes:
        # fix: Graph.node was removed in networkx 2.4; Graph.nodes is the
        # supported accessor for node attribute dicts.
        pos[n] = np.array([float(graph.nodes[n]['x']), float(graph.nodes[n]['y'])])
    nx.draw_networkx(graph, pos)
    plt.savefig(file_prefix + '.eps', format='eps', dpi=300)
    # plt.show()
    plt.clf()
|
|
|
|
|
|
|
|
|
|
|
def remove_edges(Gn):
    """Strip all edge attributes from every graph in Gn, in place.

    Fix: the original ran the identical clearing loop twice in a row
    (duplicated body, most likely a botched merge); a single pass already
    empties every edge attribute dict.

    Parameters
    ----------
    Gn : iterable
        Graphs exposing ``edges(data=True)`` yielding (u, v, attr_dict).
    """
    for G in Gn:
        for _, _, attrs in G.edges(data=True):
            attrs.clear()
|
|
|
|
|
|
|
def dis_gstar(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
    """Kernel-space distance between graph ``idx_g`` and the weighted mean
    (g*) of the graphs indexed by ``idx_gi``.

    Computes sqrt(k(g,g) - 2 * sum_i alpha_i k(g, g_i) + term3), where
    term3 = sum_{i,j} alpha_i alpha_j k(g_i, g_j). When ``withterm3`` is
    True the caller supplies a precomputed ``term3`` (it is constant over
    many calls); when False it is computed here and added to the given
    ``term3`` start value.

    Fix: the original carried an unreachable verbatim duplicate of the whole
    body after the return statement; the dead code is removed.

    Parameters
    ----------
    idx_g : int
        Row/column index of the query graph in ``Kmatrix``.
    idx_gi : sequence of int
        Indices of the graphs forming g*.
    alpha : sequence of float
        Weights of the graphs in ``idx_gi`` (same length).
    Kmatrix : ndarray
        Gram matrix of the kernel.
    """
    term1 = Kmatrix[idx_g, idx_g]
    term2 = 0
    for i, a in enumerate(alpha):
        term2 += a * Kmatrix[idx_g, idx_gi[i]]
    term2 *= 2
    if withterm3 == False:
        for i1, a1 in enumerate(alpha):
            for i2, a2 in enumerate(alpha):
                term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    return np.sqrt(term1 - term2 + term3)
|
|
|
|
|
|
|
|
|
|
|
def compute_k_dis(idx_g, idx_gi, alpha, Kmatrix, term3=0, withterm3=True):
    """Kernel-space distance between graph ``idx_g`` and the weighted mean
    of the graphs indexed by ``idx_gi`` (same contract as ``dis_gstar``).

    NOTE(review): this function duplicates ``dis_gstar`` exactly; kept as an
    independent copy so existing callers of either name keep working.

    Fix: the original carried an unreachable verbatim duplicate of the whole
    body after the return statement; the dead code is removed.

    Parameters
    ----------
    idx_g : int
        Row/column index of the query graph in ``Kmatrix``.
    idx_gi : sequence of int
        Indices of the graphs forming the weighted mean.
    alpha : sequence of float
        Weights of the graphs in ``idx_gi`` (same length).
    Kmatrix : ndarray
        Gram matrix of the kernel.
    term3 : float
        Precomputed sum_{i,j} alpha_i alpha_j k(g_i, g_j) when
        ``withterm3`` is True; otherwise the start value it is added to.
    """
    term1 = Kmatrix[idx_g, idx_g]
    term2 = 0
    for i, a in enumerate(alpha):
        term2 += a * Kmatrix[idx_g, idx_gi[i]]
    term2 *= 2
    if withterm3 == False:
        for i1, a1 in enumerate(alpha):
            for i2, a2 in enumerate(alpha):
                term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    return np.sqrt(term1 - term2 + term3)
|
|
|
|
|
|
|
|
|
|
|
def compute_kernel(Gn, graph_kernel, node_label, edge_label, verbose, parallel='imap_unordered'):
    """Compute the normalized Gram matrix of the graphs in ``Gn``.

    Dispatches on ``graph_kernel`` to the corresponding kernel function,
    then normalizes the matrix so K'[i][j] = K[i][j] / sqrt(K[i][i]*K[j][j])
    (unit self-similarity). The matrix is normalized in place and returned.

    Fixes: the original body was duplicated verbatim after the return
    statement (dead code); the spkernel branch allocated an ``np.empty``
    matrix that was immediately overwritten; and an unsupported kernel name
    fell through to the normalization step and crashed with NameError — it
    now raises a ValueError up front.

    Parameters
    ----------
    Gn : list
        Graphs to compute the kernel over.
    graph_kernel : str
        One of 'marginalizedkernel', 'untilhpathkernel', 'spkernel',
        'structuralspkernel', 'treeletkernel', 'weisfeilerlehmankernel'.
    node_label, edge_label : str
        Attribute names forwarded to the kernel function.
    verbose : bool or int
        Verbosity flag forwarded to the kernel function.
    parallel : str
        Parallelization scheme for the kernels that accept one.
    """
    if graph_kernel == 'marginalizedkernel':
        Kmatrix, _ = marginalizedkernel(Gn, node_label=node_label, edge_label=edge_label,
                                        p_quit=0.03, n_iteration=10, remove_totters=False,
                                        n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'untilhpathkernel':
        Kmatrix, _ = untilhpathkernel(Gn, node_label=node_label, edge_label=edge_label,
                                      depth=7, k_func='MinMax', compute_method='trie',
                                      parallel=parallel,
                                      n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'spkernel':
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        Kmatrix, _, idx = spkernel(Gn, node_label=node_label, node_kernels=
                                   {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel},
                                   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'structuralspkernel':
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        sub_kernels = {'symb': deltakernel, 'nsymb': gaussiankernel, 'mix': mixkernel}
        Kmatrix, _ = structuralspkernel(Gn, node_label=node_label,
                                        edge_label=edge_label, node_kernels=sub_kernels,
                                        edge_kernels=sub_kernels,
                                        parallel=parallel, n_jobs=multiprocessing.cpu_count(),
                                        verbose=verbose)
    elif graph_kernel == 'treeletkernel':
        # polynomial sub-kernel of degree 2 with additive constant 1e5.
        pkernel = functools.partial(polynomialkernel, d=2, c=1e5)
        # pkernel = functools.partial(gaussiankernel, gamma=1e-6)
        mixkernel = functools.partial(kernelproduct, deltakernel, gaussiankernel)
        Kmatrix, _ = treeletkernel(Gn, node_label=node_label, edge_label=edge_label,
                                   sub_kernel=pkernel, parallel=parallel,
                                   n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    elif graph_kernel == 'weisfeilerlehmankernel':
        Kmatrix, _ = weisfeilerlehmankernel(Gn, node_label=node_label, edge_label=edge_label,
                                            height=4, base_kernel='subtree', parallel=None,
                                            n_jobs=multiprocessing.cpu_count(), verbose=verbose)
    else:
        raise ValueError('Unknown graph kernel: ' + str(graph_kernel))

    # normalization: scale to unit self-similarity, keeping symmetry.
    Kmatrix_diag = Kmatrix.diagonal().copy()
    for i in range(len(Kmatrix)):
        for j in range(i, len(Kmatrix)):
            Kmatrix[i][j] /= np.sqrt(Kmatrix_diag[i] * Kmatrix_diag[j])
            Kmatrix[j][i] = Kmatrix[i][j]
    return Kmatrix
|
|
|
|
|
|
|
|
|
|
|
def gram2distances(Kmatrix):
    """Convert a Gram matrix to the induced kernel-space distance matrix.

    d(i, j) = sqrt(k(i,i) + k(j,j) - 2*k(i,j)).

    Fix: the original carried an unreachable verbatim duplicate of the body
    after the return statement; the dead code is removed.

    Parameters
    ----------
    Kmatrix : ndarray
        Square Gram matrix.

    Returns
    -------
    ndarray
        Matrix of pairwise distances, same shape as ``Kmatrix``.
    """
    dmatrix = np.zeros((len(Kmatrix), len(Kmatrix)))
    for i1 in range(len(Kmatrix)):
        for i2 in range(len(Kmatrix)):
            dmatrix[i1, i2] = Kmatrix[i1, i1] + Kmatrix[i2, i2] - 2 * Kmatrix[i1, i2]
    dmatrix = np.sqrt(dmatrix)
    return dmatrix
|
|
|
|
|
|
|
|
|
|
|
def kernel_distance_matrix(Gn, node_label, edge_label, Kmatrix=None,
                           gkernel=None, verbose=True):
    """Pairwise kernel-induced distance matrix of the graphs in ``Gn``.

    If ``Kmatrix`` is not given, it is computed with ``compute_kernel`` using
    ``gkernel``, ``node_label`` and ``edge_label``. Tiny negative squared
    distances from floating-point error are clamped to zero; larger negative
    values raise.

    Fixes: the original carried an unreachable duplicate of the body after
    the return statement, including a stray fragment of the signature
    (``gkernel=None, verbose=True):``) that made the file unparseable; the
    dead/garbled code is removed.

    Returns
    -------
    (dis_mat, dis_max, dis_min, dis_mean)
        Distance matrix, its maximum, its minimum over *non-zero* entries,
        and its mean.
    """
    dis_mat = np.empty((len(Gn), len(Gn)))
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, verbose)
    for i in range(len(Gn)):
        for j in range(i, len(Gn)):
            # squared distance in the kernel feature space.
            dis = Kmatrix[i, i] + Kmatrix[j, j] - 2 * Kmatrix[i, j]
            if dis < 0:
                if dis > -1e-10:
                    # clamp tiny negatives caused by floating-point error.
                    dis = 0
                else:
                    raise ValueError('The distance is negative.')
            dis_mat[i, j] = np.sqrt(dis)
            dis_mat[j, i] = dis_mat[i, j]
    dis_max = np.max(np.max(dis_mat))
    dis_min = np.min(np.min(dis_mat[dis_mat != 0]))
    dis_mean = np.mean(np.mean(dis_mat))
    return dis_mat, dis_max, dis_min, dis_mean
|
|
|
|
|
|
|
|
|
|
|
def get_same_item_indices(ls):
    """Get the indices of the same items in a list. Return a dict keyed by items.

    Fix: the original carried an unreachable verbatim duplicate of the body
    (docstring included) after the return statement; the dead code is
    removed. The grouping now uses ``dict.setdefault`` — same result.

    Parameters
    ----------
    ls : iterable
        Items to group; items must be hashable.

    Returns
    -------
    dict
        Maps each distinct item to the ascending list of its indices.
    """
    idx_dict = {}
    for idx, item in enumerate(ls):
        idx_dict.setdefault(item, []).append(idx)
    return idx_dict
|
|
|
|
|
|
|
|
|
|
|
def k_nearest_neighbors_to_median_in_kernel_space(Gn, Kmatrix=None, gkernel=None,
                                                  node_label=None, edge_label=None):
    """Compute kernel-space distances from each graph to the uniform-weight
    median of ``Gn``.

    NOTE(review): this function appears unfinished — it references
    ``idx_gi``, ``Gn_init``, ``dis_all``, ``tqdm`` and ``sys``, none of
    which are defined or imported in this scope, so calling it raises
    NameError; it also appends to ``dis_all`` while initializing
    ``dis_k_all``, and returns nothing. The remaining logic is preserved
    untouched pending a proper fix.

    Fix applied here: the original carried a verbatim duplicate of the body
    after this one, including a stray fragment of the signature
    (``node_label=None, edge_label=None):``) that made the file
    unparseable; the dead/garbled code is removed.
    """
    dis_k_all = []  # distance between g_star and each graph.
    alpha = [1 / len(Gn)] * len(Gn)
    if Kmatrix is None:
        Kmatrix = compute_kernel(Gn, gkernel, node_label, edge_label, True)
    # constant term of the squared distance: sum_{i,j} a_i a_j k(g_i, g_j).
    term3 = 0
    for i1, a1 in enumerate(alpha):
        for i2, a2 in enumerate(alpha):
            term3 += a1 * a2 * Kmatrix[idx_gi[i1], idx_gi[i2]]
    for ig, g in tqdm(enumerate(Gn_init), desc='computing distances', file=sys.stdout):
        dtemp = dis_gstar(ig, idx_gi, alpha, Kmatrix, term3=term3)
        dis_all.append(dtemp)
|
|
|
|
|
|
|
|
|
|
|
def normalize_distance_matrix(D):
    """Min-max normalize a distance matrix to the range [0, 1].

    Fix: the original carried an unreachable verbatim duplicate of the body
    after the return statement; the dead code is removed.

    Parameters
    ----------
    D : ndarray
        Distance matrix. Must contain at least two distinct values,
        otherwise the division is by zero.

    Returns
    -------
    ndarray
        ``(D - D.min()) / (D.max() - D.min())``, element-wise.
    """
    max_value = np.amax(D)
    min_value = np.amin(D)
    return (D - min_value) / (max_value - min_value)