From 47b13b4bb5f86efceab81b012711cda533869b0f Mon Sep 17 00:00:00 2001 From: jajupmochi Date: Fri, 1 Jan 2021 16:21:22 +0100 Subject: [PATCH] Add GED fit distance experiments. --- .../experiments/thesis/ged/fit_distances/README.md | 11 + .../thesis/ged/fit_distances/distances.py | 43 +++ .../experiments/thesis/ged/fit_distances/ged.py | 85 +++++ .../fit_distances/ged_fit_distance_results_plot.py | 391 +++++++++++++++++++++ .../thesis/ged/fit_distances/learning.py | 108 ++++++ gklearn/experiments/thesis/ged/fit_distances/ml.py | 66 ++++ .../thesis/ged/fit_distances/optim_costs.py | 136 +++++++ .../thesis/ged/fit_distances/run_xps.py | 100 ++++++ .../experiments/thesis/ged/fit_distances/utils.py | 15 + 9 files changed, 955 insertions(+) create mode 100644 gklearn/experiments/thesis/ged/fit_distances/README.md create mode 100644 gklearn/experiments/thesis/ged/fit_distances/distances.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/ged.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/learning.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/ml.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/optim_costs.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/run_xps.py create mode 100644 gklearn/experiments/thesis/ged/fit_distances/utils.py diff --git a/gklearn/experiments/thesis/ged/fit_distances/README.md b/gklearn/experiments/thesis/ged/fit_distances/README.md new file mode 100644 index 0000000..367a92d --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/README.md @@ -0,0 +1,11 @@ +# Fit Distances + +# Run xp: +``` +python3 -m pip install graphkit-learn +python3 run_xps.py +``` + +# Run xp (deprecated). 
+export PYTHONPATH="/path/to/gedlibpy:/path/to/py-graph" +python optim_costs.py dataset output_file diff --git a/gklearn/experiments/thesis/ged/fit_distances/distances.py b/gklearn/experiments/thesis/ged/fit_distances/distances.py new file mode 100644 index 0000000..3e27eb3 --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/distances.py @@ -0,0 +1,43 @@ +import numpy as np + + +def sum_squares(a, b): + """ + Return the sum of squares of the difference between a and b, aka MSE + """ + return np.sum([(a[i] - b[i])**2 for i in range(len(a))]) + + +def euclid_d(x, y): + """ + 1D euclidean distance + """ + return np.sqrt((x-y)**2) + + +def man_d(x, y): + """ + 1D manhattan distance + """ + return np.abs((x-y)) + + +def classif_d(x, y): + """ + Function adapted to classification problems + """ + return np.array(0 if x == y else 1) + + +def rmse(pred, ground_truth): + import numpy as np + return np.sqrt(sum_squares(pred, ground_truth)/len(ground_truth)) + + +def accuracy(pred, ground_truth): + import numpy as np + return np.mean([a == b for a, b in zip(pred, ground_truth)]) + + +def rbf_k(D, sigma=1): + return np.exp(-(D**2)/sigma) diff --git a/gklearn/experiments/thesis/ged/fit_distances/ged.py b/gklearn/experiments/thesis/ged/fit_distances/ged.py new file mode 100644 index 0000000..485a5fa --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/ged.py @@ -0,0 +1,85 @@ +from distances import euclid_d +from gklearn.ged.util import pairwise_ged, get_nb_edit_operations +from gklearn.utils import get_iters + +import sys + + +def compute_ged(Gi, Gj, edit_cost, method='BIPARTITE', **kwargs): + """ + Compute GED between two graph according to edit_cost + """ + ged_options = {'edit_cost': 'CONSTANT', + 'method': method, + 'edit_cost_constants': edit_cost} + node_labels = kwargs.get('node_labels', []) + edge_labels = kwargs.get('edge_labels', []) + dis, pi_forward, pi_backward = pairwise_ged(Gi, Gj, ged_options, repeats=10) + n_eo_tmp = 
get_nb_edit_operations(Gi, Gj, pi_forward, pi_backward, edit_cost='CONSTANT', node_labels=node_labels, edge_labels=edge_labels) + return dis, n_eo_tmp + + +def compute_ged_all_dataset(Gn, edit_cost, ed_method, **kwargs): + N = len(Gn) + G_pairs = [] + for i in range(N): + for j in range(i, N): + G_pairs.append([i, j]) + return compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs) + + +def compute_geds(G_pairs, Gn, edit_cost, ed_method, **kwargs): + """ + Compute GED between all indexes in G_pairs given edit_cost + :return: ged_vec : the list of computed distances, n_edit_operations : the list of edit operations + """ + ged_vec = [] + n_edit_operations = [] + for k in get_iters(range(len(G_pairs)), desc='Computing GED', file=sys.stdout, length=len(G_pairs)): + [i, j] = G_pairs[k] + dis, n_eo_tmp = compute_ged( + Gn[i], Gn[j], edit_cost = edit_cost, method=ed_method, **kwargs) + ged_vec.append(dis) + n_edit_operations.append(n_eo_tmp) + + return ged_vec, n_edit_operations + + +def compute_D(G_app, edit_cost, G_test=None, ed_method='BIPARTITE', **kwargs): + import numpy as np + N = len(G_app) + D_app = np.zeros((N, N)) + + for i, G1 in get_iters(enumerate(G_app), desc='Computing D - app', file=sys.stdout, length=N): + for j, G2 in enumerate(G_app[i+1:], i+1): + D_app[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) + D_app[j, i] = D_app[i, j] + if (G_test is None): + return D_app, edit_cost + else: + D_test = np.zeros((len(G_test), N)) + for i, G1 in get_iters(enumerate(G_test), desc='Computing D - test', file=sys.stdout, length=len(G_test)): + for j, G2 in enumerate(G_app): + D_test[i, j], _ = compute_ged(G1, G2, edit_cost, method=ed_method, **kwargs) + return D_app, D_test, edit_cost + + +def compute_D_random(G_app, G_test=None, ed_method='BIPARTITE', **kwargs): + import numpy as np + edit_costs = np.random.rand(6) + return compute_D(G_app, edit_costs, G_test, ed_method=ed_method, **kwargs) + + +def compute_D_expert(G_app, G_test=None, 
ed_method='BIPARTITE', **kwargs): + edit_cost = [3, 3, 1, 3, 3, 1] + return compute_D(G_app, edit_cost, G_test, ed_method=ed_method, **kwargs) + + +def compute_D_fitted(G_app, y_app, G_test=None, y_distance=euclid_d, + mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs): + from optim_costs import compute_optimal_costs + + costs_optim = compute_optimal_costs( + G_app, y_app, y_distance=y_distance, + mode=mode, unlabeled=unlabeled, ed_method=ed_method, **kwargs) + return compute_D(G_app, costs_optim, G_test, ed_method=ed_method, **kwargs) diff --git a/gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py b/gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py new file mode 100644 index 0000000..6356794 --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/ged_fit_distance_results_plot.py @@ -0,0 +1,391 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Thu Dec 31 10:42:55 2020 + +@author: ljia +""" +import os +import numpy as np +import scipy.stats +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + + +def rounder(x, decimals): + x_strs = str(x).split('.') + if len(x_strs) == 2: + before = x_strs[0] + after = x_strs[1] + if len(after) > decimals: + if int(after[decimals]) >= 5: + after0s = '' + for c in after: + if c == '0': + after0s += '0' + elif c != '0': + break + after = after0s + str(int(after[0:decimals]) + 1)[-decimals:] + else: + after = after[0:decimals] + elif len(after) < decimals: + after += '0' * (decimals - len(after)) + return before + '.' + after + + elif len(x_strs) == 1: + return x_strs[0] + + +def df_to_latex_table(df, replace_header=True, end_mid_line=7): + ltx = df.to_latex(index=True, escape=False, multirow=True) + + # modify middle lines. 
+ end_mid_line = str(end_mid_line) + ltx = ltx.replace('\\cline{1-' + end_mid_line + '}\n\\cline{2-' + end_mid_line + '}', '\\toprule') + ltx = ltx.replace('\\cline{2-' + end_mid_line + '}', '\\cmidrule(l){2-' + end_mid_line + '}') + + # Reset dataset name. + ltx = ltx.replace('Alkane_unlabeled', 'Alkane') + ltx = ltx.replace('Vitamin_D', 'Vitamin\_D') + + # modify header. + if replace_header: + i_start = ltx.find('\\begin{tabular}') + i_end = ltx.find('\\\\\n\\midrule\n') + replace = r"""\begin{tabular}{lll@{~~}c@{~~}c@{~~}c@{~~}c} +\toprule +\multirow{2}[2]{*}{\textbf{Dataset}} & \multirow{2}[2]{*}{\textbf{Distance}} & \multirow{2}[2]{*}{\textbf{Method}} & \multicolumn{2}{c}{\textbf{BIPARTITE}} & \multicolumn{2}{c}{\textbf{IPFP}} \\ +\cmidrule(lr){4-5}\cmidrule(lr){6-7} +& & & \textbf{Train errors} & \textbf{Test errors} & \textbf{Train errors} & \textbf{Test errors} \\ +\midrule +""" + ltx = ltx.replace(ltx[i_start:i_end+12], replace, 1) +# +# # add row numbers. +# ltx = ltx.replace('lllllllll', 'lllllllll|@{\\makebox[2em][r]{\\textit{\\rownumber\\space}}}', 1) +# ltx = replace_nth(ltx, '\\\\\n', '\\gdef\\rownumber{\\stepcounter{magicrownumbers}\\arabic{magicrownumbers}} \\\\\n', 1) + + return ltx + + +def beautify_df(df): +# df = df.sort_values(by=['Datasets', 'Graph Kernels']) +# df = df.set_index(['Datasets', 'Graph Kernels', 'Algorithms']) +# # index = pd.MultiIndex.from_frame(df[['Datasets', 'Graph Kernels', 'Algorithms']]) + + # bold the best results. 
+ for ds in df.index.get_level_values('Dataset').unique(): + for gk in df.loc[ds].index.get_level_values('Distance').unique(): + for label, col in df.loc[(ds, gk)].items(): + min_val = np.inf + min_indices = [] + min_labels = [] + for index, row in col.items(): + value = row + if value != '-': + mean, interval = value.split('$\\pm$') + mean = float(mean.strip('/same')) + if mean < min_val: + min_val = mean + min_indices = [index] + min_labels = [label] + elif mean == min_val: + min_indices.append(index) + min_labels.append(label) + for idx, index in enumerate(min_indices): + df.loc[(ds, gk, index), min_labels[idx]] = '\\textbf{' + df.loc[(ds, gk, index), min_labels[idx]] + '}' + + return df + + +def params_to_latex_table(results): + import pandas as pd + + # Create df table. + row_indices = pd.MultiIndex.from_product([Dataset_list, Edit_Cost_List, Dis_List], names=['Dataset', 'Edit cost', 'Distance']) + df = pd.DataFrame(columns=['$c_{ni}$', '$c_{nr}$', '$c_{ns}$', '$c_{ei}$', '$c_{er}$', '$c_{es}$'], index=row_indices) + + # Set data. + for idx_r, row in df.iterrows(): + for idx, (idx_c, col) in enumerate(row.items()): + key = (idx_r[0], idx_r[2], idx_r[1]) + if key in results and results[key] is not None: +# if results[key][idx] != 0: + df.loc[idx_r, idx_c] = results[key][idx] +# else: +# df.loc[idx_r, idx_c] = '-' + else: + df.loc[idx_r, idx_c] = '-' + +# df = beautify_df(df) + ltx = df_to_latex_table(df, replace_header=False, end_mid_line=9) + return ltx + + +def results_to_latex_table(results): + import pandas as pd + + # Create df table. + col_indices = pd.MultiIndex.from_product([Edit_Cost_List, ['Train errors', 'Test errors']]) + row_indices = pd.MultiIndex.from_product([Dataset_list, Dis_List, ['random', 'expert', 'fitted']], names=['Dataset', 'Distance', 'Method']) + df = pd.DataFrame(columns=col_indices, index=row_indices) + + # Set data. 
+ for idx_r, row in df.iterrows(): + for idx_c, col in row.items(): + key = (idx_r[0], idx_r[1], idx_c[0]) + if key in results and results[key] is not None: + mean = results[key][idx_r[2]]['mean'] + mean = mean[0] if idx_c[1] == 'Train errors' else mean[1] + interval = results[key][idx_r[2]]['interval'] + interval = interval[0] if idx_c[1] == 'Train errors' else interval[1] + df.loc[idx_r, idx_c] = rounder(mean, 2) + '$\pm$' + rounder(interval, 2) + else: + df.loc[idx_r, idx_c] = '-' + + df = beautify_df(df) + ltx = df_to_latex_table(df) + return ltx + + +def get_params(results): + edit_costs = [[] for i in range(6)] + for result in results['results']: + ed = result['fitted']['edit_costs'] + for i, e in enumerate(ed): + edit_costs[i].append(e) + + for i, ed in enumerate(edit_costs): + mean, interval = mean_confidence_interval(ed) + if mean == 0: + edit_costs[i] = '-' + else: + edit_costs[i] = rounder(mean, 2) + '$\pm$' + rounder(interval, 2) + + return edit_costs + + +def print_bars(ax, p, title, y_label='RMSE', export_filename=None): + + palette = plt.get_cmap('Set1') # ['red', 'blue', 'green'] + # width of the bars + barWidth = 0.1 + gap = 0.2 + + # The x position of bars +# nb_xp = len(p.keys()) +# r = np.arange(2) + r = [0, gap + barWidth * 3] +# r = [0 - barWidth, nb_xp * barWidth + gap * 0.5 - barWidth] + + #print(r) + for i, xp in enumerate(p.keys()): + bars = p[xp]['mean'] + y_err = p[xp]['interval'] + # Create blue bars + r_cur = [x + barWidth * (i - 1) * 1.03 for x in r] + plt.bar(r_cur, + bars, width=barWidth, color=palette(i), + edgecolor='black', linewidth=0.2, + yerr=y_err, error_kw=dict(lw=0.5, capsize=3, capthick=0.5), + label=xp) + # general layout + ax.set_xticks(r) + ax.set_xticklabels(['train', 'test'] ) # ['train errors', 'test errors']) + ax.xaxis.set_ticks_position('none') + ax.set_ylabel(y_label) +# ax.legend() + ax.set_title(title) + + if (export_filename is not None): + print(export_filename) + plt.savefig(export_filename) + + +def 
print_table_results(results_by_xp): + from tabulate import tabulate + tab = [] + tab.append(["Method", "App","Test"]) + #setups = ["random","expert","fitted"] + + + for i,setup in enumerate(results_by_xp.keys()): + current_line = [setup] + p = results_by_xp[setup] + current_line.append(f"{p['mean'][0]:.2f} +- {p['interval'][0]:.2f}") + + current_line.append(f"{p['mean'][1]:.2f} +- {p['interval'][1]:.2f}") + + tab.append(current_line) + + print(tabulate(tab, headers="firstrow")) + + +def mean_confidence_interval(data, confidence=0.95): + a = 1.0 * np.array(data) + n = len(a) + m, se = np.mean(a), scipy.stats.sem(a) + h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1) + return m, h + + +def compute_perf(results, app_or_test): + return mean_confidence_interval(results[app_or_test]) + + +def compute_displayable_results(results_by_xp): + p = {} + for xp in results_by_xp.keys(): + p[xp] = {} + p[xp]["mean"] = [0] * 2 + p[xp]["interval"] = [0] * 2 + p[xp]["mean"][0], p[xp]["interval"][0] = compute_perf(results_by_xp[xp], 'app') + p[xp]["mean"][1], p[xp]["interval"][1] = compute_perf(results_by_xp[xp], 'test') + return p + + +def organize_results_by_cost_settings(results, xps): + all_results = results["results"] + + results_by_xp = {} + for xp in xps: + results_xp = { + 'app' :[], + 'test' : [] + } + + for i, split_res in enumerate(all_results): + results_xp['app'].append(split_res[xp]['perf_app']) + results_xp['test'].append(split_res[xp]['perf_test']) + results_by_xp[xp] = results_xp + return results_by_xp + + +def plot_a_task(ax, ds_name, edit_cost, distance, title, y_label): + # Load data. + root_dir = '/media/ljia/DATA/research-repo/codes/Linlin/graphkit-learn/gklearn/experiments/thesis/ged/fit_distances/outputs/' + fn = root_dir + 'results.' 
+ '.'.join([ds_name, edit_cost, distance]) + '.pkl' + if os.path.isfile(fn): + with open(fn, 'rb') as file: + results = pickle.load(file) + else: + return None, None + +# print(results.keys()) +# print(results['y_distance']) +# print(results['dataset']) +# print(results['params']) +# #print(results['mode']) + +# print(len(results['results'])) +# len(results['results'][0]) + +# print(results['results'][0].keys()) + +# ### Schema Xp +# # acyclic_results['results'] est une liste qui contient les resultats de test et train/valid sur 10 split randoms. +# # Pour chaque split, results['results'][i] est un dict qui contient chaque xp avec le split i + +# print(results["results"][0]['random'].keys()) + +# xp = results["results"][4]['fitted'] +# for k in xp.keys(): +# print(f"{k} : {xp[k]}") + +# i=4 +# print(results["results"][i]['random']['perf_test']) +# print(results["results"][i]['expert']['perf_test']) +# print(results["results"][i]['fitted']['perf_test']) +# #print(xp['clf'].cv_results_) + + # Compute data. 
+ xps = ["random", "expert", "fitted"] + results_by_xp = organize_results_by_cost_settings(results, xps) + p = compute_displayable_results(results_by_xp) +# print_bars(p,'KNN with CV and y_distance = {0}'.format(results['y_distance']),export_filename=export_filename) + print_bars(ax, p, title, y_label=y_label, export_filename=None) + c = get_params(results) + return p, c + + +def set_figure(nb_rows): + #plt.rc('font', size=SMALL_SIZE) # controls default text sizes +# plt.rc('axes', titlesize=15) # fontsize of the axes title +# plt.rc('axes', labelsize=15) # fontsize of the x and y labels +# plt.rc('xtick', labelsize=15) # fontsize of the tick labels +# plt.rc('ytick', labelsize=15) # fontsize of the tick labels +# plt.rc('legend', fontsize=15) # legend fontsize +# plt.rc('figure', titlesize=15) # fontsize of the figure title + + #fig, _ = plt.subplots(2, 2, figsize=(13, 12)) + #ax1 = plt.subplot(221) + #ax2 = plt.subplot(222) + #ax3 = plt.subplot(223) + #ax4 = plt.subplot(224) + fig = plt.figure(figsize=(11, 2.12 * nb_rows + 0.56)) + ax = fig.add_subplot(111) # The big subplot for common labels + + # Turn off axis lines and ticks of the big subplot + ax.spines['top'].set_color('none') + ax.spines['bottom'].set_color('none') + ax.spines['left'].set_color('none') + ax.spines['right'].set_color('none') + ax.tick_params(labelcolor='w', top='off', bottom='off', left='off', right='off') + ax.xaxis.set_ticks_position('none') + ax.yaxis.set_ticks_position('none') + # Set common labels + #ax.set_xlabel('accuracy(%)') + ax.yaxis.set_label_coords(-0.105, 0.5) + ax.set_ylabel('RMSE') + ax.yaxis.set_label_coords(-0.07, 0.5) + + return fig + + +if __name__ == '__main__': + from sklearn.model_selection import ParameterGrid + import pickle + + # Get task grid. 
+ Edit_Cost_List = ['BIPARTITE', 'IPFP'] + Dataset_list = ['Alkane_unlabeled', 'Acyclic', 'Chiral', 'Vitamin_D', + 'Steroid'][0:2] + Dis_List = ['euclidean', 'manhattan'] +# row_grid = ParameterGrid({'edit_cost': Edit_Cost_List[0:], +# 'distance': Dis_List[0:]}) + # show by edit costs then by distances. + row_grid_list = [] + for i in Edit_Cost_List[0:]: + for j in Dis_List[0:]: + row_grid_list.append({'edit_cost': i, 'distance': j}) + + # Compute and plot. + fig = set_figure(len(Dataset_list)) + gs = gridspec.GridSpec(len(Dataset_list), len(row_grid_list)) + gs.update(hspace=0.3) + + results = {} + params = {} + for row, ds_name in enumerate(Dataset_list): + for col, contents in enumerate(row_grid_list): + ax = fig.add_subplot(gs[row, col]) + y_label = (ds_name[:-10] if ds_name.endswith('_unlabeled') else ds_name) if col == 0 else '' + title = contents['edit_cost'] + ', ' + contents['distance'] if row == 0 else '' + p, c = plot_a_task(ax, ds_name, contents['edit_cost'], contents['distance'], title, y_label) + results[(ds_name, contents['distance'], contents['edit_cost'])] = p + params[(ds_name, contents['distance'], contents['edit_cost'])] = c + if col == 0 and row == 0: + handles, labels = ax.get_legend_handles_labels() + + # Show graphic + size = fig.get_size_inches() + fig.subplots_adjust(bottom=0.56 / size[1]) + fig.legend(handles, labels, loc='lower center', ncol=3, frameon=False) # , ncol=5, labelspacing=0.1, handletextpad=0.4, columnspacing=0.6) + plt.savefig('ged_fit_distance_results.eps', format='eps', dpi=300, transparent=True, + bbox_inches='tight') + plt.show() + + # Convert results to latex table. 
+ ltable_perf = results_to_latex_table(results) + ltable_params = params_to_latex_table(params) + print(ltable_perf) \ No newline at end of file diff --git a/gklearn/experiments/thesis/ged/fit_distances/learning.py b/gklearn/experiments/thesis/ged/fit_distances/learning.py new file mode 100644 index 0000000..74f4483 --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/learning.py @@ -0,0 +1,108 @@ +from distances import euclid_d + + +def split_data(D, y, train_index, test_index): + D_app = [D[i] for i in train_index] + D_test = [D[i] for i in test_index] + y_app = [y[i] for i in train_index] + y_test = [y[i] for i in test_index] + return D_app, D_test, y_app, y_test + + +def evaluate_D(D_app, y_app, D_test, y_test, mode='reg'): + from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier + from distances import rmse, accuracy + from sklearn.model_selection import GridSearchCV + + if (mode == 'reg'): + knn = KNeighborsRegressor(metric='precomputed') + scoring = 'neg_root_mean_squared_error' + perf_eval = rmse + else: + knn = KNeighborsClassifier(metric='precomputed') + scoring = 'accuracy' + perf_eval = accuracy + grid_params = { + 'n_neighbors': [3, 5, 7, 9, 11] + } + + clf = GridSearchCV(knn, param_grid=grid_params, + scoring=scoring, + cv=5, return_train_score=True, refit=True) + clf.fit(D_app, y_app) + y_pred_app = clf.predict(D_app) + y_pred_test = clf.predict(D_test) + return perf_eval(y_pred_app, y_app), perf_eval(y_pred_test, y_test), clf + + +def xp_knn(Gn, y_all, y_distance=euclid_d, + mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs): + ''' + Perform a knn regressor on given dataset + ''' + from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit + from ged import compute_D_random, compute_D_expert + from ged import compute_D_fitted + + stratified = False + if mode == 'classif': + stratified = True + + if stratified: + rs = StratifiedShuffleSplit(n_splits=10, test_size=.1) + else: + rs = 
ShuffleSplit(n_splits=10, test_size=.1) + + if stratified: + split_scheme = rs.split(Gn, y_all) + else: + split_scheme = rs.split(Gn) + + results = [] + i = 1 + for train_index, test_index in split_scheme: + print() + print("Split {0}/{1}".format(i, 10)) + i = i + 1 + cur_results = {} + # Get splitted data + G_app, G_test, y_app, y_test = split_data(Gn, y_all, + train_index, test_index) + + cur_results['y_app'] = y_app + cur_results['y_test'] = y_test + + # Feed distances will all methods to compare + distances = {} + distances['random'] = compute_D_random(G_app, G_test, ed_method, **kwargs) + distances['expert'] = compute_D_expert(G_app, G_test, ed_method, **kwargs) + distances['fitted'] = compute_D_fitted( + G_app, y_app, G_test, + y_distance=y_distance, + mode=mode, unlabeled=unlabeled, ed_method=ed_method, + **kwargs) + + for setup in distances.keys(): + print("{0} Mode".format(setup)) + setup_results = {} + D_app, D_test, edit_costs = distances[setup] + setup_results['D_app'] = D_app + setup_results['D_test'] = D_test + setup_results['edit_costs'] = edit_costs + print(edit_costs) + perf_app, perf_test, clf = evaluate_D( + D_app, y_app, D_test, y_test, mode) + + setup_results['perf_app'] = perf_app + setup_results['perf_test'] = perf_test + setup_results['clf'] = clf + + print( + "Learning performance with {1} costs : {0:.2f}".format( + perf_app, setup)) + print( + "Test performance with {1} costs : {0:.2f}".format( + perf_test, setup)) + cur_results[setup] = setup_results + results.append(cur_results) + return results diff --git a/gklearn/experiments/thesis/ged/fit_distances/ml.py b/gklearn/experiments/thesis/ged/fit_distances/ml.py new file mode 100644 index 0000000..1ee4608 --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/ml.py @@ -0,0 +1,66 @@ +def loglik(X, y, w): + import numpy as np + return np.sum(-y*(X@w) + np.log(1+np.exp(X@w))) + + +def reg_log(X, y, ite_max=100, lbd=1e-12, pos_contraint=False): + """ + y \in 1,0 + """ + import numpy 
as np + + def proj_on_pos(w): + return np.array([x if x > 0 else 0 for x in w]) + + tol = 1e-4 + N, d = X.shape + y = np.array(y) + + w = np.zeros(d) # see 4.4 of ESLII + weights = [w] + + J = [loglik(X, y, w)] + # print(f"J[0] = {J[0]}") + old_J = J[0] + 1 + conv = False + i = 0 + while(not conv): + i = i + 1 + + Xw = X @ w + + p = np.exp(Xw)/(1+np.exp(Xw)) + W = np.diag(p) + regul = lbd*np.identity(d) + descent = np.linalg.solve(X.T @ W @ X + regul, X.T@(y-p)) + # print(f"descent: {descent}") + step = 1 + update = 0.1 + cur_w = w+step*descent + + if pos_contraint: + cur_w = proj_on_pos(cur_w) + + # print(f"cur_w : {cur_w}") + # print(f"J : {loglik(X,y,cur_w)}") + + while (loglik(X, y, cur_w) > J[-1]): + step = step*update + cur_w = w + step*descent + if pos_contraint: + cur_w = proj_on_pos(cur_w) + # print(f"step : {step}") + + w = cur_w + + J.append(loglik(X, y, w)) + weights.append(w) + + if (i > ite_max): + conv = True + if ((old_J - J[-1]) < tol): + conv = True + else: + old_J = J[-1] + + return w, J, weights diff --git a/gklearn/experiments/thesis/ged/fit_distances/optim_costs.py b/gklearn/experiments/thesis/ged/fit_distances/optim_costs.py new file mode 100644 index 0000000..82c9cde --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/optim_costs.py @@ -0,0 +1,136 @@ +from ged import compute_geds +from distances import sum_squares, euclid_d +import numpy as np +# from tqdm import tqdm + +import sys +# sys.path.insert(0, "../") + + +def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! 
take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: The N distances to fit + """ + import cvxpy as cp + import numpy as np + MAX_SAMPLE = 1000 + nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] for x in nb_cost_mat]) + dis_k_vec = np.array(dis_k_vec) + # dis_k_vec_norm = dis_k_vec/np.max(dis_k_vec) + + # import pickle + # pickle.dump([nb_cost_mat, dis_k_vec], open('debug', 'wb')) + N = nb_cost_mat_m.shape[0] + sub_sample = np.random.permutation(np.arange(N)) + sub_sample = sub_sample[:MAX_SAMPLE] + + x = cp.Variable(nb_cost_mat_m.shape[1]) + cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample]) + prob = cp.Problem(cp.Minimize(cost), [x >= 0]) + prob.solve() + edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0] + edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new] + residual = prob.value + return edit_costs_new, residual + + +def optimize_costs_classif_unlabeled(nb_cost_mat, Y): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in + nb_cost_mat + ! take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit + operations for each pair of graph + :param dis_k_vec: {-1,1}^N vector of common classes + """ + # import cvxpy as cp + from ml import reg_log + # import pickle + # pickle.dump([nb_cost_mat, Y], open('debug', 'wb')) + nb_cost_mat_m = np.array([[x[0], x[1], x[3], x[4]] + for x in nb_cost_mat]) + w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True) + edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0] + residual = J[-1] + + return edit_costs_new, residual + + +def optimize_costs_classif(nb_cost_mat, Y): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! 
take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: {-1,1}^N vector of common classes + """ + #import pickle + # pickle.dump([nb_cost_mat, Y], open("test.pickle", "wb")) + from ml import reg_log + w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True) + return w, J[-1] + + +def optimize_costs(nb_cost_mat, dis_k_vec): + """ + Optimize edit costs to fit dis_k_vec according to edit operations in nb_cost_mat + ! take care that nb_cost_mat do not contains 0 lines + :param nb_cost_mat: \in \mathbb{N}^{N x 6} encoding the number of edit operations for each pair of graph + :param dis_k_vec: The N distances to fit + """ + import cvxpy as cp + x = cp.Variable(nb_cost_mat.shape[1]) + cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec) + constraints = [x >= [0.01 for i in range(nb_cost_mat.shape[1])], + np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]).T@x >= 0.0, + np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]).T@x >= 0.0] + prob = cp.Problem(cp.Minimize(cost), constraints) + prob.solve() + edit_costs_new = x.value + residual = prob.value + + return edit_costs_new, residual + + +def compute_optimal_costs(G, y, init_costs=[3, 3, 1, 3, 3, 1], + y_distance=euclid_d, + mode='reg', unlabeled=False, + ed_method='BIPARTITE', + **kwargs): + N = len(y) + + G_pairs = [] + distances_vec = [] + + for i in range(N): + for j in range(i+1, N): + G_pairs.append([i, j]) + distances_vec.append(y_distance(y[i], y[j])) + ged_vec_init, n_edit_operations = compute_geds(G_pairs, G, init_costs, ed_method, **kwargs) + + residual_list = [sum_squares(ged_vec_init, distances_vec)] + + if (mode == 'reg'): + if unlabeled: + method_optim = optimize_costs_unlabeled + else: + method_optim = optimize_costs + + elif (mode == 'classif'): + if unlabeled: + method_optim = optimize_costs_classif_unlabeled + else: + method_optim = optimize_costs_classif + + ite_max = 5 + for i in range(ite_max): + 
print('ite', i + 1, '/', ite_max, ':') + # compute GEDs and numbers of edit operations. + edit_costs_new, residual = method_optim( + np.array(n_edit_operations), distances_vec) + ged_vec, n_edit_operations = compute_geds(G_pairs, G, edit_costs_new, ed_method, **kwargs) + residual_list.append(sum_squares(ged_vec, distances_vec)) + + return edit_costs_new diff --git a/gklearn/experiments/thesis/ged/fit_distances/run_xps.py b/gklearn/experiments/thesis/ged/fit_distances/run_xps.py new file mode 100644 index 0000000..8bf5d35 --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/run_xps.py @@ -0,0 +1,100 @@ + +import sys + + +def run_xp(ds_name, output_file, unlabeled, mode, y_distance, ed_method): + from gklearn.dataset import Dataset + from gklearn.experiments import DATASET_ROOT + from learning import xp_knn + + ds = Dataset(ds_name, root=DATASET_ROOT, verbose=True) + ds.remove_labels(node_attrs=ds.node_attrs, edge_attrs=ds.edge_attrs) # @todo: ged can not deal with sym and unsym labels. 
+ Gn = ds.graphs + y_all = ds.targets + + resu = {} + resu['y_distance'] = y_distance + resu['dataset'] = ds_name + unlabeled = (len(ds.node_labels) == 0 and len(ds.edge_labels) == 0) + results = xp_knn(Gn, y_all, y_distance=y_distances[y_distance], + mode=mode, + unlabeled=unlabeled, ed_method=ed_method, + node_labels=ds.node_labels, edge_labels=ds.edge_labels) + resu['results'] = results + resu['unlabeled'] = unlabeled + resu['mode'] = mode + resu['ed_method'] = ed_method + pickle.dump(resu, open(output_result, 'wb')) + return output_result + + +def run_from_args(): + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("dataset", help="path to / name of the dataset to predict") + parser.add_argument( + "output_file", help="path to file which will contains the results") + parser.add_argument("-u", "--unlabeled", help="Specify that the dataset is unlabeled graphs", + action="store_true") + parser.add_argument("-m", "--mode", type=str, choices=['reg', 'classif'], + help="Specify if the dataset a classification or regression problem") + parser.add_argument("-y", "--y_distance", type=str, choices=['euclidean', 'manhattan', 'classif'], + default='euclid', + help="Specify the distance on y to fit the costs") + + args = parser.parse_args() + + dataset = args.dataset + output_result = args.output_file + unlabeled = args.unlabeled + mode = args.mode + + print(args) + y_distances = { + 'euclidean': euclid_d, + 'manhattan': man_d, + 'classif': classif_d + } + y_distance = y_distances['euclid'] + + run_xp(dataset, output_result, unlabeled, mode, y_distance) + print("Fini") + + +if __name__ == "__main__": + + import pickle + import os + + from distances import euclid_d, man_d, classif_d + y_distances = { + 'euclidean': euclid_d, + 'manhattan': man_d, + 'classif': classif_d + } + + # Read arguments. + if len(sys.argv) > 1: + run_from_args() + else: + from sklearn.model_selection import ParameterGrid + + # Get task grid. 
+ Edit_Cost_List = ['BIPARTITE', 'IPFP'] + Dataset_list = ['Alkane_unlabeled', 'Acyclic', 'Chiral', 'Vitamin_D', + 'Steroid'] + Dis_List = ['euclidean', 'manhattan'] + task_grid = ParameterGrid({'edit_cost': Edit_Cost_List[0:1], + 'dataset': Dataset_list[1:2], + 'distance': Dis_List[:]}) + + unlabeled = False # @todo: Not actually used. + mode = 'reg' + # Run. + for task in list(task_grid): + print() + print(task) + + output_result = 'outputs/results.' + '.'.join([task['dataset'], task['edit_cost'], task['distance']]) + '.pkl' + if not os.path.isfile(output_result): + run_xp(task['dataset'], output_result, unlabeled, mode, task['distance'], task['edit_cost']) \ No newline at end of file diff --git a/gklearn/experiments/thesis/ged/fit_distances/utils.py b/gklearn/experiments/thesis/ged/fit_distances/utils.py new file mode 100644 index 0000000..8a0086a --- /dev/null +++ b/gklearn/experiments/thesis/ged/fit_distances/utils.py @@ -0,0 +1,15 @@ +import numpy as np + + +def vec2sym_mat(v): + """ + Convert a vector encoding a symmetric matrix into a matrix + See Golub and Van Loan, Matrix Computations, 3rd edition, p21 + """ + n = int((-1+np.sqrt(1+8*len(v)))/2) # second order resolution + M = np.zeros((n, n)) + for i in range(n): + for j in range(i, n): + # Golub van Loan, Matrix Computations, Eq. 1.2.2, p21 + M[i, j] = M[j, i] = v[i*n - (i+1)*(i)//2 + j] + return M