#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Dec 16 11:53:54 2019

@author: ljia
"""
import numpy as np
import math
import networkx as nx
import matplotlib.pyplot as plt
import time
import random
from tqdm import tqdm
from itertools import combinations, islice
import multiprocessing
from multiprocessing import Pool
from functools import partial
#import os
import sys
sys.path.insert(0, "../")
from pygraph.utils.graphfiles import loadDataset, loadGXL
#from pygraph.utils.logger2file import *
from iam import iam_upgraded, iam_bash
from utils import compute_kernel, dis_gstar, kernel_distance_matrix
from fitDistance import fit_GED_to_kernel_distance
#from ged import ged_median


def _default_init_costs(cost):
    """Return the initial edit costs used to start the fitting for `cost`."""
    if cost == 'LETTER':
        return [0.9, 1.7, 0.75]
    if cost == 'LETTER2':
        return [0.675, 0.675, 0.75, 0.425, 0.425]
    return [3, 3, 1, 3, 3, 1]


def _fit_edit_costs(fit_method, cost, edit_costs, Gn, Gn_median, node_label,
                    edge_label, gkernel, dataset):
    """Choose the edit costs according to `fit_method`.

    Raises
    ------
    ValueError
        If `fit_method` is not one of 'random', 'expert', 'k-graphs',
        'whole-dataset', 'precomputed', or if 'precomputed' is requested
        without `edit_costs`.
    """
    if fit_method == 'random':
        if cost == 'LETTER':
            edit_cost_constant = random.sample(range(1, 10), 3)
            edit_cost_constant = [item * 0.1 for item in edit_cost_constant]
        elif cost == 'LETTER2':
            random.seed(time.time())
            edit_cost_constant = random.sample(range(1, 10), 5)
        else:
            edit_cost_constant = random.sample(range(1, 10), 6)
        print('edit costs used:', edit_cost_constant)
        return edit_cost_constant

    if fit_method == 'expert':
        return [3, 3, 1, 3, 3, 1]

    if fit_method in ('k-graphs', 'whole-dataset'):
        itr_max = 6
        init_costs = _default_init_costs(cost)
        algo_options = '--threads 1 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
        params_ged = {'lib': 'gedlibpy', 'cost': cost, 'method': 'IPFP',
                      'algo_options': algo_options, 'stabilizer': None}
        # 'k-graphs' fits on the selected subset only, 'whole-dataset' on
        # every graph of Gn; everything else is shared.
        graphs_to_fit = Gn_median if fit_method == 'k-graphs' else Gn
        edit_cost_constant, _, _, _, _, _, _ = fit_GED_to_kernel_distance(
            graphs_to_fit, node_label, edge_label, gkernel, itr_max,
            params_ged=params_ged, init_costs=init_costs, dataset=dataset,
            parallel=True)
        return edit_cost_constant

    if fit_method == 'precomputed':
        if edit_costs is None:
            raise ValueError("fit_method 'precomputed' requires edit_costs.")
        return edit_costs

    raise ValueError('unknown fit_method: ' + repr(fit_method))


def median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k, fit_method,
                               graph_dir='/media/ljia/DATA/research-repo/codes/Linlin/py-graph/datasets/monoterpenoides/',
                               edit_costs=None, group_min=None,
                               dataset='monoterpenoides', cost='CONSTANT',
                               parallel=True):
    """Compute set/generalized medians of a group of graphs and compare them
    in kernel space.

    Parameters
    ----------
    Gn : list
        Graphs; each must carry ``graph['filename']`` because the medians
        are computed from the graph files by `iam_bash`.
    node_label, edge_label : str
        Label names forwarded to the kernel distance matrix and to the
        edit cost fitting.
    gkernel : str
        Name of the graph kernel.
    k : int
        Size of the group to select; only used when `group_min` is None.
    fit_method : str
        One of 'random', 'expert', 'k-graphs', 'whole-dataset',
        'precomputed'.
    graph_dir : str
        Directory of the graph files, forwarded to `iam_bash`.
    edit_costs : list, optional
        Edit costs used when `fit_method` is 'precomputed'.
    group_min : sequence of int, optional
        Indices (into `Gn`) of the graphs to compute the median of. When
        None, the k graphs with the smallest sum of pairwise kernel
        distances are selected.
    dataset : str
        Dataset name (lower-cased before use).
    cost : str
        Edit cost function name, e.g. 'CONSTANT', 'LETTER', 'LETTER2'.
    parallel : bool
        Whether the selection of the k closest graphs runs in parallel.

    Returns
    -------
    tuple
        ``(sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min,
        idx_min)`` where `sod_sm`/`sod_gm` come from `iam_bash`, the
        `dis_k_*` values come from `dis_gstar`, and `idx_min` is the
        element of `group_min` whose graph has the smallest `dis_k_gi`.

    Raises
    ------
    ValueError
        If no group of graphs can be determined or `fit_method` is invalid.
    """
    dataset = dataset.lower()

    if group_min is None:
        # choose the k closest graphs w.r.t. distances in kernel space.
        # NOTE(review): call shape taken from the previously commented-out
        # code; confirm kernel_distance_matrix still returns 4 values.
        dis_mat, _, _, _ = kernel_distance_matrix(Gn, node_label, edge_label,
                                                  Kmatrix=None, gkernel=gkernel)
        time0 = time.time()
        sod_ks_min, group_min = get_closest_k_graphs(dis_mat, k, parallel)
        time_spent = time.time() - time0
        print('closest graphs:', sod_ks_min, group_min)
        print('time spent:', time_spent)
    if group_min is None or len(group_min) == 0:
        raise ValueError('no group of graphs to compute the median of.')

    Gn_median = [Gn[g].copy() for g in group_min]

    # fit edit costs.
    edit_cost_constant = _fit_edit_costs(fit_method, cost, edit_costs, Gn,
                                         Gn_median, node_label, edge_label,
                                         gkernel, dataset)

    # compute set median and gen median using IAM (C++ through bash).
    group_fnames = [Gn[g].graph['filename'] for g in group_min]
    sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(group_fnames, edit_cost_constant,
                                                  cost=cost, graph_dir=graph_dir,
                                                  dataset=dataset)

    # compute distances in kernel space.
    # Fresh copies, so nothing done to the graphs during fitting leaks in.
    Gn_median = [Gn[g].copy() for g in group_min]
    set_median = loadGXL(fname_sm)
    gen_median = loadGXL(fname_gm)
    if dataset == 'letter':
        for g in Gn_median:
            reform_attributes(g)
        reform_attributes(set_median)
        reform_attributes(gen_median)

    # Labels handed to the kernel: none for 'letter', otherwise the
    # 'chem' / 'valence' names.
    kernel_node_label = None if dataset == 'letter' else 'chem'
    kernel_edge_label = None if dataset == 'letter' else 'valence'
    nb_median = len(Gn_median)
    idx_median = range(1, 1 + nb_median)
    weights = [1 / nb_median] * nb_median

    # compute distance in kernel space for set median.
    Kmatrix_sm = compute_kernel([set_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_sm = dis_gstar(0, idx_median, weights, Kmatrix_sm, withterm3=False)

    # compute distance in kernel space for generalized median.
    Kmatrix_gm = compute_kernel([gen_median] + Gn_median, gkernel,
                                kernel_node_label, kernel_edge_label, False)
    dis_k_gm = dis_gstar(0, idx_median, weights, Kmatrix_gm, withterm3=False)

    # compute distance in kernel space for each graph in median set.
    # Index 0 of Kmatrix_gm is the generalized median, hence the +1 offset.
    dis_k_gi = [dis_gstar(idx + 1, idx_median, weights, Kmatrix_gm,
                          withterm3=False)
                for idx in range(nb_median)]

    print('sod_sm:', sod_sm)
    print('sod_gm:', sod_gm)
    print('dis_k_sm:', dis_k_sm)
    print('dis_k_gm:', dis_k_gm)
    print('dis_k_gi:', dis_k_gi)
    idx_dis_k_gi_min = np.argmin(dis_k_gi)
    dis_k_gi_min = dis_k_gi[idx_dis_k_gi_min]
    print('index min dis_k_gi:', group_min[idx_dis_k_gi_min])
    print('min dis_k_gi:', dis_k_gi_min)

    return sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, group_min[idx_dis_k_gi_min]


def reform_attributes(G):
    """Store each node's 'x' and 'y' values as a two-item 'attributes' list.

    The graph is modified in place; every node must have 'x' and 'y'.
    """
    for node in G.nodes:
        G.nodes[node]['attributes'] = [G.nodes[node]['x'], G.nodes[node]['y']]


def _init_worker_dis_mat(dis_mat_toshare):
    """Pool initializer: expose the distance matrix to a worker as a global.

    Defined at module level so it stays usable when worker processes are
    started with a method that has to pickle the initializer.
    """
    global G_dis_mat
    G_dis_mat = dis_mat_toshare


def _min_sod_in_slice(graph_groups, dis_mat, n_jobs):
    """Return ``(sod, group)`` of the group with the smallest SOD in a slice.

    `graph_groups` is a non-empty list of index tuples; the sums are
    computed by a process pool of `n_jobs` workers.
    """
    len_itr_cur = len(graph_groups)
    sod_k_list = np.empty(len_itr_cur)
    graphs_list = [None] * len_itr_cur
    chunksize = len_itr_cur // n_jobs + 1
    pool = Pool(processes=n_jobs, initializer=_init_worker_dis_mat,
                initargs=(dis_mat,))
    try:
        iterator = pool.imap_unordered(_get_closest_k_graphs_parallel,
                                       zip(graph_groups, range(len_itr_cur)),
                                       chunksize)
        # Results arrive unordered; the position travels with each item.
        for graphs, i, sod_ks in iterator:
            sod_k_list[i] = sod_ks
            graphs_list[i] = graphs
    finally:
        pool.close()
        pool.join()
    arg_min = np.argmin(sod_k_list)
    return sod_k_list[arg_min], graphs_list[arg_min]


def get_closest_k_graphs(dis_mat, k, parallel):
    """Find the k indices whose sum of pairwise distances (SOD) is smallest.

    Every k-combination of ``range(len(dis_mat))`` is examined.

    Parameters
    ----------
    dis_mat : numpy.ndarray
        Square distance matrix, indexed as ``dis_mat[i1, i2]``.
    k : int
        Size of the group.
    parallel : bool
        Evaluate the combinations with a process pool, in slices of at
        most 1e7 combinations.

    Returns
    -------
    tuple
        ``(sod_ks_min, group_min)``; ``(numpy.inf, None)`` when there is
        no k-combination at all (k larger than the number of graphs).
    """
    k_graph_groups = combinations(range(0, len(dis_mat)), k)
    sod_ks_min = np.inf
    group_min = None

    if not parallel:
        for items in tqdm(k_graph_groups, desc='Choosing k closest graphs',
                          file=sys.stdout):
            sod_ks = 0
            for i1, i2 in combinations(items, 2):
                sod_ks += dis_mat[i1, i2]
            if sod_ks < sod_ks_min:
                sod_ks_min = sod_ks
                group_min = items
                print('get closer graphs:', sod_ks_min, group_min)
        return sod_ks_min, group_min

    len_combination = get_combination_length(len(dis_mat), k)
    if len_combination == 0:
        return sod_ks_min, group_min
    len_itr_max = int(len_combination if len_combination < 1e7 else 1e7)
    n_jobs = multiprocessing.cpu_count()
    for graph_groups_cur in split_iterable(k_graph_groups, len_itr_max,
                                           len_combination):
        graph_groups_cur_list = list(graph_groups_cur)
        if not graph_groups_cur_list:
            break
        print('current position:', graph_groups_cur_list[0])
        sod_ks_cur, group_cur = _min_sod_in_slice(graph_groups_cur_list,
                                                  dis_mat, n_jobs)
        if sod_ks_cur < sod_ks_min:
            sod_ks_min = sod_ks_cur
            group_min = group_cur
            print('get closer graphs:', sod_ks_min, group_min)

    return sod_ks_min, group_min


def _get_closest_k_graphs_parallel(itr):
    """Worker: compute the SOD of one group from the global `G_dis_mat`.

    `itr` is ``(group, position)``; returns ``(group, position, sod)``.
    """
    k_graph_pairs = combinations(itr[0], 2)
    sod_ks = 0
    for i1, i2 in k_graph_pairs:
        sod_ks += G_dis_mat[i1, i2]

    return itr[0], itr[1], sod_ks


def split_iterable(iterable, n, len_iter):
    """Yield consecutive pieces of at most `n` items from `iterable`.

    `len_iter` is the total number of items and fixes how many pieces are
    produced. The pieces are lazy views on one shared iterator, so each
    piece has to be consumed before the next one is requested.
    """
    it = iter(iterable)
    for i in range(0, len_iter, n):
        piece = islice(it, n)
        yield piece


def get_combination_length(n, k):
    """Return the number of k-combinations of n items.

    Integer floor division keeps the result exact for arbitrarily large
    values; the product of k consecutive integers is divisible by k!.
    """
    len_combination = 1
    for i in range(n, n - k, -1):
        len_combination *= i
    return len_combination // math.factorial(k)


###############################################################################

def test_k_closest_graphs():
    """Run the median computation on monoterpenoides with expert costs."""
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
#    Gn = Gn[0:50]
#    gkernel = 'untilhpathkernel'
#    gkernel = 'weisfeilerlehmankernel'
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    k = 5
    # Costs for the 'precomputed' fit method, kept for manual experiments.
    edit_costs = [0.16229209837639536, 0.06612870523413916, 0.04030113378793905,
                  0.20723547009415202, 0.3338607220394598, 0.27054392518077297]

    # median_on_k_closest_graphs returns 7 values; the last one (index of
    # the closest graph) is not needed here.
    sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _ \
        = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel, k,
                                     'expert', parallel=False)
    return


def test_k_closest_graphs_with_cv():
    """Repeat the median computation per class and print mean results."""
    gkernel = 'untilhpathkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    k = 4

    y_all = ['3', '1', '4', '6', '7', '8', '9', '2']
    repeats = 50
    collection_path = '/media/ljia/DATA/research-repo/codes/others/gedlib/tests_linlin/generated_datsets/monoterpenoides/'
    graph_dir = collection_path + 'gxl/'

    sod_sm_list = []
    sod_gm_list = []
    dis_k_sm_list = []
    dis_k_gm_list = []
    dis_k_gi_min_list = []
    for y in y_all:
        print('\n-------------------------------------------------------')
        print('class of y:', y)
        sod_sm_list.append([])
        sod_gm_list.append([])
        dis_k_sm_list.append([])
        dis_k_gm_list.append([])
        dis_k_gi_min_list.append([])

        for repeat in range(repeats):
            print('\nrepeat ', repeat)
            collection_file = collection_path + 'monoterpenoides_' + y + '_' + str(repeat) + '.xml'
            Gn, _ = loadDataset(collection_file, extra_params=graph_dir)
            # 7 values are returned; the index of the closest graph is unused.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, dis_k_gi, dis_k_gi_min, _ \
                = median_on_k_closest_graphs(Gn, node_label, edge_label, gkernel,
                                             k, 'whole-dataset',
                                             graph_dir=graph_dir,
                                             parallel=False)

            sod_sm_list[-1].append(sod_sm)
            sod_gm_list[-1].append(sod_gm)
            dis_k_sm_list[-1].append(dis_k_sm)
            dis_k_gm_list[-1].append(dis_k_gm)
            dis_k_gi_min_list[-1].append(dis_k_gi_min)

        print('\nsods of the set median for this class:', sod_sm_list[-1])
        print('\nsods of the gen median for this class:', sod_gm_list[-1])
        print('\ndistances in kernel space of set median for this class:',
              dis_k_sm_list[-1])
        print('\ndistances in kernel space of gen median for this class:',
              dis_k_gm_list[-1])
        print('\ndistances in kernel space of min graph for this class:',
              dis_k_gi_min_list[-1])

        # Replace the per-repeat values of this class by their mean.
        sod_sm_list[-1] = np.mean(sod_sm_list[-1])
        sod_gm_list[-1] = np.mean(sod_gm_list[-1])
        dis_k_sm_list[-1] = np.mean(dis_k_sm_list[-1])
        dis_k_gm_list[-1] = np.mean(dis_k_gm_list[-1])
        dis_k_gi_min_list[-1] = np.mean(dis_k_gi_min_list[-1])

    print()
    print('\nmean sods of the set median for each class:', sod_sm_list)
    print('\nmean sods of the gen median for each class:', sod_gm_list)
    print('\nmean distance in kernel space of set median for each class:',
          dis_k_sm_list)
    print('\nmean distances in kernel space of gen median for each class:',
          dis_k_gm_list)
    print('\nmean distances in kernel space of min graph for each class:',
          dis_k_gi_min_list)

    print('\nmean sods of the set median of all:', np.mean(sod_sm_list))
    print('\nmean sods of the gen median of all:', np.mean(sod_gm_list))
    print('\nmean distances in kernel space of set median of all:',
          np.mean(dis_k_sm_list))
    print('\nmean distances in kernel space of gen median of all:',
          np.mean(dis_k_gm_list))
    print('\nmean distances in kernel space of min graph of all:',
          np.mean(dis_k_gi_min_list))

    return


if __name__ == '__main__':
    test_k_closest_graphs()
#    test_k_closest_graphs_with_cv()