|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Thu Jan 9 11:54:32 2020
-
- @author: ljia
- """
- import numpy as np
- import random
- import csv
-
- from gklearn.utils.graphfiles import loadDataset
- from gklearn.preimage.test_k_closest_graphs import median_on_k_closest_graphs
-
def _append_csv_row(path, row):
    """Append one row to the CSV file at ``path``, creating it if needed."""
    # newline='' is what the csv module asks for: the writer emits its own
    # line terminators and the text layer must not translate them again.
    with open(path, 'a', newline='') as f_out:
        csv.writer(f_out).writerow(row)


def _tally_relation(reference, candidate, counts, better_repeats, repeat):
    """Record how ``candidate`` compares with ``reference`` for one repeat.

    ``counts`` is a three-slot list updated in place:
    ``counts[0]`` when ``candidate`` is strictly smaller than ``reference``,
    ``counts[1]`` when they are equal, ``counts[2]`` when it is strictly
    larger. ``repeat`` is appended to ``better_repeats`` in the first case
    only. If none of the three comparisons holds (e.g. a NaN operand),
    nothing is recorded.
    """
    if reference > candidate:
        counts[0] += 1
        better_repeats.append(repeat)
    elif reference == candidate:
        counts[1] += 1
    elif reference < candidate:
        counts[2] += 1


def find_best_k():
    """Run the median experiment for every ``k`` in 2..10 and log the results.

    For each ``k`` the experiment is repeated 50 times. Each repeat seeds
    ``random`` with a pre-drawn seed, samples ``k`` graph indices from the
    dataset and passes them to ``median_on_k_closest_graphs`` as
    ``group_min``. One row per repeat is appended to
    ``results_detail.<fit_method>.csv`` and one row per ``k`` (means over the
    repeats, better/same/worse counts, and the repeats that were better) is
    appended to ``results_summary.csv``, both under
    ``results/test_find_best_k/``.

    NOTE(review): "SM" / "GM" in the column names presumably stand for set
    median / generalized median and "SOD" for sum of distances -- confirm
    against ``median_on_k_closest_graphs``.

    Returns:
        None. All results are written to the CSV files and echoed on stdout.
    """
    # Function-scope import keeps this block self-contained; the module-level
    # import list does not include ``os``.
    import os

    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, _ = loadDataset(ds['dataset'])  # targets are not needed here
    gkernel = 'treeletkernel'
    node_label = 'atom'
    edge_label = 'bond_type'
    ds_name = 'mono'
    dir_output = 'results/test_find_best_k/'

    repeats = 50
    k_list = range(2, 11)
    fit_method = 'k-graphs'
    # fitted on the whole dataset - treelet - mono
    edit_costs = [0.1268873773592978, 0.004084633224249829,
                  0.0897581955378986, 0.15328856114451297,
                  0.3109956881625734, 0.0]

    # Create result files. The directory is created on demand so the first
    # run does not fail with FileNotFoundError.
    os.makedirs(dir_output, exist_ok=True)
    path_detail = os.path.join(dir_output,
                               'results_detail.' + fit_method + '.csv')
    path_summary = os.path.join(dir_output, 'results_summary.csv')
    # Files are opened in append mode, so a header row is added on every run
    # (kept from the original behaviour; it separates successive runs).
    _append_csv_row(path_detail, [
        'dataset', 'graph kernel', 'fit method', 'k',
        'repeat', 'median set', 'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
        'dis_k gi -> GM'])
    _append_csv_row(path_summary, [
        'dataset', 'graph kernel', 'fit method', 'k',
        'SOD SM', 'SOD GM', 'dis_k SM', 'dis_k GM',
        'min dis_k gi', 'SOD SM -> GM', 'dis_k SM -> GM', 'dis_k gi -> SM',
        'dis_k gi -> GM', '# SOD SM -> GM', '# dis_k SM -> GM',
        '# dis_k gi -> SM', '# dis_k gi -> GM', 'repeats better SOD SM -> GM',
        'repeats better dis_k SM -> GM', 'repeats better dis_k gi -> SM',
        'repeats better dis_k gi -> GM'])

    # Draw one seed per repeat up front so that repeat i uses the same seed
    # (and therefore the same sampling stream) for every k.
    random.seed(1)
    rdn_seed_list = random.sample(range(0, repeats * 100), repeats)

    for k in k_list:
        print('\n--------- k =', k, '----------')

        sod_sm_list = []
        sod_gm_list = []
        dis_k_sm_list = []
        dis_k_gm_list = []
        dis_k_gi_min_list = []
        # Each counter is [better, same, worse].
        nb_sod_sm2gm = [0, 0, 0]
        nb_dis_k_sm2gm = [0, 0, 0]
        nb_dis_k_gi2sm = [0, 0, 0]
        nb_dis_k_gi2gm = [0, 0, 0]
        repeats_better_sod_sm2gm = []
        repeats_better_dis_k_sm2gm = []
        repeats_better_dis_k_gi2sm = []
        repeats_better_dis_k_gi2gm = []

        for repeat in range(repeats):
            print('\nrepeat =', repeat)
            random.seed(rdn_seed_list[repeat])
            median_set_idx = random.sample(range(0, len(Gn)), k)
            print('median set: ', median_set_idx)

            # The fifth return value (per-graph dis_k) is not used here.
            sod_sm, sod_gm, dis_k_sm, dis_k_gm, _, dis_k_gi_min \
                = median_on_k_closest_graphs(
                    Gn, node_label, edge_label, gkernel, k,
                    fit_method=fit_method,
                    edit_costs=edit_costs,
                    group_min=median_set_idx,
                    parallel=False)

            # write result detail.
            sod_sm2gm = getRelations(np.sign(sod_gm - sod_sm))
            dis_k_sm2gm = getRelations(np.sign(dis_k_gm - dis_k_sm))
            dis_k_gi2sm = getRelations(np.sign(dis_k_sm - dis_k_gi_min))
            dis_k_gi2gm = getRelations(np.sign(dis_k_gm - dis_k_gi_min))
            _append_csv_row(path_detail, [
                ds_name, gkernel, fit_method, k, repeat,
                median_set_idx, sod_sm, sod_gm, dis_k_sm, dis_k_gm,
                dis_k_gi_min, sod_sm2gm, dis_k_sm2gm, dis_k_gi2sm,
                dis_k_gi2gm])

            # compute result summary.
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)
            dis_k_sm_list.append(dis_k_sm)
            dis_k_gm_list.append(dis_k_gm)
            dis_k_gi_min_list.append(dis_k_gi_min)
            # # SOD SM -> GM
            _tally_relation(sod_sm, sod_gm, nb_sod_sm2gm,
                            repeats_better_sod_sm2gm, repeat)
            # # dis_k SM -> GM
            _tally_relation(dis_k_sm, dis_k_gm, nb_dis_k_sm2gm,
                            repeats_better_dis_k_sm2gm, repeat)
            # # dis_k gi -> SM
            _tally_relation(dis_k_gi_min, dis_k_sm, nb_dis_k_gi2sm,
                            repeats_better_dis_k_gi2sm, repeat)
            # # dis_k gi -> GM
            _tally_relation(dis_k_gi_min, dis_k_gm, nb_dis_k_gi2gm,
                            repeats_better_dis_k_gi2gm, repeat)

        # write result summary (one row per k, averaged over the repeats).
        sod_sm_mean = np.mean(sod_sm_list)
        sod_gm_mean = np.mean(sod_gm_list)
        dis_k_sm_mean = np.mean(dis_k_sm_list)
        dis_k_gm_mean = np.mean(dis_k_gm_list)
        dis_k_gi_min_mean = np.mean(dis_k_gi_min_list)
        sod_sm2gm_mean = getRelations(np.sign(sod_gm_mean - sod_sm_mean))
        dis_k_sm2gm_mean = getRelations(
            np.sign(dis_k_gm_mean - dis_k_sm_mean))
        dis_k_gi2sm_mean = getRelations(
            np.sign(dis_k_sm_mean - dis_k_gi_min_mean))
        dis_k_gi2gm_mean = getRelations(
            np.sign(dis_k_gm_mean - dis_k_gi_min_mean))
        _append_csv_row(path_summary, [
            ds_name, gkernel, fit_method, k,
            sod_sm_mean, sod_gm_mean, dis_k_sm_mean, dis_k_gm_mean,
            dis_k_gi_min_mean, sod_sm2gm_mean, dis_k_sm2gm_mean,
            dis_k_gi2sm_mean, dis_k_gi2gm_mean, nb_sod_sm2gm,
            nb_dis_k_sm2gm, nb_dis_k_gi2sm, nb_dis_k_gi2gm,
            repeats_better_sod_sm2gm, repeats_better_dis_k_sm2gm,
            repeats_better_dis_k_gi2sm, repeats_better_dis_k_gi2gm])

    print('\ncomplete.')
-
-
def getRelations(sign):
    """Map a sign value to a relation label.

    Returns 'better' for -1, 'same' for 0 and 'worse' for 1. Any value that
    compares equal to none of these yields None.
    """
    labelled_signs = ((-1, 'better'), (0, 'same'), (1, 'worse'))
    for expected, label in labelled_signs:
        if sign == expected:
            return label
    return None
-
-
# Script entry point: launch the experiment only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    find_best_k()
|