|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- #!/usr/bin/env python3
- # -*- coding: utf-8 -*-
- """
- Created on Fri Jan 10 13:22:04 2020
-
- @author: ljia
- """
- import numpy as np
- #import matplotlib.pyplot as plt
- from tqdm import tqdm
- import random
- #import csv
- from shutil import copyfile
- import os
-
- from gklearn.preimage.iam import iam_bash
- from gklearn.utils.graphfiles import loadDataset, loadGXL
- from gklearn.preimage.ged import GED
- from gklearn.preimage.utils import get_same_item_indices
-
def test_knn():
    """Evaluate 1-NN classification against per-class median graphs.

    For each of ``repeats`` rounds, a random fraction (``percent``) of every
    class of the monoterpenoides dataset is sampled, its set median and
    generalized median are computed with ``iam_bash``, and every graph of the
    dataset is then classified by its nearest median via ``knn``.

    The accuracy of each round and the average accuracy over all rounds are
    printed; the median graphs are copied to ``results/knn/medians/``.
    Nothing is returned.
    """
    ds = {'name': 'monoterpenoides',
          'dataset': '../datasets/monoterpenoides/dataset_10+.ds'}  # node/edge symb
    Gn, y_all = loadDataset(ds['dataset'])
    dir_output = 'results/knn/'
    # dirname() returns a path without a trailing separator, so the relative
    # part has to start with one; otherwise the two pieces are glued together.
    graph_dir = (os.path.dirname(os.path.realpath(__file__))
                 + '/../../datasets/monoterpenoides/')

    # copyfile() does not create missing directories.
    dir_medians = dir_output + 'medians/'
    os.makedirs(dir_medians, exist_ok=True)

    k_nn = 1
    percent = 0.1
    repeats = 50
    edit_cost_constant = [3, 3, 1, 3, 3, 1]

    # get indices by classes.
    y_idx = get_same_item_indices(y_all)
    # the test labels do not depend on the round, so build them once.
    test_y_set = [int(y) for y in y_all]

    # accumulated over ALL rounds so that the final mean is a real average.
    accuracy_sm_list = []
    accuracy_gm_list = []
    for repeat in range(repeats):
        print('\n---------------------------------')
        print('repeat =', repeat)
        sod_sm_list = []
        sod_gm_list = []

        # seed with the round number so every round is reproducible.
        random.seed(repeat)
        set_median_list = []
        gen_median_list = []
        train_y_set = []
        for y, values in y_idx.items():
            print('\ny =', y)
            size_median_set = int(len(values) * percent)
            median_set_idx = random.sample(values, size_median_set)
            print('median set: ', median_set_idx)

            # compute set median and gen median using IAM (C++ through bash).
            group_fnames = [Gn[g].graph['filename'] for g in median_set_idx]
            sod_sm, sod_gm, fname_sm, fname_gm = iam_bash(
                group_fnames, edit_cost_constant, graph_dir=graph_dir)
            print('sod_sm, sod_gm:', sod_sm, sod_gm)
            sod_sm_list.append(sod_sm)
            sod_gm_list.append(sod_gm)

            # keep a copy of both medians, named after class and round.
            suffix = '.y' + str(int(y)) + '.repeat' + str(repeat) + '.gxl'
            fname_sm_new = dir_medians + 'set_median' + suffix
            copyfile(fname_sm, fname_sm_new)
            fname_gm_new = dir_medians + 'gen_median' + suffix
            copyfile(fname_gm, fname_gm_new)
            set_median_list.append(loadGXL(fname_sm_new))
            gen_median_list.append(loadGXL(fname_gm_new))
            train_y_set.append(int(y))

        # per-class SODs of this round (safe even when there are no classes).
        print(sod_sm_list, sod_gm_list)

        # do 1-nn, once against the set medians and once against the
        # generalized medians.
        accuracy_sm = knn(set_median_list, train_y_set, Gn, test_y_set,
                          k=k_nn, distance='ged')
        accuracy_gm = knn(gen_median_list, train_y_set, Gn, test_y_set,
                          k=k_nn, distance='ged')
        accuracy_sm_list.append(accuracy_sm)
        accuracy_gm_list.append(accuracy_gm)
        print('current accuracy sm and gm:', accuracy_sm, accuracy_gm)

    # output
    accuracy_sm_mean = np.mean(accuracy_sm_list)
    accuracy_gm_mean = np.mean(accuracy_gm_list)
    print('\ntotal average accuracy sm and gm:', accuracy_sm_mean, accuracy_gm_mean)
-
-
def knn(train_set, train_y_set, test_set, test_y_set, k=1, distance='ged'):
    """Return the accuracy of nearest-neighbour classification of ``test_set``.

    Every graph in ``test_set`` receives the label of the graph in
    ``train_set`` with the smallest distance to it, as computed by ``GED``;
    the prediction is compared with the matching entry of ``test_y_set``.

    Parameters
    ----------
    train_set : sequence of graphs used as reference points.
    train_y_set : labels of ``train_set``, in the same order.
    test_set : sequence of graphs to classify.
    test_y_set : true labels of ``test_set``, in the same order.
    k : number of neighbours; only ``1`` is implemented.
    distance : distance measure; only ``'ged'`` is implemented.

    Returns
    -------
    float
        Fraction of correctly classified test graphs; ``0.0`` when
        ``test_set`` is empty.

    Raises
    ------
    NotImplementedError
        If ``k`` is not 1 or ``distance`` is not ``'ged'``.
    ValueError
        If ``train_set`` is empty while ``test_set`` is not.
    """
    # local import: only needed to send the progress bar to stdout.
    import sys

    if k != 1 or distance != 'ged':
        raise NotImplementedError(
            "only k=1 with distance='ged' is supported, got k=%r, distance=%r"
            % (k, distance))
    if len(test_set) == 0:
        # nothing to classify; avoid dividing by zero below.
        return 0.0
    if len(train_set) == 0:
        raise ValueError('train_set must not be empty.')

    algo_options = '--threads 8 --initial-solutions 40 --ratio-runs-from-initial-solutions 1'
    params_ged = {'lib': 'gedlibpy', 'cost': 'CONSTANT', 'method': 'IPFP',
                  'algo_options': algo_options, 'stabilizer': None}

    n_correct = 0
    for idx_test, g_test in tqdm(enumerate(test_set), desc='computing 1-nn',
                                 total=len(test_set), file=sys.stdout):
        dis = np.inf
        # stays None if no training graph yields a distance below infinity,
        # in which case the test graph is counted as misclassified.
        test_y_cur = None
        for g_train, y_train in zip(train_set, train_y_set):
            dis_cur, _, _ = GED(g_test, g_train, **params_ged)
            if dis_cur < dis:
                dis = dis_cur
                test_y_cur = y_train
        if test_y_cur == test_y_set[idx_test]:
            n_correct += 1

    return n_correct / len(test_set)
-
-
-
# Run the k-NN experiment only when this file is executed as a script.
if __name__ == '__main__':
    test_knn()
|