From ffb26889b9874f09fc642f1aa82d3a0f02a30c43 Mon Sep 17 00:00:00 2001
From: jajupmochi <jajupmochi@gmail.com>
Date: Tue, 20 Oct 2020 16:03:00 +0200
Subject: [PATCH] Add exps for the GED stability.

---
 .../stability/edit_costs.nums_sols.ratios.IPFP.py  | 107 +++++++++++++++++++++
 1 file changed, 107 insertions(+)
 create mode 100644 gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py

diff --git a/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py
new file mode 100644
index 0000000..ed7eb2d
--- /dev/null
+++ b/gklearn/experiments/ged/stability/edit_costs.nums_sols.ratios.IPFP.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Wed Oct  20 11:48:02 2020
+
+@author: ljia
+"""	
+# This script tests the influence of the ratios between node costs and edge costs on the stability of the GED computation, where the base edit costs are [1, 1, 1, 1, 1, 1].
+
+import os
+import multiprocessing
+import pickle
+import logging
+from gklearn.utils import Dataset
+from gklearn.ged.util import compute_geds
+
+
+def get_dataset(ds_name):
+	"""Load the predefined dataset `ds_name` and remove the labels this experiment does not use.
+
+	Returns the loaded Dataset. Raises ValueError if no label configuration exists for `ds_name`.
+	"""
+	# The node/edge labels that will not be used in the computation, per dataset.
+	irrelevant_labels_by_dataset = {
+		'MAO': {'node_attrs': ['x', 'y', 'z'], 'edge_labels': ['bond_stereo']},
+		'Monoterpenoides': {'edge_labels': ['valence']},
+		'MUTAG': {'edge_labels': ['label_0']},
+		'AIDS_symb': {'node_attrs': ['chem', 'charge', 'x', 'y'], 'edge_labels': ['valence']},
+	}
+	if ds_name not in irrelevant_labels_by_dataset:  # fail early with a clear message.
+		raise ValueError('Unknown dataset name: ' + repr(ds_name) + '.')
+	dataset = Dataset()
+	dataset.load_predefined_dataset(ds_name)
+	dataset.remove_labels(**irrelevant_labels_by_dataset[ds_name])
+	print('dataset size:', len(dataset.graphs))
+	return dataset
+
+
+def xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial):
+	"""Compute the GED matrix of dataset `ds_name` with IPFP for one setting and pickle the result.
+
+	The first three edit cost constants are multiplied by `ratio`; the last three stay 1. The result is
+	saved as 'ged_matrix<suffix>.pkl' in the output directory; the string 'error' is saved if the computation fails.
+	"""
+	save_dir = 'outputs/edit_costs.num_sols.ratios.IPFP/'
+	os.makedirs(save_dir, exist_ok=True)  # no error if the directory already exists.
+	save_file_suffix = '.' + ds_name + '.num_sols_' + str(num_solutions) + '.ratio_' + "{:.2f}".format(ratio) + '.trial_' + str(trial)
+
+	# 1. Get dataset.
+	dataset = get_dataset(ds_name)
+
+	# 2. Set parameters.
+	# Parameters for GED computation.
+	ged_options = {'method': 'IPFP',  # use IPFP heuristic.
+				   'initialization_method': 'RANDOM',  # or 'NODE', etc.
+				   # when bigger than 1, then the method is considered mIPFP.
+				   'initial_solutions': int(num_solutions * 4),
+				   'edit_cost': 'CONSTANT',  # use CONSTANT cost.
+				   # the distance between non-symbolic node/edge labels is computed by euclidean distance.
+				   'attr_distance': 'euclidean',
+				   'ratio_runs_from_initial_solutions': 0.25,
+				   # number of parallel threads: one per CPU reported by the OS.
+				   'threads': multiprocessing.cpu_count(),
+				   'init_option': 'EAGER_WITHOUT_SHUFFLED_COPIES'
+				   }
+
+	edit_cost_constants = [i * ratio for i in [1, 1, 1]] + [1, 1, 1]
+
+	options = ged_options.copy()
+	options['edit_cost_constants'] = edit_cost_constants
+	options['node_labels'] = dataset.node_labels
+	options['edge_labels'] = dataset.edge_labels
+	options['node_attrs'] = dataset.node_attrs
+	options['edge_attrs'] = dataset.edge_attrs
+	parallel = True
+
+	# 3. Compute GED matrix.
+	ged_mat = 'error'  # placeholder that gets pickled if the computation below fails.
+	try:
+		_, ged_mat, _ = compute_geds(dataset.graphs, options=options, parallel=parallel, verbose=True)
+	except Exception as exp:
+		print('An exception occured when running this experiment:')
+		log_filename = save_dir + 'error.txt'
+		logging.basicConfig(filename=log_filename, level=logging.DEBUG)
+		# Log the failing configuration (the suffix value, not its name) together with the traceback.
+		logging.exception(save_file_suffix)
+		print(repr(exp))
+
+	# 4. Save results.
+	with open(save_dir + 'ged_matrix' + save_file_suffix + '.pkl', 'wb') as ged_file:
+		pickle.dump(ged_mat, ged_file)
+		
+
+if __name__ == '__main__':  # full sweep: 4 datasets x 11 solution counts x 19 ratios x 100 trials.
+	for ds_name in ('MAO', 'Monoterpenoides', 'MUTAG', 'AIDS_symb'):
+		print()
+		print('Dataset:', ds_name)
+		for num_solutions in [1, *range(10, 101, 10)]:
+			print()
+			print('# of solutions:', num_solutions)
+			for ratio in (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10):
+				print()
+				print('Ratio:', ratio)
+				for trial in range(1, 101):  # trials are numbered from 1 to 100.
+					print()
+					print('Trial:', trial)
+					xp_compute_ged_matrix(ds_name, num_solutions, ratio, trial)
\ No newline at end of file