# OpenI / graphkit-learn

import numpy as np

from gklearn.ged.model.distances import sum_squares, euclid_d
from gklearn.ged.model.ged_com import compute_geds


def optimize_costs_unlabeled(nb_cost_mat, dis_k_vec):
	"""
	Fit non-negative edit costs to dis_k_vec by least squares, using only
	columns 0, 1, 3 and 4 of nb_cost_mat.

	The costs at positions 2 and 5 of the result are fixed to 0. At most
	MAX_SAMPLE (1000) randomly chosen rows take part in the fit.
	! take care that nb_cost_mat does not contain 0 lines

	:param nb_cost_mat: N x 6 array of non-negative integers encoding the
		number of edit operations for each pair of graphs
	:param dis_k_vec: The N distances to fit
	:return: tuple (edit_costs_new, residual): the list of 6 edit costs and
		the optimal value of the least-squares objective on the sampled rows
	"""
	import cvxpy as cp
	MAX_SAMPLE = 1000
	# Keep only the four columns whose costs are fitted here.
	nb_cost_mat_m = np.asarray(nb_cost_mat)[:, [0, 1, 3, 4]]
	dis_k_vec = np.asarray(dis_k_vec)

	# Random subset of at most MAX_SAMPLE rows.
	N = nb_cost_mat_m.shape[0]
	sub_sample = np.random.permutation(N)[:MAX_SAMPLE]

	x = cp.Variable(nb_cost_mat_m.shape[1])
	cost = cp.sum_squares((nb_cost_mat_m[sub_sample, :] @ x) - dis_k_vec[sub_sample])
	prob = cp.Problem(cp.Minimize(cost), [x >= 0])
	prob.solve()

	# Re-insert the two costs that are not fitted (positions 2 and 5) as 0.
	edit_costs_new = [x.value[0], x.value[1], 0, x.value[2], x.value[3], 0]
	# Anything that is not strictly positive becomes 0 (presumably to absorb
	# small negative values left by the solver's numerical tolerance).
	edit_costs_new = [xi if xi > 0 else 0 for xi in edit_costs_new]
	residual = prob.value
	return edit_costs_new, residual


def optimize_costs_classif_unlabeled(nb_cost_mat, Y):
	"""
	Fit edit costs to the class vector Y with reg_log, using only columns
	0, 1, 3 and 4 of nb_cost_mat.

	The costs at positions 2 and 5 of the result are fixed to 0.
	! take care that nb_cost_mat does not contain 0 lines

	:param nb_cost_mat: N x 6 array of non-negative integers encoding the
		number of edit operations for each pair of graphs
	:param Y: {-1,1}^N vector of common classes
	:return: tuple (edit_costs_new, residual): the list of 6 edit costs and
		J[-1], the last element of the second value returned by reg_log
		(presumably the final objective value; confirm against ml.reg_log)
	"""
	from ml import reg_log
	# Keep only the four columns whose costs are fitted here.
	nb_cost_mat_m = np.asarray(nb_cost_mat)[:, [0, 1, 3, 4]]
	# 'pos_contraint' (sic) is the keyword name used by the original call.
	w, J, _ = reg_log(nb_cost_mat_m, Y, pos_contraint=True)
	# Re-insert the two costs that are not fitted (positions 2 and 5) as 0.
	edit_costs_new = [w[0], w[1], 0, w[2], w[3], 0]
	residual = J[-1]

	return edit_costs_new, residual


def optimize_costs_classif(nb_cost_mat, Y):
	"""
	Fit edit costs to the class vector Y with reg_log, using every column
	of nb_cost_mat.

	! take care that nb_cost_mat does not contain 0 lines

	:param nb_cost_mat: N x 6 array of non-negative integers encoding the
		number of edit operations for each pair of graphs
	:param Y: {-1,1}^N vector of common classes
	:return: tuple (w, residual): the weights returned by reg_log and J[-1],
		the last element of the second value returned by reg_log
		(presumably the final objective value; confirm against ml.reg_log)
	"""
	from ml import reg_log
	# 'pos_contraint' (sic) is the keyword name used by the original call.
	w, J, _ = reg_log(nb_cost_mat, Y, pos_contraint=True)
	return w, J[-1]


def optimize_costs(nb_cost_mat, dis_k_vec):
	"""
	Fit the 6 edit costs to dis_k_vec by constrained least squares.

	The fitted costs x satisfy x[k] >= 0.01 for every k,
	x[0] + x[1] - x[2] >= 0 and x[3] + x[4] - x[5] >= 0.
	! take care that nb_cost_mat does not contain 0 lines

	:param nb_cost_mat: N x 6 array of non-negative integers encoding the
		number of edit operations for each pair of graphs
	:param dis_k_vec: The N distances to fit
	:return: tuple (edit_costs_new, residual): the solver's value for the
		cost vector and the optimal value of the least-squares objective
	"""
	import cvxpy as cp
	# Accept plain sequences as well as arrays.
	nb_cost_mat = np.asarray(nb_cost_mat)
	dis_k_vec = np.asarray(dis_k_vec)

	x = cp.Variable(nb_cost_mat.shape[1])
	cost = cp.sum_squares((nb_cost_mat @ x) - dis_k_vec)
	# The two length-6 coefficient vectors below require exactly 6 columns.
	constraints = [
		# every cost is at least 0.01 (the scalar is broadcast over x)
		x >= 0.01,
		# x[0] + x[1] - x[2] >= 0
		np.array([1.0, 1.0, -1.0, 0.0, 0.0, 0.0]) @ x >= 0.0,
		# x[3] + x[4] - x[5] >= 0
		np.array([0.0, 0.0, 0.0, 1.0, 1.0, -1.0]) @ x >= 0.0,
	]
	prob = cp.Problem(cp.Minimize(cost), constraints)
	prob.solve()
	edit_costs_new = x.value
	residual = prob.value

	return edit_costs_new, residual


def compute_optimal_costs(
	G,
	y,
	init_costs=None,
	y_distance=euclid_d,
	mode='reg',
	unlabeled=False,
	ed_method='BIPARTITE',
	verbose=True,
	**kwargs
):
	"""
	Alternate between computing GEDs and re-fitting the edit costs for a
	fixed number of rounds (5) and return the last fitted costs.

	:param G: graphs, forwarded to compute_geds together with the index pairs
	:param y: targets; y_distance is evaluated on every pair y[i], y[j]
		with i < j
	:param init_costs: edit costs used for the initial GED computation;
		defaults to [3, 3, 1, 3, 3, 1]
	:param y_distance: callable returning the distance between two targets
	:param mode: 'reg' or 'classif'; selects the cost optimizer
	:param unlabeled: if true, use the *_unlabeled variant of the optimizer
	:param ed_method: forwarded to compute_geds
	:param verbose: print the iteration counter; also forwarded to compute_geds
	:param kwargs: forwarded to compute_geds
	:return: the edit costs produced by the last optimization round
	:raises ValueError: if mode is neither 'reg' nor 'classif'
	"""
	# Build the default here so that no list is shared between calls.
	if init_costs is None:
		init_costs = [3, 3, 1, 3, 3, 1]

	# Choose the optimizer first so that an invalid mode fails before any
	# GED computation is started.
	optimizers = {
		('reg', True): optimize_costs_unlabeled,
		('reg', False): optimize_costs,
		('classif', True): optimize_costs_classif_unlabeled,
		('classif', False): optimize_costs_classif,
	}
	try:
		method_optim = optimizers[(mode, bool(unlabeled))]
	except (KeyError, TypeError):
		raise ValueError(
			"mode must be 'reg' or 'classif', got %r" % (mode,)) from None

	# All index pairs (i, j) with i < j and the target distance of each pair.
	N = len(y)
	G_pairs = [[i, j] for i in range(N) for j in range(i + 1, N)]
	distances_vec = [y_distance(y[i], y[j]) for i, j in G_pairs]

	# GEDs and numbers of edit operations under the initial costs.
	ged_vec_init, n_edit_operations = compute_geds(
		G_pairs, G, init_costs, ed_method, verbose=verbose, **kwargs)

	# NOTE(review): residual_list is filled but never returned or printed.
	residual_list = [sum_squares(ged_vec_init, distances_vec)]

	ite_max = 5
	for ite in range(ite_max):
		if verbose:
			print('ite', ite + 1, '/', ite_max, ':')
		# Re-fit the costs on the current numbers of edit operations.
		edit_costs_new, _ = method_optim(
			np.array(n_edit_operations), distances_vec)
		# Recompute GEDs and numbers of edit operations under the new costs.
		ged_vec, n_edit_operations = compute_geds(
			G_pairs, G, edit_costs_new, ed_method, verbose=verbose, **kwargs)
		residual_list.append(sum_squares(ged_vec, distances_vec))

	return edit_costs_new


def get_optimal_costs_GH2020(**kwargs):
	"""
	Load edit costs from 'cj/output/costs.<ds_name>.<nb_trial>.pkl'.

	The path is relative to the current working directory.

	:param kwargs: must contain 'ds_name' (a string); 'nb_trial' is converted
		with str() and inserted in the file name. Other keys are ignored.
	:return: the object unpickled from the file
	:raises TypeError: if 'ds_name' is missing or None
	"""
	import pickle
	import os
	dir_root = 'cj/output/'
	ds_name = kwargs.get('ds_name')
	if ds_name is None:
		# Fail with a message naming the argument instead of an opaque
		# string-concatenation TypeError.
		raise TypeError(
			"get_optimal_costs_GH2020() requires a 'ds_name' keyword argument")
	nb_trial = kwargs.get('nb_trial')
	file_name = os.path.join(dir_root, 'costs.' + ds_name + '.' + str(nb_trial) + '.pkl')
	# NOTE(security): pickle.load can execute arbitrary code; only point this
	# at cost files produced by a trusted run.
	with open(file_name, 'rb') as f:
		return pickle.load(f)