# OpenI / graphkit-learn

from distances import euclid_d


def split_data(D, y, train_index, test_index):
	'''
	Partition samples and their targets into training and test subsets.

	D and y are indexed element by element with the positions listed in
	train_index and test_index, so any indexable containers work.

	Returns a 4-tuple of lists, in this order:
	(training samples, test samples, training targets, test targets).
	'''
	def pick(items, positions):
		# Gather the requested positions into a new list, keeping their order.
		return [items[pos] for pos in positions]

	return (
		pick(D, train_index),
		pick(D, test_index),
		pick(y, train_index),
		pick(y, test_index),
	)


def evaluate_D(D_app, y_app, D_test, y_test, mode='reg'):
	'''
	Tune a k-nearest-neighbours model on precomputed distances and score it.

	The number of neighbours is chosen among 3, 5, 7, 9 and 11 by a 5-fold
	grid search on the training data; the search refits the best model,
	which is then used to predict both the training and the test samples.

	D_app, D_test: precomputed distances handed to scikit-learn as-is
		(presumably train x train and test x train matrices, as the
		'precomputed' metric expects -- confirm against the caller).
	y_app, y_test: targets matching D_app and D_test.
	mode: 'reg' selects a regressor scored with `rmse`; any other value
		selects a classifier scored with `accuracy`.

	Returns (training performance, test performance, fitted grid search).
	'''
	from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
	from distances import rmse, accuracy
	from sklearn.model_selection import GridSearchCV

	# Pick the estimator, the grid-search scoring name and the final metric
	# together so the three always stay consistent with each other.
	if mode == 'reg':
		estimator, scoring, perf_eval = (
			KNeighborsRegressor(metric='precomputed'),
			'neg_root_mean_squared_error',
			rmse,
		)
	else:
		estimator, scoring, perf_eval = (
			KNeighborsClassifier(metric='precomputed'),
			'accuracy',
			accuracy,
		)

	search = GridSearchCV(
		estimator,
		param_grid={'n_neighbors': [3, 5, 7, 9, 11]},
		scoring=scoring,
		cv=5,
		return_train_score=True,
		refit=True,
	)
	search.fit(D_app, y_app)

	# Predict on both sets before scoring either of them.
	pred_app, pred_test = [search.predict(dist) for dist in (D_app, D_test)]
	return perf_eval(pred_app, y_app), perf_eval(pred_test, y_test), search


def xp_knn(Gn, y_all, y_distance=euclid_d,
		   mode='reg', unlabeled=False, ed_method='BIPARTITE', **kwargs):
	'''
	Run a k-nearest-neighbours experiment over 10 random train/test splits.

	For every split (10% of the samples held out for testing), distances
	are computed with three edit-cost setups -- 'random', 'expert' and
	'fitted' -- and each setup is evaluated with `evaluate_D`. Progress
	and performances are printed as the experiment runs.

	Gn: indexable collection of samples (presumably graphs -- confirm).
	y_all: targets, indexed like Gn.
	y_distance: forwarded to `compute_D_fitted`.
	mode: 'classif' uses stratified splits; the value is also forwarded to
		`compute_D_fitted` and `evaluate_D` (where 'reg' means regression
		and anything else means classification).
	unlabeled: forwarded to `compute_D_fitted`.
	ed_method: forwarded to the three `compute_D_*` functions.
	**kwargs: forwarded unchanged to the three `compute_D_*` functions.

	Returns a list with one dict per split. Each dict holds 'y_app' and
	'y_test', plus one entry per setup name containing 'D_app', 'D_test',
	'edit_costs', 'perf_app', 'perf_test' and 'clf'.
	'''
	from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit
	from ged import compute_D_random, compute_D_expert
	from ged import compute_D_fitted

	# Single source of truth for the split count: it is used both by the
	# splitter and by the progress message below.
	n_splits = 10

	# Classification keeps class proportions in every split, which requires
	# the labels; regression only needs the samples.
	if mode == 'classif':
		rs = StratifiedShuffleSplit(n_splits=n_splits, test_size=.1)
		split_scheme = rs.split(Gn, y_all)
	else:
		rs = ShuffleSplit(n_splits=n_splits, test_size=.1)
		split_scheme = rs.split(Gn)

	results = []
	for split_no, (train_index, test_index) in enumerate(split_scheme,
														 start=1):
		print()
		print("Split {0}/{1}".format(split_no, n_splits))
		cur_results = {}
		# Get the split data
		G_app, G_test, y_app, y_test = split_data(Gn, y_all,
												  train_index, test_index)

		cur_results['y_app'] = y_app
		cur_results['y_test'] = y_test

		# Compute the distances with all the methods to compare; every
		# setup is computed before any of them is evaluated.
		distance_sets = {}
		distance_sets['random'] = compute_D_random(
			G_app, G_test, ed_method, **kwargs)
		distance_sets['expert'] = compute_D_expert(
			G_app, G_test, ed_method, **kwargs)
		distance_sets['fitted'] = compute_D_fitted(
			G_app, y_app, G_test,
			y_distance=y_distance,
			mode=mode, unlabeled=unlabeled, ed_method=ed_method,
			**kwargs)

		for setup, computed in distance_sets.items():
			print("{0} Mode".format(setup))
			D_app, D_test, edit_costs = computed
			print(edit_costs)
			perf_app, perf_test, clf = evaluate_D(
				D_app, y_app, D_test, y_test, mode)

			print(
				"Learning performance with {1} costs : {0:.2f}".format(
					perf_app, setup))
			print(
				"Test performance with {1} costs : {0:.2f}".format(
					perf_test, setup))
			cur_results[setup] = {
				'D_app': D_app,
				'D_test': D_test,
				'edit_costs': edit_costs,
				'perf_app': perf_app,
				'perf_test': perf_test,
				'clf': clf,
			}
		results.append(cur_results)
	return results