From 5eb90655a4986b301ccb2a4988ec4d3ddf651bf3 Mon Sep 17 00:00:00 2001
From: jajupmochi <jajupmochi@gmail.com>
Date: Fri, 24 Jun 2022 17:45:16 +0200
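Subject: [PATCH] [Features] Add model selection methods with validation set:
 `BaseCrossValidatorWithValid`, `KFoldWithValid`, `RepeatedKFoldWithValid`.
 Required version of scikit-learn is upgraded to 1.1.0, to support the
 `input_name` argument of `check_array` used in `KFoldWithValid.split`.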

---
 gklearn/model_selection/__init__.py |  25 ++++
 gklearn/model_selection/_split.py   | 285 ++++++++++++++++++++++++++++++++++++
 requirements.txt                    |   2 +-
 requirements_pypi.txt               |   4 +-
 4 files changed, 313 insertions(+), 3 deletions(-)
 create mode 100644 gklearn/model_selection/__init__.py
 create mode 100644 gklearn/model_selection/_split.py

diff --git a/gklearn/model_selection/__init__.py b/gklearn/model_selection/__init__.py
new file mode 100644
index 0000000..661478b
--- /dev/null
+++ b/gklearn/model_selection/__init__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 24 14:25:57 2022
+
+@author: ljia
+"""
+
+from ._split import BaseCrossValidatorWithValid
+# from ._split import BaseShuffleSplit
+from ._split import KFoldWithValid
+# from ._split import GroupKFold
+# from ._split import StratifiedKFoldWithValid
+# from ._split import TimeSeriesSplit
+# from ._split import LeaveOneGroupOut
+# from ._split import LeaveOneOut
+# from ._split import LeavePGroupsOut
+# from ._split import LeavePOut
+from ._split import RepeatedKFoldWithValid
+# from ._split import RepeatedStratifiedKFold
+# from ._split import ShuffleSplit
+# from ._split import GroupShuffleSplit
+# from ._split import StratifiedShuffleSplit
+# from ._split import StratifiedGroupKFold
+# from ._split import PredefinedSplit
\ No newline at end of file
diff --git a/gklearn/model_selection/_split.py b/gklearn/model_selection/_split.py
new file mode 100644
index 0000000..a982fec
--- /dev/null
+++ b/gklearn/model_selection/_split.py
@@ -0,0 +1,285 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Fri Jun 24 11:13:26 2022
+
+@author: ljia
+"""
+from abc import abstractmethod
+import numbers
+import warnings
+import numpy as np
+from sklearn.utils import check_random_state, check_array, column_or_1d, indexable
+from sklearn.utils.validation import _num_samples
+from sklearn.utils.multiclass import type_of_target
+
+
+class BaseCrossValidatorWithValid(object):
+	"""Base class for all cross-validators.
+	Implementations must define `_iter_valid_test_masks` or `_iter_valid_test_indices`.
+	"""
+
+	def split(self, X, y=None, groups=None):
+		"""Generate indices to split data into training, valid, and test set.
+
+		Parameters
+		----------
+		X : array-like of shape (n_samples, n_features)
+			Training data, where `n_samples` is the number of samples
+			and `n_features` is the number of features.
+
+		y : array-like of shape (n_samples,)
+			The target variable for supervised learning problems.
+
+		groups : array-like of shape (n_samples,), default=None
+			Group labels for the samples used while splitting the dataset into
+			train/test set.
+
+		Yields
+		------
+		train : ndarray
+			The training set indices for that split.
+
+		valid : ndarray
+			The valid set indices for that split.
+
+		test : ndarray
+			The testing set indices for that split.
+		"""
+		X, y, groups = indexable(X, y, groups)
+		indices = np.arange(_num_samples(X))
+		for valid_index, test_index in self._iter_valid_test_masks(X, y, groups):
+			train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))]
+			valid_index = indices[valid_index]
+			test_index = indices[test_index]
+			yield train_index, valid_index, test_index
+
+
+	# Since subclasses must implement either _iter_valid_test_masks or
+	# _iter_valid_test_indices, neither can be abstract.
+	def _iter_valid_test_masks(self, X=None, y=None, groups=None):
+		"""Generates boolean masks corresponding to valid and test sets.
+		By default, delegates to _iter_valid_test_indices(X, y, groups)
+		"""
+		for valid_index, test_index in self._iter_valid_test_indices(X, y, groups):
+			valid_mask = np.zeros(_num_samples(X), dtype=bool)
+			test_mask = np.zeros(_num_samples(X), dtype=bool)
+			valid_mask[valid_index] = True
+			test_mask[test_index] = True
+			yield valid_mask, test_mask
+
+
+	def _iter_valid_test_indices(self, X=None, y=None, groups=None):
+		"""Generates integer indices corresponding to valid and test sets."""
+		raise NotImplementedError
+
+
+	@abstractmethod
+	def get_n_splits(self, X=None, y=None, groups=None):
+		"""Returns the number of splitting iterations in the cross-validator"""
+
+
+	def __repr__(self):
+		# Built from the instance attributes; no `_build_repr` helper exists in this module.
+		return '%s(%s)' % (type(self).__name__, ', '.join('%s=%r' % kv for kv in sorted(vars(self).items())))
+
+
+class _BaseKFoldWithValid(BaseCrossValidatorWithValid):
+	"""Base class for k-fold splitters that yield train/valid/test indices."""
+
+	@abstractmethod
+	def __init__(self, n_splits, *, stratify, shuffle, random_state):
+		if not isinstance(n_splits, numbers.Integral):
+			raise ValueError(
+				'The number of folds must be of Integral type. '
+				'%s of type %s was passed.' % (n_splits, type(n_splits))
+			)
+		n_splits = int(n_splits)
+		if n_splits <= 2:
+			raise ValueError(
+				'k-fold cross-validation requires at least one'
+				' train/valid/test split by setting n_splits=3 or more,'
+				' got n_splits={0}.'.format(n_splits)
+			)
+
+		if not isinstance(shuffle, bool):
+			raise TypeError('shuffle must be True or False; got {0}'.format(shuffle))
+
+		if not shuffle and random_state is not None:  # None is the default
+			raise ValueError(
+				'Setting a random_state has no effect since shuffle is '
+				'False. You should leave '
+				'random_state to its default (None), or set shuffle=True.',
+			)
+
+		self.n_splits = n_splits
+		self.stratify = stratify
+		self.shuffle = shuffle
+		self.random_state = random_state
+
+
+	def get_n_splits(self, X=None, y=None, groups=None):
+		"""Return the number of splits, i.e. `n_splits`; the arguments are ignored."""
+		return self.n_splits
+
+
+	def split(self, X, y=None, groups=None):
+		"""Generate indices to split data into training, valid and test set."""
+		X, y, groups = indexable(X, y, groups)
+		n_samples = _num_samples(X)
+		if self.n_splits > n_samples:
+			raise ValueError(
+				'Cannot have number of splits n_splits={0} greater'
+				' than the number of samples: n_samples={1}.'.format(self.n_splits, n_samples)
+			)
+		yield from super().split(X, y, groups)
+
+
+class KFoldWithValid(_BaseKFoldWithValid):
+	"""K-fold cross-validator yielding train/valid/test indices.
+
+	Split ``i`` uses fold ``i`` as valid set and fold ``(i + 1) % n_splits`` as
+	test set; folds are stratified on `y` only when ``stratify=True``.
+	"""
+
+	def __init__(self, n_splits=5, *, stratify=False, shuffle=False, random_state=None):
+		super().__init__(
+			n_splits=n_splits, stratify=stratify, shuffle=shuffle, random_state=random_state
+		)
+
+
+	def _make_valid_test_folds(self, X, y=None):
+		"""Return an array giving the fold number (0..n_splits-1) of each sample."""
+		rng = check_random_state(self.random_state)
+		if not self.stratify:
+			# Plain k-fold: contiguous blocks whose sizes differ by at most one,
+			# permuted when shuffle=True. `y` is not used on this path.
+			folds = np.sort(np.arange(_num_samples(X)) % self.n_splits)
+			if self.shuffle:
+				rng.shuffle(folds)
+			return folds
+
+		y = np.asarray(y)
+		type_of_target_y = type_of_target(y)
+		allowed_target_types = ('binary', 'multiclass')
+		if type_of_target_y not in allowed_target_types:
+			raise ValueError(
+				'Supported target types are: {}. Got {!r} instead.'.format(
+					allowed_target_types, type_of_target_y
+				)
+			)
+
+		y = column_or_1d(y)
+
+		_, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
+		# y_inv encodes y according to lexicographic order. We invert y_idx to
+		# map the classes so that they are encoded by order of appearance:
+		# 0 represents the first label appearing in y, 1 the second, etc.
+		_, class_perm = np.unique(y_idx, return_inverse=True)
+		y_encoded = class_perm[y_inv]
+
+		n_classes = len(y_idx)
+		y_counts = np.bincount(y_encoded)
+		min_groups = np.min(y_counts)
+		if np.all(self.n_splits > y_counts):
+			raise ValueError(
+				"n_splits=%d cannot be greater than the"
+				" number of members in each class." % (self.n_splits)
+			)
+		if self.n_splits > min_groups:
+			warnings.warn(
+				"The least populated class in y has only %d"
+				" members, which is less than n_splits=%d."
+				% (min_groups, self.n_splits),
+				UserWarning,
+			)
+
+		# Determine the optimal number of samples from each class in each fold,
+		# using round robin over the sorted y. (This can be done direct from
+		# counts, but that code is unreadable.)
+		y_order = np.sort(y_encoded)
+		allocation = np.asarray(
+			[
+				np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
+				for i in range(self.n_splits)
+			]
+		)
+
+		# To maintain the data order dependencies as best as possible within
+		# the stratification constraint, we assign samples from each class in
+		# blocks (and then mess that up when shuffle=True).
+		test_folds = np.empty(len(y), dtype='i')
+		for k in range(n_classes):
+			# since the kth column of allocation stores the number of samples
+			# of class k in each test set, this generates blocks of fold
+			# indices corresponding to the allocation for class k.
+			folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
+			if self.shuffle:
+				rng.shuffle(folds_for_class)
+			test_folds[y_encoded == k] = folds_for_class
+		return test_folds
+
+
+	def _iter_valid_test_masks(self, X, y=None, groups=None):
+		test_folds = self._make_valid_test_folds(X, y)
+		for i in range(self.n_splits):
+			yield test_folds == i, test_folds == (i + 1) % self.n_splits
+
+
+	def split(self, X, y=None, groups=None):
+		if self.stratify:
+			y = check_array(y, input_name='y', ensure_2d=False, dtype=None)
+		return super().split(X, y, groups)
+
+
+class _RepeatedSplitsWithValid(object):
+	"""Repeat a train/valid/test splitter `n_repeats` times.
+
+	Each repeat builds ``cv(random_state=rng, shuffle=True, **cvargs)`` and
+	yields all of its splits; the same `rng` object is passed to every repeat,
+	so `cv` must accept `random_state` and `shuffle` keyword arguments.
+	"""
+
+	def __init__(self, cv, *, n_repeats=10, random_state=None, **cvargs):
+		# Accept any integral type (e.g. NumPy integers), as done for `n_splits`
+		# in `_BaseKFoldWithValid`.
+		if not isinstance(n_repeats, numbers.Integral):
+			raise ValueError('Number of repetitions must be of integer type.')
+
+		if n_repeats <= 0:
+			raise ValueError('Number of repetitions must be greater than 0.')
+
+		self.cv = cv
+		self.n_repeats = n_repeats
+		self.random_state = random_state
+		self.cvargs = cvargs
+
+
+	def split(self, X, y=None, groups=None):
+		"""Yield (train, valid, test) index arrays for every split of every repeat."""
+		# Resolve `random_state` once; the resulting object is reused below.
+		rng = check_random_state(self.random_state)
+		for _ in range(self.n_repeats):
+			# A fresh splitter per repeat, always built with shuffle=True.
+			cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)
+			yield from cv.split(X, y, groups)
+
+
+class RepeatedKFoldWithValid(_RepeatedSplitsWithValid):
+	"""Repeated `KFoldWithValid`: the k-fold split is redone `n_repeats` times.
+
+	`n_splits` and `stratify` are forwarded to every `KFoldWithValid` instance;
+	`n_repeats` and `random_state` are handled by `_RepeatedSplitsWithValid`.
+	"""
+
+	def __init__(
+			self,
+			*,
+			n_splits=5,
+			n_repeats=10,
+			stratify=False,
+			random_state=None
+			):
+		fold_options = dict(stratify=stratify, n_splits=n_splits)
+		repeat_options = dict(n_repeats=n_repeats, random_state=random_state)
+		super().__init__(KFoldWithValid, **repeat_options, **fold_options)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 4b25bb3..da822f7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,7 @@ numpy>=1.16.2
 scipy>=1.1.0
 matplotlib>=3.1.0
 networkx>=2.2
-scikit-learn>=0.20.0
+scikit-learn>=1.1.0
 tabulate>=0.8.2
 tqdm>=4.26.0
 control>=0.8.2 # for generalized random walk kernels only.
diff --git a/requirements_pypi.txt b/requirements_pypi.txt
index 3c68618..d1718a0 100644
--- a/requirements_pypi.txt
+++ b/requirements_pypi.txt
@@ -1,8 +1,8 @@
 numpy>=1.16.2
 scipy>=1.1.0
-matplotlib>=3.0.0
+matplotlib>=3.1.0
 networkx>=2.2
-scikit-learn>=0.20.0
+scikit-learn>=1.1.0
 tabulate>=0.8.2
 tqdm>=4.26.0
 control>=0.8.2 # for generalized random walk kernels only.