Browse Source

[Features] Add model seletion methods with validation set: , , . Required version of scikit-learn is upgraded to 1.1.0, to support the argument of used in .

v0.2.x
jajupmochi 3 years ago
parent
commit
5eb90655a4
4 changed files with 313 additions and 3 deletions
  1. +25
    -0
      gklearn/model_selection/__init__.py
  2. +285
    -0
      gklearn/model_selection/_split.py
  3. +1
    -1
      requirements.txt
  4. +2
    -2
      requirements_pypi.txt

+ 25
- 0
gklearn/model_selection/__init__.py View File

@@ -0,0 +1,25 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 14:25:57 2022

@author: ljia
"""

from ._split import BaseCrossValidatorWithValid
# from ._split import BaseShuffleSplit
from ._split import KFoldWithValid
# from ._split import GroupKFold
# from ._split import StratifiedKFoldWithValid
# from ._split import TimeSeriesSplit
# from ._split import LeaveOneGroupOut
# from ._split import LeaveOneOut
# from ._split import LeavePGroupsOut
# from ._split import LeavePOut
from ._split import RepeatedKFoldWithValid
# from ._split import RepeatedStratifiedKFold
# from ._split import ShuffleSplit
# from ._split import GroupShuffleSplit
# from ._split import StratifiedShuffleSplit
# from ._split import StratifiedGroupKFold
# from ._split import PredefinedSplit

+ 285
- 0
gklearn/model_selection/_split.py View File

@@ -0,0 +1,285 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 24 11:13:26 2022

@author: ljia
"""
from abc import abstractmethod
import numbers
import warnings
import numpy as np
from sklearn.utils import check_random_state, check_array, column_or_1d, indexable
from sklearn.utils.validation import _num_samples
from sklearn.utils.multiclass import type_of_target


class BaseCrossValidatorWithValid(object):
"""Base class for all cross-validators.
Implementations must define `_iter_valid_test_masks` or `_iter_valid_stest_indices`.
"""

def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid, and test set.

Parameters
----------

X : array-like of shape (n_samples, n_features)
Training data, where `n_samples` is the number of samples
and `n_features` is the number of features.

y : array-like of shape (n_samples,)
The target variable for supervised learning problems.

groups : array-like of shape (n_samples,), default=None
Group labels for the samples used while splitting the dataset into
train/test set.

Yields
------
train : ndarray
The training set indices for that split.

valid : ndarray
The valid set indices for that split.

test : ndarray
The testing set indices for that split.
"""
X, y, groups = indexable(X, y, groups)
indices = np.arange(_num_samples(X))
for valid_index, test_index in self._iter_valid_test_masks(X, y, groups):
train_index = indices[np.logical_not(np.logical_or(valid_index, test_index))]
valid_index = indices[valid_index]
test_index = indices[test_index]
yield train_index, valid_index, test_index


# Since subclasses must implement either _iter_valid_test_masks or
# _iter_valid_test_indices, neither can be abstract.
def _iter_valid_test_masks(self, X=None, y=None, groups=None):
"""Generates boolean masks corresponding to valid and test sets.
By default, delegates to _iter_valid_test_indices(X, y, groups)
"""
for valid_index, test_index in self._iter_valid_test_indices(X, y, groups):
valid_mask = np.zeros(_num_samples(X), dtype=bool)
test_mask = np.zeros(_num_samples(X), dtype=bool)
valid_mask[valid_index] = True
test_mask[test_index] = True
yield valid_mask, test_mask


def _iter_valid_test_indices(self, X=None, y=None, groups=None):
"""Generates integer indices corresponding to valid and test sets."""
raise NotImplementedError


@abstractmethod
def get_n_splits(self, X=None, y=None, groups=None):
"""Returns the number of splitting iterations in the cross-validator"""


def __repr__(self):
return _build_repr(self)


class _BaseKFoldWithValid(BaseCrossValidatorWithValid):
"""Base class for KFold, GroupKFold, and StratifiedKFold"""

@abstractmethod
def __init__(self, n_splits, *, stratify, shuffle, random_state):
if not isinstance(n_splits, numbers.Integral):
raise ValueError(
'The number of folds must be of Integral type. '
'%s of type %s was passed.' % (n_splits, type(n_splits))
)
n_splits = int(n_splits)

if n_splits <= 2:
raise ValueError(
'k-fold cross-validation requires at least one'
' train/valid/test split by setting n_splits=3 or more,'
' got n_splits={0}.'.format(n_splits)
)

if not isinstance(shuffle, bool):
raise TypeError('shuffle must be True or False; got {0}'.format(shuffle))

if not shuffle and random_state is not None: # None is the default
raise ValueError(
'Setting a random_state has no effect since shuffle is '
'False. You should leave '
'random_state to its default (None), or set shuffle=True.',
)

self.n_splits = n_splits
self.stratify = stratify
self.shuffle = shuffle
self.random_state = random_state


def split(self, X, y=None, groups=None):
"""Generate indices to split data into training, valid and test set."""
X, y, groups = indexable(X, y, groups)
n_samples = _num_samples(X)
if self.n_splits > n_samples:
raise ValueError(
(
'Cannot have number of splits n_splits={0} greater'
' than the number of samples: n_samples={1}.'
).format(self.n_splits, n_samples)
)

for train, valid, test in super().split(X, y, groups):
yield train, valid, test


class KFoldWithValid(_BaseKFoldWithValid):


def __init__(
self,
n_splits=5,
*,
stratify=False,
shuffle=False,
random_state=None
):
super().__init__(
n_splits=n_splits,
stratify=stratify,
shuffle=shuffle,
random_state=random_state
)


def _make_valid_test_folds(self, X, y=None):
rng = check_random_state(self.random_state)
y = np.asarray(y)
type_of_target_y = type_of_target(y)
allowed_target_types = ('binary', 'multiclass')
if type_of_target_y not in allowed_target_types:
raise ValueError(
'Supported target types are: {}. Got {!r} instead.'.format(
allowed_target_types, type_of_target_y
)
)

y = column_or_1d(y)

_, y_idx, y_inv = np.unique(y, return_index=True, return_inverse=True)
# y_inv encodes y according to lexicographic order. We invert y_idx to
# map the classes so that they are encoded by order of appearance:
# 0 represents the first label appearing in y, 1 the second, etc.
_, class_perm = np.unique(y_idx, return_inverse=True)
y_encoded = class_perm[y_inv]

n_classes = len(y_idx)
y_counts = np.bincount(y_encoded)
min_groups = np.min(y_counts)
if np.all(self.n_splits > y_counts):
raise ValueError(
"n_splits=%d cannot be greater than the"
" number of members in each class." % (self.n_splits)
)
if self.n_splits > min_groups:
warnings.warn(
"The least populated class in y has only %d"
" members, which is less than n_splits=%d."
% (min_groups, self.n_splits),
UserWarning,
)

# Determine the optimal number of samples from each class in each fold,
# using round robin over the sorted y. (This can be done direct from
# counts, but that code is unreadable.)
y_order = np.sort(y_encoded)
allocation = np.asarray(
[
np.bincount(y_order[i :: self.n_splits], minlength=n_classes)
for i in range(self.n_splits)
]
)

# To maintain the data order dependencies as best as possible within
# the stratification constraint, we assign samples from each class in
# blocks (and then mess that up when shuffle=True).
test_folds = np.empty(len(y), dtype='i')
for k in range(n_classes):
# since the kth column of allocation stores the number of samples
# of class k in each test set, this generates blocks of fold
# indices corresponding to the allocation for class k.
folds_for_class = np.arange(self.n_splits).repeat(allocation[:, k])
if self.shuffle:
rng.shuffle(folds_for_class)
test_folds[y_encoded == k] = folds_for_class
return test_folds


def _iter_valid_test_masks(self, X, y=None, groups=None):
test_folds = self._make_valid_test_folds(X, y)
for i in range(self.n_splits):
if i + 1 < self.n_splits:
j = i + 1
else:
j = 0
yield test_folds == i, test_folds == j


def split(self, X, y, groups=None):
y = check_array(y, input_name='y', ensure_2d=False, dtype=None)
return super().split(X, y, groups)


class _RepeatedSplitsWithValid(object):


def __init__(
self,
cv,
*,
n_repeats=10,
random_state=None,
**cvargs
):
if not isinstance(n_repeats, int):
raise ValueError('Number of repetitions must be of integer type.')

if n_repeats <= 0:
raise ValueError('Number of repetitions must be greater than 0.')

self.cv = cv
self.n_repeats = n_repeats
self.random_state = random_state
self.cvargs = cvargs


def split(self, X, y=None, groups=None):
n_repeats = self.n_repeats
rng = check_random_state(self.random_state)

for idx in range(n_repeats):
cv = self.cv(random_state=rng, shuffle=True, **self.cvargs)
for train_index, valid_index, test_index in cv.split(X, y, groups):
yield train_index, valid_index, test_index


class RepeatedKFoldWithValid(_RepeatedSplitsWithValid):


def __init__(
self,
*,
n_splits=5,
n_repeats=10,
stratify=False,
random_state=None
):
super().__init__(
KFoldWithValid,
n_repeats=n_repeats,
stratify=stratify,
random_state=random_state,
n_splits=n_splits,
)

+ 1
- 1
requirements.txt View File

@@ -2,7 +2,7 @@ numpy>=1.16.2
scipy>=1.1.0 scipy>=1.1.0
matplotlib>=3.1.0 matplotlib>=3.1.0
networkx>=2.2 networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2 tabulate>=0.8.2
tqdm>=4.26.0 tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only. control>=0.8.2 # for generalized random walk kernels only.


+ 2
- 2
requirements_pypi.txt View File

@@ -1,8 +1,8 @@
numpy>=1.16.2 numpy>=1.16.2
scipy>=1.1.0 scipy>=1.1.0
matplotlib>=3.0.0
matplotlib>=3.1.0
networkx>=2.2 networkx>=2.2
scikit-learn>=0.20.0
scikit-learn>=1.1.0
tabulate>=0.8.2 tabulate>=0.8.2
tqdm>=4.26.0 tqdm>=4.26.0
control>=0.8.2 # for generalized random walk kernels only. control>=0.8.2 # for generalized random walk kernels only.


Loading…
Cancel
Save