Source code for metasklearn.utils.scaler

#!/usr/bin/env python
# Created by "Thieu" at 11:11, 08/05/2025 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import numpy as np
from scipy.stats import boxcox, yeojohnson
from scipy.special import inv_boxcox
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler


class OneHotEncoder:
    """
    A simple implementation of one-hot encoding for 1D categorical data.

    Attributes:
        categories_ (np.ndarray): Sorted array of unique categories fitted from
            the input data. ``None`` until ``fit`` has been called.
    """

    def __init__(self):
        """Initialize the encoder with no categories."""
        self.categories_ = None

    def fit(self, X):
        """
        Fit the encoder to the unique categories in X.

        Args:
            X (array-like): Categorical values; flattened to 1D before use.

        Returns:
            self: Fitted OneHotEncoder instance.
        """
        # np.unique returns the distinct values in sorted order, which fixes
        # the column order of the encoded output.
        self.categories_ = np.unique(np.asarray(X).ravel())
        return self

    def transform(self, X):
        """
        Transform input data into one-hot encoded format.

        Args:
            X (array-like): Categorical values; flattened to 1D before use.

        Returns:
            np.ndarray: Integer array of shape (n_samples, n_categories) with a
                single 1 per row.

        Raises:
            ValueError: If the encoder has not been fitted or an unknown
                category is found.
        """
        if self.categories_ is None:
            raise ValueError("The encoder has not been fitted yet.")
        X = np.asarray(X).ravel()
        n_categories = len(self.categories_)

        # Compare every sample with every category in one shot:
        # (n_samples, 1) == (1, n_categories) -> (n_samples, n_categories).
        matches = np.asarray(X[:, np.newaxis] == self.categories_[np.newaxis, :])
        if matches.ndim != 2:
            # Some NumPy versions collapse a comparison between incomparable
            # dtypes to a scalar; treat that as "nothing matches".
            matches = np.zeros((X.shape[0], n_categories), dtype=bool)

        known = matches.any(axis=1)
        if not known.all():
            # Report the first offending value, in input order.
            raise ValueError(f"Unknown category encountered during transform: {X[~known][0]}")
        # Categories are unique, so each row holds exactly one True.
        return matches.astype(int)

    def fit_transform(self, X):
        """
        Fit the encoder to X and transform X.

        Args:
            X (array-like): Categorical values; flattened to 1D before use.

        Returns:
            np.ndarray: One-hot encoded array of shape (n_samples, n_categories).
        """
        return self.fit(X).transform(X)

    def inverse_transform(self, one_hot):
        """
        Convert one-hot encoded data back to original categories.

        Each row is decoded to the category at the position of its largest
        value, so rows of scores/probabilities are accepted as well.

        Args:
            one_hot (array-like): 2D data of shape (n_samples, n_categories).

        Returns:
            np.ndarray: 1D array of original categorical values.

        Raises:
            ValueError: If the encoder has not been fitted or the input is not
                2D with one column per fitted category.
        """
        if self.categories_ is None:
            raise ValueError("The encoder has not been fitted yet.")
        # Accept nested lists as well as arrays.
        one_hot = np.asarray(one_hot)
        if one_hot.ndim != 2 or one_hot.shape[1] != len(self.categories_):
            raise ValueError("The shape of the input does not match the number of categories.")
        if one_hot.shape[0] == 0:
            # Nothing to decode; avoids argmax over a possibly empty axis.
            return self.categories_[:0]
        return self.categories_[np.argmax(one_hot, axis=1)]
class LabelEncoder:
    """
    Encode categorical labels as integer indices and decode them back.

    This class maps unique categorical labels to integers from 0 to n_classes - 1.
    """

    def __init__(self):
        """
        Initialize the label encoder.
        """
        # Sorted unique labels seen during fit; None marks "not fitted".
        self.unique_labels = None
        # Reverse lookup: label -> integer index.
        self.label_to_index = {}

    def fit(self, y):
        """
        Fit the encoder by finding unique labels in the input data.

        Parameters
        ----------
        y : array-like
            Input labels; flattened to 1D before use.

        Returns
        -------
        self : LabelEncoder
            Fitted LabelEncoder instance.
        """
        self.unique_labels = np.unique(np.asarray(y).ravel())
        self.label_to_index = {label: index for index, label in enumerate(self.unique_labels)}
        return self

    def transform(self, y):
        """
        Transform labels to integer indices.

        Parameters
        ----------
        y : array-like
            Labels to encode; flattened to 1D before use.

        Returns
        -------
        encoded_labels : np.ndarray
            Encoded integer labels (integer dtype, also for empty input).

        Raises
        ------
        ValueError
            If the encoder has not been fitted or unknown labels are found.
        """
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        y = np.asarray(y).ravel()
        try:
            encoded = [self.label_to_index[label] for label in y]
        except KeyError as err:
            # err.args[0] is the label that was missing from the mapping.
            raise ValueError(f"Unknown label: {err.args[0]}") from None
        # Explicit dtype so an empty input does not come back as float64.
        return np.array(encoded, dtype=int)

    def fit_transform(self, y):
        """
        Fit the encoder and transform labels in one step.

        Parameters
        ----------
        y : array-like of shape (n_samples,)
            Input labels.

        Returns
        -------
        np.ndarray
            Encoded integer labels.
        """
        return self.fit(y).transform(y)

    def inverse_transform(self, y):
        """
        Transform integer indices back to original labels.

        Indices outside ``[0, n_classes)`` are not an error: they are decoded
        to the placeholder string ``"unknown"``. NumPy then has to find a
        common dtype for the result, so numeric labels presumably come back
        as strings whenever a placeholder is present.

        Parameters
        ----------
        y : array-like of int
            Encoded integer labels; flattened to 1D before use.

        Returns
        -------
        original_labels : np.ndarray
            Original labels, with ``"unknown"`` for out-of-range indices.

        Raises
        ------
        ValueError
            If the encoder has not been fitted.
        """
        if self.unique_labels is None:
            raise ValueError("Label encoder has not been fit yet.")
        y = np.asarray(y).ravel()
        n_labels = len(self.unique_labels)
        return np.array([
            self.unique_labels[index] if 0 <= index < n_labels else "unknown"
            for index in y
        ])
class ObjectiveScaler:
    """
    For label scaler in classification (binary and multiple classification)

    The behaviour is selected by ``obj_name``:

    * ``"sigmoid"`` / ``"self"``: labels are passed through unchanged.
    * ``"hinge"``: 0/1 labels are mapped to -1/+1.
    * ``"softmax"``: labels are one-hot encoded with ``ohe_scaler``.
    """

    def __init__(self, obj_name="sigmoid", ohe_scaler=None):
        """
        obj_name: Name of the objective that decides how labels are scaled.
        ohe_scaler: Need to be an instance of One-Hot-Encoder for softmax scaler (multiple classification problem)
        """
        self.obj_name = obj_name
        self.ohe_scaler = ohe_scaler

    def transform(self, data):
        """
        Convert class labels into the target format of the objective.

        Parameters
        ----------
        data : array-like
            Class labels.

        Returns
        -------
        array-like
            ``"hinge"``: squeezed copy of the labels with every 0 replaced by -1.
            ``"softmax"``: result of ``ohe_scaler.fit_transform`` on the labels
            reshaped to a single column (the encoder is refitted on every call).
            Any other objective: ``data`` unchanged.
        """
        if self.obj_name == "hinge":
            # np.array copies, so the caller's labels are never modified.
            labels = np.squeeze(np.array(data))
            # Boolean-mask assignment also works when squeeze yields a 0-d array.
            labels[labels == 0] = -1
            return labels
        if self.obj_name == "softmax":
            return self.ohe_scaler.fit_transform(np.reshape(data, (-1, 1)))
        # "sigmoid", "self" and unrecognised objectives pass through, mirroring
        # inverse_transform instead of silently returning None.
        return data

    def inverse_transform(self, data):
        """
        Convert raw model outputs back into integer class labels.

        Parameters
        ----------
        data : array-like
            Model outputs (scores or probabilities).

        Returns
        -------
        array-like
            ``"sigmoid"``: outputs rounded to the nearest integer.
            ``"hinge"``: 1 where the score is positive, 0 otherwise.
            ``"softmax"``: index of the largest score of each sample.
            Any other objective: ``data`` unchanged.
        """
        if self.obj_name == "sigmoid":
            return np.rint(np.squeeze(np.array(data))).astype(int)
        if self.obj_name == "hinge":
            scores = np.squeeze(np.array(data))
            # Threshold at the hinge decision boundary. Unlike ceil(), this
            # keeps scores outside [-1, 1] inside the {0, 1} label set.
            return np.where(scores > 0, 1, 0)
        if self.obj_name == "softmax":
            scores = np.array(data)
            if scores.ndim != 2:
                # Drop singleton axes, but keep a leading sample axis so a
                # single sample still has an axis=1 to reduce over.
                scores = np.atleast_2d(np.squeeze(scores))
            return np.argmax(scores, axis=1)
        return data
class Log1pScaler(BaseEstimator, TransformerMixin):
    """Stateless scaler that applies ``log(1 + x)`` element-wise."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this scaler has no parameters to learn."""
        return self

    def transform(self, X):
        """Return ``np.log1p(X)``, i.e. the natural logarithm of ``1 + X``."""
        scaled = np.log1p(X)
        return scaled

    def inverse_transform(self, X):
        """Return ``np.expm1(X)``, i.e. ``exp(X) - 1``, undoing ``transform``."""
        restored = np.expm1(X)
        return restored
class LogeScaler(BaseEstimator, TransformerMixin):
    """Stateless scaler that applies the natural logarithm element-wise."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this scaler has no parameters to learn."""
        return self

    def transform(self, X):
        """Return ``np.log(X)`` (logarithm to base e)."""
        scaled = np.log(X)
        return scaled

    def inverse_transform(self, X):
        """Return ``np.exp(X)``, undoing ``transform``."""
        restored = np.exp(X)
        return restored
class SqrtScaler(BaseEstimator, TransformerMixin):
    """Stateless scaler that applies the square root element-wise."""

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this scaler has no parameters to learn."""
        return self

    def transform(self, X):
        """Return ``np.sqrt(X)`` for array-like ``X``."""
        return np.sqrt(X)

    def inverse_transform(self, X):
        """Return the element-wise square of ``X``, undoing ``transform``.

        ``np.square`` is used rather than ``X ** 2`` so that plain Python
        sequences are accepted here just as they are by ``transform``.
        """
        return np.square(X)
class BoxCoxScaler(BaseEstimator, TransformerMixin):
    """Box-Cox power transform with a single lambda shared by all values.

    Parameters
    ----------
    lmbda : float or None, default=None
        Box-Cox lambda. When ``None``, ``fit`` estimates it from the data and
        stores the estimate back on ``self.lmbda``.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def _require_lmbda(self):
        """Return ``self.lmbda`` or raise if no lambda is available yet."""
        if self.lmbda is None:
            # Local import keeps this block self-contained; NotFittedError
            # derives from both ValueError and AttributeError.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "BoxCoxScaler has no lambda yet. Call 'fit' first or pass 'lmbda' explicitly."
            )
        return self.lmbda

    def fit(self, X, y=None):
        """Estimate lambda from all values of ``X`` unless one was provided.

        Parameters
        ----------
        X : array-like
            Training data; flattened before the estimate is computed.
        y : ignored

        Returns
        -------
        self : BoxCoxScaler
        """
        if self.lmbda is None:
            # Without an explicit lmbda, boxcox returns (transformed, lambda).
            _, self.lmbda = boxcox(np.asarray(X, dtype=float).ravel())
        return self

    def transform(self, X):
        """Apply the Box-Cox transform and return data shaped like ``X``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no lambda has been provided or fitted.
        """
        lmbda = self._require_lmbda()
        X = np.asarray(X, dtype=float)
        # scipy's boxcox expects 1-D input, so flatten and restore the shape.
        return boxcox(X.ravel(), lmbda=lmbda).reshape(X.shape)

    def inverse_transform(self, X):
        """Invert the Box-Cox transform using the stored lambda.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no lambda has been provided or fitted.
        """
        return inv_boxcox(X, self._require_lmbda())
class YeoJohnsonScaler(BaseEstimator, TransformerMixin):
    """Yeo-Johnson power transform with a single lambda shared by all values.

    Parameters
    ----------
    lmbda : float or None, default=None
        Yeo-Johnson lambda. When ``None``, ``fit`` estimates it from the data
        and stores the estimate back on ``self.lmbda``.
    """

    def __init__(self, lmbda=None):
        self.lmbda = lmbda

    def _require_lmbda(self):
        """Return ``self.lmbda`` or raise if no lambda is available yet."""
        if self.lmbda is None:
            # Local import keeps this block self-contained; NotFittedError
            # derives from both ValueError and AttributeError.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "YeoJohnsonScaler has no lambda yet. Call 'fit' first or pass 'lmbda' explicitly."
            )
        return self.lmbda

    def fit(self, X, y=None):
        """Estimate lambda from all values of ``X`` unless one was provided.

        Parameters
        ----------
        X : array-like
            Training data; flattened before the estimate is computed.
        y : ignored

        Returns
        -------
        self : YeoJohnsonScaler
        """
        if self.lmbda is None:
            # Without an explicit lmbda, yeojohnson returns (transformed, lambda).
            _, self.lmbda = yeojohnson(np.asarray(X, dtype=float).ravel())
        return self

    def transform(self, X):
        """Apply the Yeo-Johnson transform and return data shaped like ``X``.

        The same transform family is used here as in ``fit`` (the previous
        code applied Box-Cox, which does not match the fitted lambda).

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no lambda has been provided or fitted.
        """
        lmbda = self._require_lmbda()
        X = np.asarray(X, dtype=float)
        # scipy's yeojohnson expects 1-D input, so flatten and restore the shape.
        return yeojohnson(X.ravel(), lmbda=lmbda).reshape(X.shape)

    def inverse_transform(self, X):
        """Invert the Yeo-Johnson transform using the stored lambda.

        The transform preserves sign, so the sign of each transformed value
        tells which branch of the forward formula produced it:

        * ``y >= 0``: ``x = (lmbda * y + 1) ** (1 / lmbda) - 1``
          (``exp(y) - 1`` when ``lmbda == 0``)
        * ``y < 0``: ``x = 1 - (1 - (2 - lmbda) * y) ** (1 / (2 - lmbda))``
          (``1 - exp(-y)`` when ``lmbda == 2``)

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If no lambda has been provided or fitted.
        """
        lmbda = self._require_lmbda()
        transformed = np.asarray(X, dtype=float)
        flat = transformed.reshape(-1)
        restored = np.empty_like(flat)

        non_negative = flat >= 0
        negative = ~non_negative
        # Tolerance for the lmbda == 0 and lmbda == 2 special cases.
        tiny = np.spacing(1.0)

        if abs(lmbda) < tiny:
            restored[non_negative] = np.expm1(flat[non_negative])
        else:
            restored[non_negative] = np.power(lmbda * flat[non_negative] + 1.0, 1.0 / lmbda) - 1.0

        if abs(lmbda - 2.0) < tiny:
            # 1 - exp(-y), written with expm1 for accuracy near zero.
            restored[negative] = -np.expm1(-flat[negative])
        else:
            restored[negative] = 1.0 - np.power(
                1.0 - (2.0 - lmbda) * flat[negative], 1.0 / (2.0 - lmbda)
            )

        return restored.reshape(transformed.shape)
class SinhArcSinhScaler(BaseEstimator, TransformerMixin):
    """Sinh-arcsinh transform controlled by ``epsilon`` and ``delta``.

    ``transform`` computes ``sinh(delta * arcsinh(x) - epsilon)`` and
    ``inverse_transform`` computes ``sinh((arcsinh(x) + epsilon) / delta)``.

    Background:
    https://stats.stackexchange.com/questions/43482/transformation-to-increase-kurtosis-and-skewness-of-normal-r-v
    """

    def __init__(self, epsilon=0.1, delta=1.0):
        self.epsilon = epsilon
        self.delta = delta

    def fit(self, X, y=None):
        """Do nothing and return ``self``; both parameters are set in the constructor."""
        return self

    def transform(self, X):
        """Return ``sinh(delta * arcsinh(X) - epsilon)`` element-wise."""
        warped = self.delta * np.arcsinh(X) - self.epsilon
        return np.sinh(warped)

    def inverse_transform(self, X):
        """Return ``sinh((arcsinh(X) + epsilon) / delta)`` element-wise, undoing ``transform``."""
        unwarped = (np.arcsinh(X) + self.epsilon) / self.delta
        return np.sinh(unwarped)