# Source code for metasklearn.core.search
#!/usr/bin/env python
# Created by "Thieu" at 07:11, 08/05/2025 ----------%
# Email: nguyenthieu2102@gmail.com %
# Github: https://github.com/thieu1995 %
# --------------------------------------------------%
import inspect
import pickle
import pprint
from pathlib import Path

import numpy as np
import pandas as pd
from mealpy import get_optimizer_by_class, Optimizer
from permetrics import RegressionMetric, ClassificationMetric
from sklearn.base import BaseEstimator, clone
from sklearn.metrics import get_scorer_names

from metasklearn.core.problem import HyperparameterProblem
from metasklearn.utils import validation
from metasklearn.utils.evaluation import get_all_classification_metrics, get_all_regression_metrics
class MetaSearchCV(BaseEstimator):
    """
    A metaheuristic-powered hyperparameter optimization framework for scikit-learn models.

    This class uses metaheuristic optimization algorithms from the Mealpy library to perform
    hyperparameter tuning for scikit-learn-compatible models via cross-validation.

    Parameters:
        estimator (BaseEstimator):
            The machine learning model to optimize. Must implement scikit-learn's fit/predict interface.
        param_bounds (list, tuple, or dict):
            The boundary of the hyperparameters to be optimized.
        task_type (str, default='classification'):
            The type of task: 'classification' or 'regression'. Any value other than
            'regression' is treated as 'classification'. Determines the evaluation metric used.
        optim (str or Optimizer, default='BaseGA'):
            The name of the metaheuristic algorithm to use (from Mealpy), or an Optimizer instance.
        optim_params (dict, optional):
            Dictionary of additional parameters passed to the optimizer (e.g., pop_size, epoch, etc.).
            When omitted and `optim` is a name, the optimizer is built with epoch=250, pop_size=20.
        cv (int, default=5):
            Number of cross-validation folds.
        scoring (str, optional):
            Name of the scoring metric. Can be a scikit-learn metric or a custom metric supported by permetrics.
        seed (int, optional):
            Random seed for reproducibility.
        n_jobs (int, default=1):
            Number of jobs to run in parallel during cross-validation.
        verbose (bool, default=True):
            Whether to display logs and progress during optimization.
        mode (str, default='single'):
            Execution mode for the optimizer: 'single', 'swarm', 'thread', or 'process'.
        n_workers (int, optional):
            Number of parallel workers used by the optimizer in threaded or multiprocessing mode.
        termination (dict, optional):
            Dictionary defining custom termination conditions for the optimizer.
        **kwargs:
            Additional keyword arguments passed to the internal problem definition.

    Attributes:
        best_params (dict):
            The best hyperparameter configuration found during optimization.
        best_estimator (BaseEstimator):
            A clone of the input estimator trained with the best-found parameters.
        best_score (float):
            The best evaluation score achieved during optimization.
        loss_train (list):
            A list of best scores over iterations.
        problem (HyperparameterProblem):
            Internal representation of the hyperparameter optimization problem.
    """

    SUPPORTED_CLS_METRICS = get_all_classification_metrics()
    SUPPORTED_REG_METRICS = get_all_regression_metrics()

    def __init__(self, estimator, param_bounds, task_type="classification",
                 optim="BaseGA", optim_params=None,
                 cv=5, scoring=None, seed=None, n_jobs=1, verbose=True,
                 mode='single', n_workers=None, termination=None, **kwargs):
        self.estimator = estimator
        self.param_bounds = param_bounds
        if task_type == "regression":
            self.task_type = task_type
            self.metric_class = RegressionMetric
        else:
            self.task_type = "classification"
            self.metric_class = ClassificationMetric
        # Keep the raw constructor argument under its own name: both __repr__ and
        # BaseEstimator.get_params() look up every __init__ parameter via getattr().
        self.scoring = scoring
        self.scoring_name = scoring
        if scoring in get_scorer_names():
            # scikit-learn scorers are always maximized.
            self.sklearn_score = True
            self.minmax = "max"
        else:
            self.sklearn_score = False
            if task_type == "regression":
                supported = self.SUPPORTED_REG_METRICS
            else:
                supported = self.SUPPORTED_CLS_METRICS
            self.scoring_name = validation.check_str("scoring", scoring, supported)
            # The supported-metric table maps each metric name to its optimization direction.
            self.minmax = supported[self.scoring_name]
        self.optim = optim
        self.optim_params = optim_params
        self.cv = cv
        self.seed = seed
        self.n_jobs = n_jobs
        self.verbose = verbose
        self.mode = mode
        self.n_workers = n_workers
        self.termination = termination
        self.best_params = None
        self.best_estimator = None
        self.best_score = None
        self.loss_train = None
        self.problem = None
        self.kwargs = kwargs

    def __repr__(self, **kwargs):
        """
        Returns a string representation of the MetaSearchCV instance.

        Returns:
            str: A formatted string of the instance's parameters.
        """
        names = list(inspect.signature(self.__init__).parameters)
        # Fall back to None so instances restored from older pickles (which may lack
        # a newly added attribute) can still be printed.
        values = {name: getattr(self, name, None) for name in names}
        one_line = ", ".join(f"{k}={repr(v)}" for k, v in values.items())
        if len(one_line) <= 80:
            return f"{self.__class__.__name__}({one_line})"
        formatted_params = ",\n ".join(f"{k}={pprint.pformat(v)}" for k, v in values.items())
        return f"{self.__class__.__name__}(\n {formatted_params}\n)"

    def _set_optimizer(self, optim=None, optim_params=None):
        """
        Sets the optimizer for the hyperparameter search.

        Args:
            optim: The name of the optimizer or an Optimizer instance.
            optim_params: Parameters for the optimizer. Never modified by this method.

        Returns:
            Optimizer: An instance of the optimizer.

        Raises:
            TypeError: If the `optim` parameter is not a string or Optimizer instance.
        """
        if isinstance(optim, str):
            opt_class = get_optimizer_by_class(optim)
            if isinstance(optim_params, dict):
                return opt_class(**optim_params)
            # No user-supplied settings: use a modest default search budget.
            return opt_class(epoch=250, pop_size=20)
        if isinstance(optim, Optimizer):
            if isinstance(optim_params, dict):
                # Work on a copy so the caller's dictionary keeps its "name" entry.
                params = dict(optim_params)
                if "name" in params:
                    optim.name = params.pop("name")
                optim.set_parameters(params)
            return optim
        raise TypeError("`optim` parameter needs to set as a string and supported by Mealpy library.")

    def fit(self, X, y):
        """
        Fits the model using the provided data and performs hyperparameter optimization.

        Args:
            X: The feature matrix.
            y: The target vector.

        Returns:
            MetaSearchCV: The fitted instance.
        """
        log_to = "console" if self.verbose else "None"
        # optim_params is passed through untouched (None stays None) so that the
        # default epoch/pop_size fallback in _set_optimizer remains reachable.
        self.optim = self._set_optimizer(self.optim, self.optim_params)
        self.problem = HyperparameterProblem(
            self.param_bounds, self.minmax, X, y,
            self.estimator, self.metric_class,
            obj_name=self.scoring_name, sklearn_score=self.sklearn_score,
            cv=self.cv, n_jobs=self.n_jobs, shuffle=True, seed=self.seed,
            log_to=log_to, **self.kwargs,
        )
        self.optim.solve(self.problem, mode=self.mode, n_workers=self.n_workers,
                         termination=self.termination, seed=self.seed)
        self.best_params = self.optim.problem.decode_solution(self.optim.g_best.solution)
        # Refit on a clone so the estimator supplied by the caller is not aliased
        # as best_estimator.
        self.best_estimator = clone(self.estimator).set_params(**self.best_params)
        self.best_estimator.fit(X, y)
        self.best_score = self.optim.g_best.target.fitness
        self.loss_train = self.optim.history.list_global_best_fit
        return self

    def predict(self, X):
        """
        Predicts the target values for the given feature matrix.

        Args:
            X: The feature matrix.

        Returns:
            np.ndarray: The predicted target values.

        Raises:
            ValueError: If the model is not trained.
        """
        if self.best_params is None or self.best_estimator is None:
            raise ValueError("Model is not trained, please call the fit() function.")
        return self.best_estimator.predict(X)

    def score(self, X, y):
        """
        Computes the score of the model on the given data.

        Args:
            X: The feature matrix.
            y: The target vector.

        Returns:
            float: The score of the model.

        Raises:
            ValueError: If the model is not trained.
        """
        if self.best_params is None or self.best_estimator is None:
            raise ValueError("Model is not trained, please call the fit() function.")
        return self.best_estimator.score(X, y)

    def evaluate(self, y_true, y_pred, list_metrics=("AS", "RS")):
        """
        Evaluates the model's predictions using the specified metrics.

        Args:
            y_true: The ground truth target values.
            y_pred: The predicted target values.
            list_metrics: A list of metric names to evaluate.

        Returns:
            dict: A dictionary of metric names and their corresponding values.
        """
        if self.task_type == "regression":
            rm = RegressionMetric(y_true=y_true, y_pred=y_pred)
            return rm.get_metrics_by_list_names(list_metrics)
        cm = ClassificationMetric(y_true, y_pred)
        return cm.get_metrics_by_list_names(list_metrics)

    def scores(self, X, y, list_metrics=("AS", "RS")):
        """
        Computes evaluation metrics for the model's predictions.

        Args:
            X: The feature matrix.
            y: The target vector.
            list_metrics: A list of metric names to evaluate.

        Returns:
            dict: A dictionary of metric names and their corresponding values.
        """
        y_pred = self.predict(X)
        return self.evaluate(y, y_pred, list_metrics=list_metrics)

    def save_convergence(self, save_path="history", filename="convergence.csv"):
        """
        Save the convergence (fitness value) during the training process to csv file.

        Args:
            save_path: Saved path (relative path, consider from current executed script path).
            filename: Name of the file, needs to have ".csv" extension.
        """
        Path(save_path).mkdir(parents=True, exist_ok=True)
        if self.loss_train is None:
            print(f"{self.__class__.__name__} network doesn't have training loss!")
        else:
            data = {"epoch": list(range(1, len(self.loss_train) + 1)), "loss": self.loss_train}
            pd.DataFrame(data).to_csv(Path(save_path) / filename, index=False)

    def save_performance_metrics(self, y_true, y_pred, list_metrics=("RMSE", "MAE"),
                                 save_path="history", filename="metrics.csv"):
        """
        Save evaluation metrics to csv file.

        Args:
            y_true: Ground truth data.
            y_pred: Predicted output.
            list_metrics: List of evaluation metrics.
            save_path: Saved path (relative path, consider from current executed script path).
            filename: Name of the file, needs to have ".csv" extension.
        """
        Path(save_path).mkdir(parents=True, exist_ok=True)
        # The metric computation lives on this class; the wrapped scikit-learn
        # estimator has no evaluate() method.
        results = self.evaluate(y_true, y_pred, list_metrics)
        df = pd.DataFrame.from_dict(results, orient='index').T
        df.to_csv(Path(save_path) / filename, index=False)

    def save_y_predicted(self, X, y_true, save_path="history", filename="y_predicted.csv"):
        """
        Save the predicted results to csv file.

        Args:
            X: The features data.
            y_true: The ground truth data.
            save_path: Saved path (relative path, consider from current executed script path).
            filename: Name of the file, needs to have ".csv" extension.
        """
        Path(save_path).mkdir(parents=True, exist_ok=True)
        y_pred = self.predict(X)
        data = {"y_true": np.squeeze(np.asarray(y_true)), "y_pred": np.squeeze(np.asarray(y_pred))}
        pd.DataFrame(data).to_csv(Path(save_path) / filename, index=False)

    def save_model(self, save_path="history", filename="network.pkl"):
        """
        Save this instance to a pickle file.

        Args:
            save_path: Saved path (relative path, consider from current executed script path).
            filename: Name of the file; ".pkl" is appended when it is missing.
        """
        Path(save_path).mkdir(parents=True, exist_ok=True)
        if not filename.endswith(".pkl"):
            filename += ".pkl"
        # Context manager guarantees the handle is flushed and closed.
        with open(Path(save_path) / filename, 'wb') as file:
            pickle.dump(self, file)

    @staticmethod
    def load_model(load_path="history", filename="network.pkl"):
        """
        Load a saved model from a pickle file.

        Args:
            load_path (str, default="history"): Directory containing the saved file.
            filename (str, default="network.pkl"): Name of the file; ".pkl" is appended when it is missing.

        Returns:
            The object stored in the pickle file (a MetaSearchCV instance when the
            file was written by `save_model`).
        """
        if not filename.endswith(".pkl"):
            filename += ".pkl"
        # SECURITY: unpickling can execute arbitrary code; only load files from trusted sources.
        with open(Path(load_path) / filename, 'rb') as file:
            return pickle.load(file)