Tutorial: Classification AutoML with imputation
Welcome to the classification AutoML tutorial!
This tutorial shows how to use AutoPrognosis to learn a model for datasets with missing data. We cover two options: using a predefined imputer, or letting AutoPrognosis select the optimal imputer.
[ ]:
# stdlib
import json
import sys
import warnings
# third party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")
# autoprognosis absolute
import autoprognosis.logger as log
from autoprognosis.studies.classifiers import ClassifierStudy
[ ]:
# Register stderr as a sink for the AutoPrognosis logger, with level "INFO".
log.add(sink=sys.stderr, level="INFO")
Load toy dataset
[ ]:
# stdlib
from pathlib import Path
def get_dataset() -> pd.DataFrame:
    """Return the UCI annealing dataset, caching it on disk after the first download.

    The table is cached at ``data/anneal.csv`` relative to the current working
    directory. When the cache file is missing, the raw file is downloaded and
    written to the cache first. In both cases the returned frame is the one
    parsed from the cache file.

    Returns:
        The dataset as parsed by ``pd.read_csv`` from the cache file.
    """
    cache_dir = Path("data")
    cache_dir.mkdir(parents=True, exist_ok=True)
    bkp_file = cache_dir / "anneal.csv"

    if not bkp_file.exists():
        # The raw file has no header row, so pandas assigns integer column labels.
        raw = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data",
            header=None,
        )
        raw.to_csv(bkp_file, index=False)

    # Read from the cache even right after downloading: the cache is written
    # with a header row, so its column labels come back as strings. Returning
    # the freshly downloaded frame instead would make the first call differ
    # (integer labels) from every later, cached call.
    return pd.read_csv(bkp_file)
# Fetch the table and turn the "?" placeholders into real missing values.
df = get_dataset().replace("?", np.nan)

# The last column is used as the label; all remaining columns are features.
label_column = df.columns[-1]
y = df[label_column]
X = df.drop(columns=[label_column])
X
[ ]:
# Build a single frame holding the features plus the label under the "target" column.
dataset = X.assign(target=y)
[ ]:
# Report every feature column that contains at least one missing value.
for col in X.columns:
    column = X[col]
    n_missing = column.isna().sum()
    if n_missing == 0:
        continue

    # Rough type label: fewer than 10 distinct values (NaN counts as one
    # of them) is reported as categorical, anything else as continuous.
    if len(column.unique()) < 10:
        col_type = "categorical"
    else:
        col_type = "cont"

    miss_ratio = n_missing / len(column)
    print(
        f"NaNs ratio in col = {col} col_type = {col_type} miss ratio = {miss_ratio}"
    )
[ ]:
[ ]:
# List the classifier plugins that AutoPrognosis reports as available.
# NOTE(review): presumably these are the names accepted by the `classifiers=`
# argument of ClassifierStudy used below — confirm.
# autoprognosis absolute
from autoprognosis.plugins.prediction import Classifiers
Classifiers().list_available()
Option 1: Predefined imputer
[ ]:
# stdlib
from pathlib import Path
# Study identifier; the model is later loaded from <workspace>/<study_name>/model.p.
study_name = "test_classification_studies"

# Make sure the output directory exists before the study is created.
workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

study = ClassifierStudy(
    dataset=dataset,
    target="target",
    study_name=study_name,
    workspace=workspace,
    imputers=["mean"],  # Option 1: only the predefined "mean" imputer is listed.
    classifiers=["logistic_regression", "lda"],  # DELETE THIS LINE FOR BETTER RESULTS.
    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.
    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.
)
[ ]:
# Run the study configured in the previous cell.
# NOTE(review): presumably this performs the search and stores the resulting
# model under the workspace folder — confirm against the ClassifierStudy docs.
study.run()
[ ]:
# autoprognosis absolute
from autoprognosis.plugins.imputers import Imputers
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_estimator
# The model file is read from <workspace>/<study_name>/model.p.
# NOTE(review): presumably written there by study.run() — confirm.
model_path = workspace / study_name / "model.p"
model = load_model_from_file(model_path)
# Evaluate the loaded model on the full feature table and labels;
# X has not been imputed beforehand in this notebook.
evaluate_estimator(model, X, y)
[ ]:
# Display the name reported by the loaded model.
model.name()
Option 2: Let the optimizer find the optimal imputer
[ ]:
# stdlib
from pathlib import Path
# A separate study name keeps this run's output apart from Option 1.
study_name = "test_classification_studies_v2"
workspace = Path("workspace")

# No `imputers=` argument is passed here, unlike Option 1.
# NOTE(review): presumably ClassifierStudy then includes the imputer in its
# search — confirm the default against the ClassifierStudy docs.
study = ClassifierStudy(
    dataset=dataset,
    target="target",
    study_name=study_name,
    workspace=workspace,
    classifiers=["logistic_regression", "lda", "xgboost"],  # DELETE THIS LINE FOR BETTER RESULTS.
    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.
    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.
)
[ ]:
# Run the study configured in the previous cell.
# NOTE(review): presumably this performs the search and stores the resulting
# model under the workspace folder — confirm against the ClassifierStudy docs.
study.run()
[ ]:
# autoprognosis absolute
from autoprognosis.plugins.imputers import Imputers
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_estimator
# The model file is read from <workspace>/<study_name>/model.p.
# NOTE(review): presumably written there by study.run() — confirm.
model_path = workspace / study_name / "model.p"
model = load_model_from_file(model_path)
# Evaluate the loaded model on the full feature table and labels;
# X has not been imputed beforehand in this notebook.
evaluate_estimator(model, X, y)
[ ]:
# Display the name reported by the loaded model.
model.name()
Serialization
[ ]:
# autoprognosis absolute
from autoprognosis.utils.serialization import load_from_file, save_to_file
# Temporary file used for the save/load round trip.
out = workspace / "tmp.bkp"

# Fit the model
model.fit(X, y)

try:
    # Save
    save_to_file(out, model)

    # Reload
    loaded_model = load_from_file(out)

    # The reloaded object should report the same name as the one that was saved.
    print(loaded_model.name())
    assert loaded_model.name() == model.name()
finally:
    # Remove the temporary file even when saving, loading or the check fails,
    # so a failed run does not leave it behind in the workspace.
    if out.exists():
        out.unlink()
Congratulations!
Congratulations on completing this notebook tutorial! If you enjoyed it and would like to join the movement towards machine learning and AI for medicine, you can do so in the following ways!
Star AutoPrognosis on GitHub
The easiest way to help our community is to star the repository! This helps raise awareness of the tools we’re building.