Tutorial: Classification AutoML with imputation

Welcome to the classification AutoML tutorial!

This tutorial shows how to use AutoPrognosis to learn a model for datasets with missing data. We cover two options: using a predefined imputer, or letting AutoPrognosis select the optimal imputer.

[ ]:
# stdlib
import json
import sys
import warnings

# third party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# autoprognosis absolute
import autoprognosis.logger as log
from autoprognosis.studies.classifiers import ClassifierStudy
[ ]:
# Register stderr as a sink for the autoprognosis logger, with level "INFO".
log.add(sink=sys.stderr, level="INFO")

Load toy dataset

[ ]:
# stdlib
from pathlib import Path


def get_dataset() -> pd.DataFrame:
    """Load the UCI "annealing" dataset, caching it on disk.

    The first call downloads the raw data file (read without a header row)
    and stores a copy in ``data/anneal.csv``; later calls read that copy
    instead of going back to the network.

    Returns:
        The dataset as a DataFrame. When the cache was written by this
        function, the column labels are the strings "0", "1", ... on both
        the download path and the cache path.
    """
    data_dir = Path("data")
    data_dir.mkdir(parents=True, exist_ok=True)
    bkp_file = data_dir / "anneal.csv"

    if bkp_file.exists():
        return pd.read_csv(bkp_file)

    df = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data",
        header=None,
    )
    # header=None produces integer column labels, but the cached CSV is read
    # back with string labels. Normalise here so the first run and every later
    # run return identically labelled frames.
    df.columns = df.columns.astype(str)
    df.to_csv(bkp_file, index=False)

    return df


# Load the table and turn the "?" placeholders into real NaN values.
df = get_dataset().replace("?", np.nan)

# The last column is used as the label; every other column is a feature.
target_col = df.columns[-1]
y = df[target_col]
X = df.drop(columns=[target_col])

X
[ ]:
# Features plus the label under the column name "target", as a new frame
# (X itself is left untouched).
dataset = X.assign(target=y)
[ ]:
# Report every feature column that contains at least one missing value.
for col in X.columns:
    column = X[col]
    if not column.isna().any():
        continue

    # Fewer than 10 distinct values -> labelled "categorical", otherwise "cont".
    col_type = "cont" if len(column.unique()) >= 10 else "categorical"
    print(
        f"NaNs ratio in col = {col} col_type = {col_type} miss ratio = {X[col].isna().sum() / len(X[col])}"
    )
[ ]:

[ ]:
# List available classifiers
# (the call is the last expression of the cell, so the notebook displays its result)

# autoprognosis absolute
from autoprognosis.plugins.prediction import Classifiers

Classifiers().list_available()

Option 1: Predefined imputer

[ ]:
# stdlib
from pathlib import Path

# Name of this study; the model file is later read from workspace / study_name.
study_name = "test_classification_studies"

# Working directory, also used by the other study in this tutorial.
workspace = Path("workspace")
workspace.mkdir(parents=True, exist_ok=True)

# Option 1: the imputer is fixed up front via `imputers`.
study = ClassifierStudy(
    dataset=dataset,
    target="target",
    study_name=study_name,
    workspace=workspace,
    imputers=["mean"],
    classifiers=["logistic_regression", "lda"],  # DELETE THIS LINE FOR BETTER RESULTS.
    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.
    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.
)
[ ]:
# Run the study configured in the previous cell (Option 1: predefined imputer).
study.run()
[ ]:
# autoprognosis absolute
from autoprognosis.plugins.imputers import Imputers
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_estimator

# The model file is looked up under the study's own folder inside the workspace.
model_path = workspace.joinpath(study_name, "model.p")
model = load_model_from_file(model_path)

# Last expression of the cell: the evaluation result is shown as the output.
evaluate_estimator(model, X, y)
[ ]:
# Show the name of the model loaded in the previous cell (displayed as the cell output).
model.name()

Option 2: Let the optimizer find the optimal imputer

[ ]:
# stdlib
from pathlib import Path

workspace = Path("workspace")
# Create the workspace here as well, so this cell does not depend on the
# "Option 1" cell having been run first.
workspace.mkdir(parents=True, exist_ok=True)

study_name = "test_classification_studies_v2"

# No `imputers` argument is passed here: per this section's heading, the
# choice of imputer is left to the optimizer.
study = ClassifierStudy(
    study_name=study_name,
    dataset=dataset,
    target="target",
    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.
    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.
    # DELETE THE WHOLE `classifiers` ARGUMENT BELOW FOR BETTER RESULTS.
    classifiers=[
        "logistic_regression",
        "lda",
        "xgboost",
    ],
    workspace=workspace,
)
[ ]:
# Run the study configured in the previous cell (Option 2: imputer left to the optimizer).
study.run()
[ ]:
# autoprognosis absolute
from autoprognosis.plugins.imputers import Imputers
from autoprognosis.utils.serialization import load_model_from_file
from autoprognosis.utils.tester import evaluate_estimator

# The model file is looked up under the study's own folder inside the workspace.
model_path = workspace.joinpath(study_name, "model.p")
model = load_model_from_file(model_path)

# Last expression of the cell: the evaluation result is shown as the output.
evaluate_estimator(model, X, y)
[ ]:
# Show the name of the model loaded in the previous cell (displayed as the cell output).
model.name()

Serialization

[ ]:
# autoprognosis absolute
from autoprognosis.utils.serialization import load_from_file, save_to_file

# Temporary file used only for this save/load round trip.
out = workspace / "tmp.bkp"
# Fit the model
model.fit(X, y)

try:
    # Save
    save_to_file(out, model)

    # Reload
    loaded_model = load_from_file(out)

    print(loaded_model.name())

    # Sanity check: the reloaded model reports the same name as the original.
    assert loaded_model.name() == model.name()
finally:
    # Remove the temporary file even if saving, loading or the check failed.
    if out.exists():
        out.unlink()

Congratulations!

Congratulations on completing this notebook tutorial! If you enjoyed it and would like to join the movement towards machine learning and AI for medicine, you can do so in the following ways:

Star AutoPrognosis on GitHub

The easiest way to help our community is to star the repositories! This helps raise awareness of the tools we’re building.