Tutorial: Simulating multiple imputation (MICE) using AutoPrognosis

Welcome to the classification AutoML tutorial!

This tutorial will show how to use AutoPrognosis and multiple imputation to learn a model for datasets with missing data.

[ ]:
# stdlib
import json
import sys
import warnings

# third party
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")

# autoprognosis absolute
import autoprognosis.logger as log
from autoprognosis.studies.classifiers import ClassifierStudy
[ ]:
# Register stderr as a sink for the AutoPrognosis logger, with the level set to INFO.
log.add(sink=sys.stderr, level="INFO")

Load toy dataset

[ ]:
# stdlib
from pathlib import Path


def get_dataset() -> pd.DataFrame:
    """Return the UCI annealing dataset, downloading it on first use.

    The downloaded file is cached as ``data/anneal.csv`` (relative to the
    current working directory).  The returned frame is always parsed from
    that cache file, so a fresh download and a later cached call yield the
    same column labels (the strings ``"0"``, ``"1"``, ...) instead of
    integer labels on the first run and string labels afterwards.

    Returns:
        The dataset exactly as stored in the cache file; no cleaning is
        applied here.
    """
    cache_dir = Path("data")
    cache_dir.mkdir(parents=True, exist_ok=True)
    bkp_file = cache_dir / "anneal.csv"

    if not bkp_file.exists():
        # The upstream file has no header row, hence ``header=None``; the
        # positional labels pandas assigns become the header of the cache.
        downloaded = pd.read_csv(
            "https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data",
            header=None,
        )
        downloaded.to_csv(bkp_file, index=False)

    # Single read path for both the fresh and the cached case.
    return pd.read_csv(bkp_file)


# Load the data and turn the "?" placeholders into real missing values.
df = get_dataset().replace("?", np.nan)

# The last column is used as the label; every other column is a feature.
target_col = df.columns[-1]
X = df.drop(columns=target_col)
y = df[target_col]

X
[ ]:
# Features plus the label, appended as a final "target" column, in a new frame.
dataset = X.assign(target=y)
[ ]:
# For every feature that has missing values, print a rough type guess and
# the fraction of rows that are missing.
for col in X.columns:
    n_missing = X[col].isna().sum()
    if n_missing == 0:
        continue

    # nunique() skips NaN, so the missing marker is not counted as one of the
    # categories when comparing against the "fewer than 10 values" heuristic.
    col_type = "categorical" if X[col].nunique() < 10 else "cont"
    print(
        f"NaNs ratio in col = {col} col_type = {col_type} miss ratio = {n_missing / len(X[col])}"
    )
[ ]:

[ ]:
# List available classifiers
# NOTE(review): the names shown are presumably the values accepted by the
# `classifiers=` argument of ClassifierStudy — confirm against the library docs.

# autoprognosis absolute
from autoprognosis.plugins.prediction import Classifiers

Classifiers().list_available()

Search for a model with the ICE imputer

[ ]:
# stdlib
from pathlib import Path

# Directory handed to the study; the fitted model is later loaded from
# workspace / study_name / "model.p".
workspace = Path("workspace")
workspace.mkdir(exist_ok=True, parents=True)

study_name = "test_classification_studies_mice"

study = ClassifierStudy(
    study_name=study_name,
    dataset=dataset,
    target="target",
    # Using chained equations. The "missforest" or "hyperimpute" plugins can
    # be used here as well.
    imputers=["ice"],
    # The next three settings keep the demo short.
    # DELETE THESE LINES FOR BETTER RESULTS.
    num_iter=10,
    num_study_iter=1,
    classifiers=["logistic_regression", "lda"],
    workspace=workspace,
)
study.run()

Train the model template using multiple random seeds

[ ]:
# autoprognosis absolute
from autoprognosis.plugins.imputers import Imputers
from autoprognosis.utils.serialization import load_model_from_file

# Location of the serialized model inside the study's workspace folder.
# NOTE(review): presumably written by `study.run()`; the ".p" suffix suggests
# a pickle-based format — if so, only load model files from trusted sources.
model_path = workspace / study_name / "model.p"

model = load_model_from_file(model_path)

# Display the name of the loaded model.
model.name()
[ ]:
# autoprognosis absolute
from autoprognosis.utils.distributions import enable_reproducible_results
from autoprognosis.utils.tester import evaluate_estimator_multiple_seeds

# Evaluate the loaded model on (X, y) using the seeds 0 through 4.
# NOTE(review): presumably the estimator is re-fitted once per seed and the
# metrics are aggregated across seeds — confirm against the library docs.
score = evaluate_estimator_multiple_seeds(model, X, y, seeds=list(range(5)))
[ ]:
score

Congratulations!

Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the movement towards machine learning and AI for medicine, you can do so in the following ways!

Star AutoPrognosis on GitHub

The easiest way to help our community is just by starring the repositories! This helps raise awareness of the tools we’re building.