{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "623a0448",
   "metadata": {},
   "source": [
    "# Tutorial: Classification AutoML with imputation\n",
    "\n",
    "Welcome to the classification AutoML tutorial!\n",
    "\n",
    "This tutorial will show how to use AutoPrognosis to learn a model for datasets with missing data. We show how to use a predefined imputer or how to use AutoPrognosis to select the optimal imputer."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3dc094",
   "metadata": {},
   "outputs": [],
   "source": [
    "# stdlib\n",
    "import json\n",
    "import sys\n",
    "import warnings\n",
    "from pathlib import Path\n",
    "\n",
    "# third party\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "# Installed before the autoprognosis imports, so the filter is already\n",
    "# active while those modules are being imported.\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# autoprognosis absolute\n",
    "import autoprognosis.logger as log\n",
    "from autoprognosis.plugins.imputers import Imputers\n",
    "from autoprognosis.plugins.prediction import Classifiers\n",
    "from autoprognosis.studies.classifiers import ClassifierStudy\n",
    "from autoprognosis.utils.serialization import (\n",
    "    load_from_file,\n",
    "    load_model_from_file,\n",
    "    save_to_file,\n",
    ")\n",
    "from autoprognosis.utils.tester import evaluate_estimator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "15ec7b75",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Send AutoPrognosis log records at INFO level and above to stderr.\n",
    "log.add(sink=sys.stderr, level=\"INFO\")\n",
    "\n",
    "# Single workspace directory shared by every study in this notebook.\n",
    "workspace = Path(\"workspace\")\n",
    "workspace.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f0827f5",
   "metadata": {},
   "source": [
    "## Load toy dataset\n",
    "\n",
    "The dataset is downloaded from the UCI repository on first use and cached under `data/`. Entries equal to `?` are converted to `NaN`, and the last column is used as the prediction target."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c45ad99e",
   "metadata": {},
   "outputs": [],
   "source": [
    "DATASET_URL = \"https://archive.ics.uci.edu/ml/machine-learning-databases/annealing/anneal.data\"\n",
    "\n",
    "\n",
    "def get_dataset() -> pd.DataFrame:\n",
    "    \"\"\"Return the raw annealing dataset, downloading it on first use.\n",
    "\n",
    "    The download is cached in ``data/anneal.csv``. The frame is always read\n",
    "    back from that cache file, so the column labels are the same on the\n",
    "    first run as on every later run (a direct ``header=None`` download has\n",
    "    integer labels, while the cached CSV yields string labels).\n",
    "\n",
    "    Returns:\n",
    "        The dataset as stored in the cache file, with \"?\" entries untouched.\n",
    "    \"\"\"\n",
    "    cache_dir = Path(\"data\")\n",
    "    cache_dir.mkdir(parents=True, exist_ok=True)\n",
    "    bkp_file = cache_dir / \"anneal.csv\"\n",
    "\n",
    "    if not bkp_file.exists():\n",
    "        # The remote file is read without a header row.\n",
    "        downloaded = pd.read_csv(DATASET_URL, header=None)\n",
    "        downloaded.to_csv(bkp_file, index=False)\n",
    "\n",
    "    return pd.read_csv(bkp_file)\n",
    "\n",
    "\n",
    "raw_df = get_dataset()\n",
    "\n",
    "# Treat \"?\" entries as missing values.\n",
    "df = raw_df.replace(\"?\", np.nan)\n",
    "\n",
    "# The last column is the target; everything else is a feature.\n",
    "target_column = df.columns[-1]\n",
    "X = df.drop(columns=[target_column])\n",
    "y = df[target_column]\n",
    "\n",
    "X.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fe8bcb2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ClassifierStudy takes one frame holding both the features and the target.\n",
    "dataset = X.copy()\n",
    "dataset[\"target\"] = y"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e1c2a4",
   "metadata": {},
   "source": [
    "## Inspect missing values\n",
    "\n",
    "For every feature that contains at least one `NaN`, print a rough column type and the ratio of missing entries."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "695eb4cf",
   "metadata": {},
   "outputs": [],
   "source": [
    "for col in X.columns:\n",
    "    missing_count = X[col].isna().sum()\n",
    "    if missing_count == 0:\n",
    "        continue\n",
    "\n",
    "    # Heuristic: fewer than 10 distinct values (NaN included) => categorical.\n",
    "    col_type = \"categorical\" if len(X[col].unique()) < 10 else \"cont\"\n",
    "    miss_ratio = missing_count / len(X[col])\n",
    "    print(\n",
    "        f\"NaNs ratio in col = {col} col_type = {col_type} miss ratio = {miss_ratio}\"\n",
    "    )"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d4f09a3e",
   "metadata": {},
   "source": [
    "## List available classifiers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "300dccdf",
   "metadata": {},
   "outputs": [],
   "source": [
    "Classifiers().list_available()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "343a2ff6",
   "metadata": {},
   "source": [
    "## Option 1: Predefined imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "69b0146a",
   "metadata": {},
   "outputs": [],
   "source": [
    "study_name = \"test_classification_studies\"\n",
    "\n",
    "study = ClassifierStudy(\n",
    "    study_name=study_name,\n",
    "    dataset=dataset,\n",
    "    target=\"target\",\n",
    "    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    imputers=[\"mean\"],\n",
    "    classifiers=[\"logistic_regression\", \"lda\"],  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    workspace=workspace,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "33d2219c",
   "metadata": {},
   "outputs": [],
   "source": [
    "study.run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2903d9b5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model is loaded from <workspace>/<study_name>/model.p.\n",
    "model_path = workspace / study_name / \"model.p\"\n",
    "\n",
    "model = load_model_from_file(model_path)\n",
    "\n",
    "evaluate_estimator(model, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9b298666",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.name()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2680f42",
   "metadata": {},
   "source": [
    "## Option 2: Let the optimizer find the optimal imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "072f65a4",
   "metadata": {},
   "outputs": [],
   "source": [
    "study_name = \"test_classification_studies_v2\"\n",
    "\n",
    "# No `imputers` argument is passed here, unlike in Option 1.\n",
    "study = ClassifierStudy(\n",
    "    study_name=study_name,\n",
    "    dataset=dataset,\n",
    "    target=\"target\",\n",
    "    num_iter=10,  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    num_study_iter=1,  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    classifiers=[\n",
    "        \"logistic_regression\",\n",
    "        \"lda\",\n",
    "        \"xgboost\",\n",
    "    ],  # DELETE THIS LINE FOR BETTER RESULTS.\n",
    "    workspace=workspace,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6ebb79",
   "metadata": {},
   "outputs": [],
   "source": [
    "study.run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6a637d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# The model is loaded from <workspace>/<study_name>/model.p.\n",
    "model_path = workspace / study_name / \"model.p\"\n",
    "\n",
    "model = load_model_from_file(model_path)\n",
    "\n",
    "evaluate_estimator(model, X, y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01ce2d88",
   "metadata": {},
   "outputs": [],
   "source": [
    "model.name()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5c06af28",
   "metadata": {},
   "source": [
    "## Serialization"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "43472e67",
   "metadata": {},
   "outputs": [],
   "source": [
    "out = workspace / \"tmp.bkp\"\n",
    "\n",
    "# Fit the model\n",
    "model.fit(X, y)\n",
    "\n",
    "# Save\n",
    "save_to_file(out, model)\n",
    "\n",
    "# Reload\n",
    "loaded_model = load_from_file(out)\n",
    "\n",
    "print(loaded_model.name())\n",
    "\n",
    "# The reloaded model must report the same name as the one that was saved.\n",
    "assert loaded_model.name() == model.name()\n",
    "\n",
    "# Remove the temporary backup file.\n",
    "out.unlink()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8f69a68",
   "metadata": {},
   "source": [
    "## Congratulations!\n",
    "\n",
    "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to join the movement towards machine learning and AI for medicine, you can do so in the following ways!\n",
    "\n",
    "### Star AutoPrognosis on GitHub\n",
    "\n",
    "The easiest way to help our community is just by starring the repos! This helps raise awareness of the tools we're building.\n",
    "\n",
    "- [Star AutoPrognosis](https://github.com/vanderschaarlab/autoprognosis)\n",
    "- [Star HyperImpute](https://github.com/vanderschaarlab/hyperimpute)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}