{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "w5AUPMeLOtF4",
   "metadata": {
    "id": "w5AUPMeLOtF4"
   },
   "source": [
    "# AutoPrognosis - Tutorial on using classifiers with explainers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "u4Z5N2rbKtOX",
   "metadata": {
    "id": "u4Z5N2rbKtOX"
   },
   "outputs": [],
   "source": [
    "# Install AutoPrognosis into the environment of the running kernel\n",
    "# (%pip targets the kernel's interpreter, unlike a `!pip` subshell).\n",
    "%pip install autoprognosis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d3dc094",
   "metadata": {
    "id": "8d3dc094"
   },
   "outputs": [],
   "source": [
    "# stdlib\n",
    "import json\n",
    "import sys\n",
    "import warnings\n",
    "\n",
    "# third party\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "warnings.filterwarnings(\"ignore\")\n",
    "\n",
    "# autoprognosis absolute\n",
    "import autoprognosis.logger as log\n",
    "from autoprognosis.studies.classifiers import ClassifierStudy\n",
    "\n",
    "log.add(sink=sys.stderr, level=\"INFO\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2f0827f5",
   "metadata": {
    "id": "2f0827f5"
   },
   "source": [
    "## Load dataset\n",
    "\n",
    "AutoPrognosis expects pandas.DataFrames as input.\n",
    "\n",
    "For this example, we will use the [Breast Cancer Wisconsin Dataset](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic))."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c45ad99e",
   "metadata": {
    "id": "c45ad99e"
   },
   "outputs": [],
   "source": [
    "# third party\n",
    "from sklearn.datasets import load_breast_cancer\n",
    "\n",
    "# Load the features (X) and labels (Y) as pandas objects, the input format\n",
    "# AutoPrognosis expects.\n",
    "X, Y = load_breast_cancer(as_frame=True, return_X_y=True)\n",
    "\n",
    "# Preview the feature table\n",
    "X"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2680f42",
   "metadata": {
    "id": "e2680f42"
   },
   "source": [
    "## Run a study with AutoPrognosis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "qRYeNuCB_5KK",
   "metadata": {
    "id": "qRYeNuCB_5KK"
   },
   "outputs": [],
   "source": [
    "# Build the study table: a new frame holding the features plus the labels in a\n",
    "# \"target\" column. X itself is left unmodified.\n",
    "dataset = X.assign(target=Y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "300dccdf",
   "metadata": {
    "id": "300dccdf"
   },
   "outputs": [],
   "source": [
    "# List available classifiers\n",
    "\n",
    "# autoprognosis absolute\n",
    "from autoprognosis.plugins.prediction import Classifiers\n",
    "\n",
    "Classifiers().list_available()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "072f65a4",
   "metadata": {
    "id": "072f65a4"
   },
   "outputs": [],
   "source": [
    "# stdlib\n",
    "from pathlib import Path\n",
    "\n",
    "# The fitted model is later read back from workspace / study_name.\n",
    "study_name = \"test_classification_studies\"\n",
    "workspace = Path(\"workspace\")\n",
    "\n",
    "study = ClassifierStudy(\n",
    "    study_name=study_name,\n",
    "    workspace=workspace,\n",
    "    dataset=dataset,\n",
    "    target=\"target\",\n",
    "    imputers=[],  # Dataset is complete, so imputation not necessary\n",
    "    feature_scaling=[],\n",
    "    score_threshold=0.4,\n",
    "    num_iter=100,  # Delete this line for better results.\n",
    "    num_study_iter=1,  # Delete this line for better results.\n",
    "    # Delete the whole `classifiers` argument below for better results.\n",
    "    classifiers=[\n",
    "        \"logistic_regression\",\n",
    "        \"perceptron\",\n",
    "        \"xgboost\",\n",
    "        \"decision_trees\",\n",
    "    ],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6e6ebb79",
   "metadata": {
    "id": "6e6ebb79"
   },
   "outputs": [],
   "source": [
    "study.run()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c6a637d3",
   "metadata": {
    "id": "c6a637d3"
   },
   "outputs": [],
   "source": [
    "# autoprognosis absolute\n",
    "from autoprognosis.utils.serialization import load_model_from_file\n",
    "from autoprognosis.utils.tester import evaluate_estimator\n",
    "\n",
    "model_path = workspace / study_name / \"model.p\"\n",
    "\n",
    "model = load_model_from_file(model_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "qd8rfL0mccvh",
   "metadata": {
    "id": "qd8rfL0mccvh"
   },
   "outputs": [],
   "source": [
    "model.name()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "mjUFQ_pC2PnK",
   "metadata": {
    "id": "mjUFQ_pC2PnK"
   },
   "outputs": [],
   "source": [
    "evaluate_estimator(model, X, Y)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "GLg-FyZxPaf2",
   "metadata": {
    "id": "GLg-FyZxPaf2"
   },
   "source": [
    "## Interpretability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "xUWeCd9A4IQe",
   "metadata": {
    "id": "xUWeCd9A4IQe"
   },
   "outputs": [],
   "source": [
    "# autoprognosis absolute\n",
    "from autoprognosis.plugins.explainers import Explainers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aLNGhYpIPcR9",
   "metadata": {
    "id": "aLNGhYpIPcR9"
   },
   "outputs": [],
   "source": [
    "# Explain using Kernel SHAP\n",
    "explainer = Explainers().get(\n",
    "    \"kernel_shap\",\n",
    "    model,\n",
    "    X,\n",
    "    Y,\n",
    "    feature_names=X.columns,\n",
    "    task_type=\"classification\",\n",
    ")\n",
    "\n",
    "# Plot the explanation for a 10% sample of the rows. The fixed seed keeps the\n",
    "# sampled rows, and therefore the plot, the same on every run.\n",
    "explainer.plot(X.sample(frac=0.1, random_state=0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "YAQKycSE3w9k",
   "metadata": {
    "id": "YAQKycSE3w9k"
   },
   "outputs": [],
   "source": [
    "# Explain using Risk Effect Size\n",
    "explainer = Explainers().get(\n",
    "    \"risk_effect_size\",\n",
    "    model,\n",
    "    X,\n",
    "    Y,\n",
    "    task_type=\"classification\",\n",
    ")\n",
    "\n",
    "explainer.plot(X)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "GSwQAjwWmjrF",
   "metadata": {
    "id": "GSwQAjwWmjrF"
   },
   "source": [
    "## Value of information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3PtldUZOkjRk",
   "metadata": {
    "id": "3PtldUZOkjRk"
   },
   "outputs": [],
   "source": [
    "def evaluate_for_effect_size(effect_size):\n",
    "    \"\"\"List the features reported by the risk-effect-size explainer.\n",
    "\n",
    "    Builds a \"risk_effect_size\" explainer around the notebook-level `model`,\n",
    "    `X` and `Y`, runs it on `X` with the given `effect_size`, and returns the\n",
    "    index labels of the resulting explanation as a plain list (presumably the\n",
    "    names of the features that pass the effect-size threshold).\n",
    "    \"\"\"\n",
    "    risk_explainer = Explainers().get(\n",
    "        \"risk_effect_size\",\n",
    "        model,\n",
    "        X,\n",
    "        Y,\n",
    "        task_type=\"classification\",\n",
    "        effect_size=effect_size,\n",
    "    )\n",
    "    explanation = risk_explainer.explain(X, effect_size)\n",
    "\n",
    "    return explanation.index.tolist()\n",
    "\n",
    "\n",
    "def evaluate_using_important_feature(effect_size):\n",
    "    \"\"\"Evaluate the saved model on the feature subset chosen for `effect_size`.\n",
    "\n",
    "    The feature subset comes from `evaluate_for_effect_size`. The model is\n",
    "    re-loaded from `model_path` and passed to `evaluate_estimator` together with\n",
    "    the filtered `X` and the labels `Y`; a short report is then printed.\n",
    "\n",
    "    When no feature is selected, the evaluation is skipped and a notice is\n",
    "    printed instead, since there would be no columns to evaluate on.\n",
    "    \"\"\"\n",
    "    important_features = evaluate_for_effect_size(effect_size)\n",
    "\n",
    "    if not important_features:\n",
    "        # Guard: an empty selection would hand the estimator a zero-column frame.\n",
    "        print(\"\\033[1mEvaluation for effect size \\033[0m\", effect_size)\n",
    "        print(\"    >>> No features selected for this effect size; skipping evaluation.\")\n",
    "        print(\"========================================\")\n",
    "        return\n",
    "\n",
    "    filtered_model = load_model_from_file(model_path)\n",
    "    X_filtered = X[important_features]\n",
    "\n",
    "    metrics = evaluate_estimator(\n",
    "        filtered_model,\n",
    "        X_filtered,\n",
    "        Y,\n",
    "    )\n",
    "\n",
    "    print(\"\\033[1mEvaluation for effect size \\033[0m\", effect_size)\n",
    "    print(\n",
    "        \"    >>> \\033[1mSelected features for effect size\\033[0m \", important_features\n",
    "    )\n",
    "    print(\"    >>> \\033[1mSelected features count\\033[0m \", len(important_features))\n",
    "    print(\"    >>> \\033[1mEvaluation:\\033[0m \")\n",
    "    print(f\"        >>>> score =  {metrics['str']}\")\n",
    "    print(\"========================================\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "Zzfk6NIokS6u",
   "metadata": {
    "id": "Zzfk6NIokS6u"
   },
   "outputs": [],
   "source": [
    "# Evaluate performance for different feature subsets defined by effect size\n",
    "for effect_size in [0.5, 1.0, 1.5, 2.0]:\n",
    "    evaluate_using_important_feature(effect_size)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8f69a68",
   "metadata": {
    "id": "d8f69a68"
   },
   "source": [
    "## Congratulations!\n",
    "\n",
    "Congratulations on completing this notebook tutorial! If you enjoyed this and would like to learn more about machine learning and AI for medicine, you can do so in the following ways!\n",
    "\n",
    "### Star AutoPrognosis on GitHub\n",
    "\n",
    "The easiest way to help our community is to star the repo! This helps raise awareness of the tools we're building.\n",
    "\n",
    "- [Star AutoPrognosis](https://github.com/vanderschaarlab/autoprognosis)\n",
    "\n",
    "### Check out our website and paper for AutoPrognosis\n",
    "\n",
    "- [AutoPrognosis paper](https://arxiv.org/abs/2210.12090)\n",
    "- [AutoPrognosis website](https://www.autoprognosis.vanderschaar-lab.com/)\n",
    "\n",
    "### Learn more about our lab and other work\n",
    "\n",
    "- [van der Schaar lab website](https://www.vanderschaar-lab.com/)\n"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}