{
 "cells": [
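  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstration of DataVizML\n",
    "\n",
    "This notebook demonstrates the capabilities of the `DataVizML` library, using its `SingleDistribution` and `ExploratoryDataAnalysis` classes on binary classification, regression, multiclass classification and skewed example data."
   ]
  },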
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datavizml.singledistribution import SingleDistribution\n",
    "from datavizml.exploratorydataanalysis import ExploratoryDataAnalysis\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# binary classification dataset (first 8 feature columns only)\n",
    "X_binary_classification, y_binary_classification = load_breast_cancer(\n",
    "    return_X_y=True, as_frame=True\n",
    ")\n",
    "# take an explicit copy of the column subset so the time stamp column added\n",
    "# below is written to an independent frame, not to a slice of the loaded data\n",
    "X_binary_classification = X_binary_classification.iloc[:, :8].copy()\n",
    "y_binary_classification = y_binary_classification.astype(bool)\n",
    "\n",
    "# regression dataset\n",
    "X_regression, y_regression = load_diabetes(return_X_y=True, as_frame=True)\n",
    "\n",
    "# create alternative target for classification with large class imbalance\n",
    "y_regression_class = y_regression > 50\n",
    "\n",
    "# add time series data field\n",
    "# seed numpy's global generator so the sampled time stamps (and every plot\n",
    "# derived from them) are identical on each Restart & Run All\n",
    "np.random.seed(0)\n",
    "# one candidate time stamp per second; lowercase 's' is the current alias for\n",
    "# seconds (uppercase 'S' is deprecated and warns on pandas >= 2.2)\n",
    "date_range = pd.date_range(start=\"2020-01-01\", end=\"2023-12-31\", freq=\"s\")\n",
    "X_binary_classification[\"time stamp\"] = np.random.choice(\n",
    "    date_range, size=len(X_binary_classification)\n",
    ")\n",
    "# move the rows whose target is True 12 months earlier\n",
    "X_binary_classification.loc[y_binary_classification, \"time stamp\"] = (\n",
    "    X_binary_classification.loc[y_binary_classification, \"time stamp\"]\n",
    "    - pd.DateOffset(months=12)\n",
    ")\n",
    "X_regression[\"time stamp\"] = np.random.choice(date_range, size=len(X_regression))\n",
    "# same 12 month shift for the rows where the thresholded target is True\n",
    "X_regression.loc[y_regression_class, \"time stamp\"] = X_regression.loc[\n",
    "    y_regression_class, \"time stamp\"\n",
    "] - pd.DateOffset(months=12)\n",
    "\n",
    "# multiclass dataset\n",
    "X_multiclass_classification, y_multiclass_classification = load_iris(\n",
    "    return_X_y=True, as_frame=True\n",
    ")\n",
    "# replace the integer class codes with the corresponding entries of target_names\n",
    "class_map = dict(enumerate(load_iris()[\"target_names\"]))\n",
    "y_multiclass_classification = y_multiclass_classification.map(class_map)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with binary classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the eda for the binary classification features and their target\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_binary_classification, target=y_binary_classification,\n",
    "    ncols=4, figure_width=18, axes_height=2.5,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same features as the previous cell, but with no target passed to the eda\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_binary_classification, ncols=4, figure_width=18, axes_height=2.5\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialise figure\n",
    "ncols = 5\n",
    "n_features = X_regression.shape[1]\n",
    "nrows = -(-n_features // ncols)  # ceiling division: enough rows for every feature\n",
    "# squeeze=False always returns a 2D array of axes, so flatten() works for any\n",
    "# grid size (including a single axes)\n",
    "fig, ax_all = plt.subplots(\n",
    "    ncols=ncols, nrows=nrows, figsize=(18, 3 * nrows), squeeze=False\n",
    ")\n",
    "axes = ax_all.flatten()\n",
    "\n",
    "# loop through all features, giving each one its own axes\n",
    "for (_, x), ax in zip(X_regression.items(), axes):\n",
    "    sd = SingleDistribution(feature=x, ax=ax, target=y_regression)\n",
    "    sd()\n",
    "\n",
    "# hide the axes left over when the grid has more slots than features\n",
    "for unused_ax in axes[n_features:]:\n",
    "    unused_ax.set_axis_off()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `sd` is the last SingleDistribution created in the loop above; the value\n",
    "# returned by its to_dict method is displayed as the cell output\n",
    "sd.to_dict()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with multiclass classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the eda for the multiclass features and their named-class target\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_multiclass_classification, target=y_multiclass_classification,\n",
    "    ncols=4, figure_width=18, axes_height=4,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# call the summary method of the eda built in the previous cell; whatever it\n",
    "# returns is displayed as the cell output\n",
    "eda.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# no separate target here: the class labels are joined on as an extra column\n",
    "# of the data, and prediction_matrix_full=True is passed to the eda\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=pd.concat([X_multiclass_classification, y_multiclass_classification], axis=1),\n",
    "    prediction_matrix_full=True,\n",
    "    ncols=5, figure_width=18, axes_height=4,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# new single-axes figure to hold the prediction heatmap\n",
    "fig, ax = plt.subplots(nrows=1, ncols=1)\n",
    "\n",
    "# draw the heatmap for the eda from the previous cell onto that axes\n",
    "eda.prediction_score_plot(ax=ax)\n",
    "\n",
    "fig.tight_layout()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with imbalanced binary classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the eda for the diabetes features against the imbalanced boolean\n",
    "# target (y_regression > 50)\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_regression, target=y_regression_class,\n",
    "    ncols=4, figure_width=18, axes_height=4,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()\n",
    "\n",
    "# last expression of the cell, so the prediction matrix is displayed\n",
    "eda.prediction_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same data and imbalanced target as the previous cell, this time with\n",
    "# target_rebalance=True\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_regression, target=y_regression_class,\n",
    "    target_rebalance=True,\n",
    "    ncols=4, figure_width=18, axes_height=4,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()\n",
    "\n",
    "# last expression of the cell, so the prediction matrix is displayed\n",
    "eda.prediction_matrix"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate transformation options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# the integers 1..20 repeated 100 times (2000 evenly distributed values)\n",
    "raw = list(range(1, 21)) * 100\n",
    "\n",
    "# NOTE(review): each derived column looks to be named after the transform that\n",
    "# maps it back to `raw` (e.g. squaring the `square` column recovers `raw`),\n",
    "# not after the numpy function used to build it - confirm this is intended\n",
    "data_transform = pd.DataFrame(\n",
    "    data={\n",
    "        \"raw\": raw,\n",
    "        \"square\": np.sqrt(raw),  # square this column to get raw back\n",
    "        \"square-root\": np.square(raw),  # square-root this column to get raw back\n",
    "        \"log-2\": np.exp2(raw),  # take log2 of this column to get raw back\n",
    "        \"exp-2\": np.log2(raw),  # raise 2 to this column to get raw back\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# build the eda for the transformed columns as they are: no target and no\n",
    "# data_deskew argument\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform, ncols=5, figure_width=18, axes_height=3\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same data with data_deskew given two column names (presumably only those\n",
    "# columns are deskewed - confirm against the library documentation)\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform,\n",
    "    data_deskew=[\"square\", \"square-root\"],\n",
    "    ncols=5, figure_width=18, axes_height=3,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same data with data_deskew=True and the metric set to \"count\"\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform,\n",
    "    data_deskew=True,\n",
    "    metric=\"count\",\n",
    "    ncols=5, figure_width=18, axes_height=3,\n",
    ")\n",
    "\n",
    "# calling the eda object runs it and returns the figure; then tidy the layout\n",
    "fig = eda()\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# call the summary method of the eda built in the previous cell (data_deskew=True,\n",
    "# metric=\"count\"); whatever it returns is displayed as the cell output\n",
    "eda.summary()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "2a52c5e09781ab6358926a8371c29f1aa8550e94d474bdf493a8eeac218c7138"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}