{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Demonstration of DataVizML\n",
    "\n",
    "This notebook will demonstrate the capabilities of the `DataVizML` library."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datavizml.singledistribution import SingleDistribution\n",
    "from datavizml.exploratorydataanalysis import ExploratoryDataAnalysis\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# seeded generator so the synthetic time stamps are reproducible across runs\n",
    "SEED = 42\n",
    "rng = np.random.default_rng(SEED)\n",
    "\n",
    "\n",
    "def random_timestamps(size, start=\"2020-01-01\", end=\"2023-12-31\"):\n",
    "    \"\"\"Draw `size` random timestamps between `start` and `end` (inclusive).\n",
    "\n",
    "    Timestamps fall on whole seconds. Offsets are sampled directly instead of\n",
    "    building a one-second `pd.date_range` over the window, which would hold\n",
    "    roughly 126 million entries just to pick a few hundred of them.\n",
    "    \"\"\"\n",
    "    n_seconds = int((pd.Timestamp(end) - pd.Timestamp(start)).total_seconds())\n",
    "    offsets = rng.integers(0, n_seconds + 1, size=size)\n",
    "    return np.datetime64(start, \"ns\") + offsets.astype(\"timedelta64[s]\")\n",
    "\n",
    "\n",
    "# binary classification dataset\n",
    "X_binary_classification, y_binary_classification = load_breast_cancer(\n",
    "    return_X_y=True, as_frame=True\n",
    ")\n",
    "# copy the slice so that adding a column below never writes to a view\n",
    "X_binary_classification = X_binary_classification.iloc[:, :8].copy()\n",
    "y_binary_classification = y_binary_classification.astype(bool)\n",
    "\n",
    "# regression dataset\n",
    "X_regression, y_regression = load_diabetes(return_X_y=True, as_frame=True)\n",
    "\n",
    "# create alternative target for classification with large class imbalance\n",
    "y_regression_class = y_regression > 50\n",
    "\n",
    "# add time series data field, shifted back 12 months where the target is True\n",
    "X_binary_classification[\"time stamp\"] = random_timestamps(\n",
    "    len(X_binary_classification)\n",
    ")\n",
    "X_binary_classification.loc[y_binary_classification, \"time stamp\"] = (\n",
    "    X_binary_classification.loc[y_binary_classification, \"time stamp\"]\n",
    "    - pd.DateOffset(months=12)\n",
    ")\n",
    "X_regression[\"time stamp\"] = random_timestamps(len(X_regression))\n",
    "X_regression.loc[y_regression_class, \"time stamp\"] = X_regression.loc[\n",
    "    y_regression_class, \"time stamp\"\n",
    "] - pd.DateOffset(months=12)\n",
    "\n",
    "# multiclass dataset, with integer labels mapped to their class names\n",
    "X_multiclass_classification, y_multiclass_classification = load_iris(\n",
    "    return_X_y=True, as_frame=True\n",
    ")\n",
    "class_map = dict(enumerate(load_iris()[\"target_names\"]))\n",
    "y_multiclass_classification = y_multiclass_classification.map(class_map)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with binary classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda for data and target\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_binary_classification,\n",
    "    target=y_binary_classification,\n",
    "    ncols=4,\n",
    "    figure_width=18,\n",
    "    axes_height=2.5,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda for the data only (no target supplied)\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_binary_classification,\n",
    "    ncols=4,\n",
    "    figure_width=18,\n",
    "    axes_height=2.5,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# initialise figure with enough rows to hold one axes per feature\n",
    "ncols = 5\n",
    "n_features = X_regression.shape[1]\n",
    "nrows = -(-n_features // ncols)  # ceiling division\n",
    "fig, ax_all = plt.subplots(\n",
    "    ncols=ncols, nrows=nrows, figsize=(18, 3 * nrows), squeeze=False\n",
    ")\n",
    "axes = ax_all.flatten()\n",
    "\n",
    "# loop through all features, plotting each on its own axes\n",
    "for (_, x), ax in zip(X_regression.items(), axes):\n",
    "    sd = SingleDistribution(feature=x, ax=ax, target=y_regression)\n",
    "    sd()\n",
    "\n",
    "# hide the axes left over when the grid has more slots than features\n",
    "for ax in axes[n_features:]:\n",
    "    ax.set_axis_off()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `sd` is the SingleDistribution of the last feature from the loop above\n",
    "sd.to_dict()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with multiclass classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda for data and target\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_multiclass_classification,\n",
    "    target=y_multiclass_classification,\n",
    "    ncols=4,\n",
    "    figure_width=18,\n",
    "    axes_height=4,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eda.summary()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda with the class labels joined on as a data column\n",
    "# (no separate target) and `prediction_matrix_full` switched on\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=pd.concat([X_multiclass_classification, y_multiclass_classification], axis=1),\n",
    "    ncols=5,\n",
    "    prediction_matrix_full=True,\n",
    "    figure_width=18,\n",
    "    axes_height=4,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# plot prediction heatmap\n",
    "fig, ax = plt.subplots()\n",
    "eda.prediction_score_plot(ax=ax)\n",
    "fig.tight_layout()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate with imbalanced binary classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda for data and the imbalanced target\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_regression,\n",
    "    target=y_regression_class,\n",
    "    ncols=4,\n",
    "    figure_width=18,\n",
    "    axes_height=4,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()\n",
    "\n",
    "# display prediction matrix\n",
    "eda.prediction_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# same data and target as above, this time with `target_rebalance` enabled\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=X_regression,\n",
    "    target=y_regression_class,\n",
    "    target_rebalance=True,\n",
    "    ncols=4,\n",
    "    figure_width=18,\n",
    "    axes_height=4,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()\n",
    "\n",
    "# display prediction matrix\n",
    "eda.prediction_matrix"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Demonstrate transformation options"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# each column is named after the transformation that maps it back to `raw`,\n",
    "# e.g. \"square\" holds sqrt(raw), so squaring that column recovers `raw`\n",
    "raw = list(range(1, 21)) * 100\n",
    "data_transform = pd.DataFrame(\n",
    "    {\n",
    "        \"raw\": raw,\n",
    "        \"square\": np.sqrt(raw),\n",
    "        \"square-root\": np.square(raw),\n",
    "        \"log-2\": np.exp2(raw),\n",
    "        \"exp-2\": np.log2(raw),\n",
    "    }\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda for the data only, leaving `data_deskew` at its default\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform,\n",
    "    ncols=5,\n",
    "    figure_width=18,\n",
    "    axes_height=3,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda with `data_deskew` given as an explicit list\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform,\n",
    "    ncols=5,\n",
    "    data_deskew=[\"square\", \"square-root\"],\n",
    "    figure_width=18,\n",
    "    axes_height=3,\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# create and run eda with `data_deskew=True` and `metric=\"count\"`\n",
    "eda = ExploratoryDataAnalysis(\n",
    "    data=data_transform,\n",
    "    ncols=5,\n",
    "    data_deskew=True,\n",
    "    figure_width=18,\n",
    "    axes_height=3,\n",
    "    metric=\"count\",\n",
    ")\n",
    "fig = eda()\n",
    "\n",
    "# set figure layout\n",
    "fig.tight_layout()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "eda.summary()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "2a52c5e09781ab6358926a8371c29f1aa8550e94d474bdf493a8eeac218c7138"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}