Demonstration of DataVizML

This notebook will demonstrate the capabilities of the DataVizML library

Import libraries

from datavizml.singledistribution import SingleDistribution
from datavizml.exploratorydataanalysis import ExploratoryDataAnalysis
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris

Load data

# binary classification dataset
X_binary_classification, y_binary_classification = load_breast_cancer(
    return_X_y=True, as_frame=True
)
X_binary_classification = X_binary_classification.iloc[:, :8]
y_binary_classification = y_binary_classification.astype(bool)

# regression dataset
X_regression, y_regression = load_diabetes(return_X_y=True, as_frame=True)

# create alternative target for classification with large class imbalance
y_regression_class = y_regression > 50

# add time series data field
date_range = pd.date_range(start="2020-01-01", end="2023-12-31", freq="S")
X_binary_classification["time stamp"] = np.random.choice(
    date_range, size=len(X_binary_classification)
)
X_binary_classification.loc[y_binary_classification, "time stamp"] = (
    X_binary_classification.loc[y_binary_classification, "time stamp"]
    - pd.DateOffset(months=12)
)
X_regression["time stamp"] = np.random.choice(date_range, size=len(X_regression))
X_regression.loc[y_regression_class, "time stamp"] = X_regression.loc[
    y_regression_class, "time stamp"
] - pd.DateOffset(months=12)

# multiclass dataset
X_multiclass_classification, y_multiclass_classification = load_iris(
    return_X_y=True, as_frame=True
)
class_map = {k: v for k, v in enumerate(load_iris()["target_names"])}
y_multiclass_classification = y_multiclass_classification.map(class_map)

Demonstrate with binary classification

# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=X_binary_classification,
    target=y_binary_classification,
    ncols=4,
    figure_width=18,
    axes_height=2.5,
)
fig = eda()

# set figure layout
fig.tight_layout()
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
/tmp/ipykernel_637/2607751757.py:12: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.tight_layout()
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/events.py:82: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  func(*args, **kwargs)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/pylabtools.py:170: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.canvas.print_figure(bytes_io, **kw)
_images/83e0bc47e85eddd5634f07c2c5f4af5d81f53b2943915dbbd36982b0ec6c24cf.png
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=X_binary_classification,
    ncols=4,
    figure_width=18,
    axes_height=2.5,
)
fig = eda()

# set figure layout
fig.tight_layout()
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
/tmp/ipykernel_637/1227952141.py:11: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.tight_layout()
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/events.py:82: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  func(*args, **kwargs)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/pylabtools.py:170: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.canvas.print_figure(bytes_io, **kw)
_images/2c055d83ec93bd3e2f3439668cf840244e468c4c8646933290187d843926a2e2.png

Demonstrate with regression

# initialise figure
ncols = 5
nrows = -(-(X_regression.shape[1]) // ncols)
fig, ax_all = plt.subplots(ncols=ncols, nrows=nrows, figsize=(18, 3 * nrows))

# loop though all features as an array
for (_, x), ax in zip(X_regression.items(), ax_all.flatten()):
    sd = SingleDistribution(feature=x, ax=ax, target=y_regression)
    sd()

# set figure layout
fig.tight_layout()
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
_images/9c6e1778573278c03c32c4a2a0ec163b9691ce3238942236a25cc0596f540ca4.png
sd.to_dict()
{'feature_name': 'time stamp',
 'feature_dtype': dtype('<M8[ns]'),
 'feature_score': 0.033686967797483,
 'feature_score_type': 'Inter-decile skew',
 'feature_transform': None,
 'feature_nunique': 442,
 'feature_missing_proportion': 0,
 'target_name': 'target',
 'target_dtype': Int64Dtype(),
 'target_score': 0,
 'target_score_type': 'PPS'}

Demonstrate with multiclass classification

# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=X_multiclass_classification,
    target=y_multiclass_classification,
    ncols=4,
    figure_width=18,
    axes_height=4,
)
fig = eda()

# set figure layout
fig.tight_layout()
_images/f1019f918e024c0c83b75c65b1aa5df5cec9fb0d577ec4e1248036d3b750e4e7.png
eda.summary()
feature_name feature_dtype feature_score feature_score_type feature_transform feature_nunique feature_missing_proportion target_name target_dtype target_score target_score_type
0 sepal length (cm) Float64 0.047619 Inter-decile skew None 35 0 target string 0.471649 PPS
1 sepal width (cm) Float64 0.099099 Inter-decile skew None 23 0 target string 0.156915 PPS
2 petal length (cm) Float64 0.340909 Inter-decile skew None 43 0 target string 0.884812 PPS
3 petal width (cm) Float64 0.100000 Inter-decile skew None 22 0 target string 0.927652 PPS
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=pd.concat([X_multiclass_classification, y_multiclass_classification], axis=1),
    ncols=5,
    prediction_matrix_full=True,
    figure_width=18,
    axes_height=4,
)
fig = eda()

# set figure layout
fig.tight_layout()
_images/7ea90e95c97ab01b31b065b9a3ccfd5d4cae1ae1004372fb04e70b6f164cef44.png
# plot prediction heatmap
fig, ax = plt.subplots()
eda.prediction_score_plot(ax=ax)
fig.tight_layout()
_images/bc80e1a0e45b39aa90acff2cf0c651226ddec64eef93358ebe2f29510f2e5195.png

Demonstrate with imbalanced binary classification

# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=X_regression,
    target=y_regression_class,
    ncols=4,
    figure_width=18,
    axes_height=4,
)
fig = eda()

# set figure layout
fig.tight_layout()

# display prediction matrix
eda.prediction_matrix
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
x y ppscore case is_valid_score metric baseline_score model_score model
0 age target 0.000000 classification True weighted F1 0.929311 0.927036 DecisionTreeClassifier()
1 sex target 0.000248 classification True weighted F1 0.929311 0.929329 DecisionTreeClassifier()
2 bmi target 0.000000 classification True weighted F1 0.929311 0.923644 DecisionTreeClassifier()
3 bp target 0.000000 classification True weighted F1 0.929311 0.925905 DecisionTreeClassifier()
4 s1 target 0.000000 classification True weighted F1 0.929311 0.907689 DecisionTreeClassifier()
5 s2 target 0.000000 classification True weighted F1 0.929311 0.911806 DecisionTreeClassifier()
6 s3 target 0.000000 classification True weighted F1 0.929311 0.921330 DecisionTreeClassifier()
7 s4 target 0.000000 classification True weighted F1 0.929311 0.928198 DecisionTreeClassifier()
8 s5 target 0.000000 classification True weighted F1 0.929311 0.910487 DecisionTreeClassifier()
9 s6 target 0.000248 classification True weighted F1 0.929311 0.929329 DecisionTreeClassifier()
10 time stamp target 0.150660 classification True weighted F1 0.929311 0.939961 DecisionTreeClassifier()
_images/12dfbc07e7230944bac8e0a39f830689646563b19b0e795a6bd574cb748d2822.png
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=X_regression,
    target=y_regression_class,
    target_rebalance=True,
    ncols=4,
    figure_width=18,
    axes_height=4,
)
fig = eda()

# set figure layout
fig.tight_layout()

# display prediction matrix
eda.prediction_matrix
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")
x y ppscore case is_valid_score metric baseline_score model_score model
0 age target 0.616447 classification True weighted F1 0.505351 0.810276 DecisionTreeClassifier()
1 sex target 0.027742 classification True weighted F1 0.505351 0.519073 DecisionTreeClassifier()
2 bmi target 0.827911 classification True weighted F1 0.505351 0.914876 DecisionTreeClassifier()
3 bp target 0.644764 classification True weighted F1 0.505351 0.824283 DecisionTreeClassifier()
4 s1 target 0.820698 classification True weighted F1 0.505351 0.911308 DecisionTreeClassifier()
5 s2 target 0.920560 classification True weighted F1 0.505351 0.960705 DecisionTreeClassifier()
6 s3 target 0.661226 classification True weighted F1 0.505351 0.832426 DecisionTreeClassifier()
7 s4 target 0.553270 classification True weighted F1 0.505351 0.779025 DecisionTreeClassifier()
8 s5 target 0.879351 classification True weighted F1 0.505351 0.940321 DecisionTreeClassifier()
9 s6 target 0.534821 classification True weighted F1 0.505351 0.769900 DecisionTreeClassifier()
10 time stamp target 0.968737 classification True weighted F1 0.505351 0.984536 DecisionTreeClassifier()
_images/de5f7d8c1df0f2dd267059a4f6c337f691d806fcefcc78a9abb37a6c8cf5a6cf.png

Demonstrate transformation options

raw = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20] * 100
data_transform = pd.DataFrame(
    {
        "raw": raw,
        "square": np.sqrt(raw),
        "square-root": np.square(raw),
        "log-2": np.exp2(raw),
        "exp-2": np.log2(raw),
    }
)
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=data_transform,
    ncols=5,
    figure_width=18,
    axes_height=3,
)
fig = eda()

# set figure layout
fig.tight_layout()
_images/03ba58fa391530a2e9f5c07304f331b0f19b1f3e5f3177cbe348d47c1bc9258c.png
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=data_transform,
    ncols=5,
    data_deskew=["square", "square-root"],
    figure_width=18,
    axes_height=3,
)
fig = eda()

# set figure layout
fig.tight_layout()
_images/9081468bb8a59c4adbc92ced68ebe6d4017c8177c54df2723276f90932c22f7f.png
# create and run eda for data and target
eda = ExploratoryDataAnalysis(
    data=data_transform,
    ncols=5,
    data_deskew=True,
    figure_width=18,
    axes_height=3,
    metric="count",
)
fig = eda()

# set figure layout
fig.tight_layout()
_images/b52343f0f1f88f2a36022cc8b94dc5e877f39e61f2831074d0ae0759a7155ca9.png
eda.summary()
feature_name feature_dtype feature_score feature_score_type feature_transform feature_nunique feature_missing_proportion target_name target_dtype target_score target_score_type
0 raw Int64 9.349247e-15 Inter-decile skew None 20 0 None None NaN N/A
1 square Float64 9.115515e-15 Inter-decile skew square 20 0 None None NaN N/A
2 square-root Int64 9.349247e-15 Inter-decile skew square-root 20 0 None None NaN N/A
3 log-2 Int64 9.349247e-15 Inter-decile skew log-2 20 0 None None NaN N/A
4 exp-2 Float64 9.576338e-15 Inter-decile skew exp-2 20 0 None None NaN N/A