Demonstration of DataVizML

This notebook demonstrates the capabilities of the DataVizML library.

Import libraries

from datavizml.singledistribution import SingleDistribution
from datavizml.exploratorydataanalysis import ExploratoryDataAnalysis
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris

Load data

# binary classification dataset
X_binary_classification, y_binary_classification = load_breast_cancer(
    return_X_y=True, as_frame=True
)

# keep the first eight features; take an explicit copy so that columns added
# later are written to an independent frame rather than to a slice of the
# loaded data (avoids relying on pandas' copy-vs-view heuristics)
X_binary_classification = X_binary_classification.iloc[:, :8].copy()

# cast the target to boolean so it can also be used as a row mask
y_binary_classification = y_binary_classification.astype(bool)

# regression dataset, loaded as pandas objects
X_regression, y_regression = load_diabetes(as_frame=True, return_X_y=True)

# boolean alternative target for classification with large class imbalance:
# true wherever the continuous target is above 50
y_regression_class = y_regression.gt(50)

# add time series data field
# Random time stamps are drawn as whole-second offsets from the start of the
# window. This avoids materialising every second between 2020-01-01 and
# 2023-12-31 (more than 10**8 values) just to sample a few hundred of them,
# and it avoids the deprecated "S" frequency alias.
rng = np.random.default_rng(seed=0)  # seeded so that reruns draw the same values
window_start = pd.Timestamp("2020-01-01")
window_seconds = int((pd.Timestamp("2023-12-31") - window_start).total_seconds())

for features, positive_rows in (
    (X_binary_classification, y_binary_classification),
    (X_regression, y_regression_class),
):
    # endpoint=True keeps the final second of the window selectable
    second_offsets = rng.integers(
        0, window_seconds, size=len(features), endpoint=True
    )
    features["time stamp"] = (
        window_start + pd.to_timedelta(second_offsets, unit="s")
    ).to_numpy(dtype="datetime64[ns]")

    # move the rows whose target is true back by twelve months
    # (presumably so that the time stamp carries information about the target)
    features.loc[positive_rows, "time stamp"] = features.loc[
        positive_rows, "time stamp"
    ] - pd.DateOffset(months=12)

# multiclass dataset
# load the dataset once and take features, target and class names from the
# same bundle instead of loading it a second time for the names
iris = load_iris(as_frame=True)
X_multiclass_classification = iris["data"]

# replace the integer class codes with their names
class_map = dict(enumerate(iris["target_names"]))
y_multiclass_classification = iris["target"].map(class_map)

Demonstrate with binary classification

# build the analysis for the binary classification features and boolean target
eda = ExploratoryDataAnalysis(
    target=y_binary_classification,
    data=X_binary_classification,
    axes_height=2.5,
    figure_width=18,
    ncols=4,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

/tmp/ipykernel_637/2607751757.py:12: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.tight_layout()

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/events.py:82: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  func(*args, **kwargs)

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/pylabtools.py:170: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.canvas.print_figure(bytes_io, **kw)

_images/83e0bc47e85eddd5634f07c2c5f4af5d81f53b2943915dbbd36982b0ec6c24cf.png

# build the analysis for the same features, this time without a target
eda = ExploratoryDataAnalysis(
    data=X_binary_classification,
    axes_height=2.5,
    figure_width=18,
    ncols=4,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

/tmp/ipykernel_637/1227952141.py:11: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.tight_layout()

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/events.py:82: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  func(*args, **kwargs)

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/IPython/core/pylabtools.py:170: UserWarning: AutoDateLocator was unable to pick an appropriate interval for this date range. It may be necessary to add an interval value to the AutoDateLocator's intervald dictionary. Defaulting to 6.
  fig.canvas.print_figure(bytes_io, **kw)

_images/2c055d83ec93bd3e2f3439668cf840244e468c4c8646933290187d843926a2e2.png

Demonstrate with regression

# initialise figure with enough rows to give every feature its own axes
ncols = 5
n_features = X_regression.shape[1]
nrows = -(-n_features // ncols)  # ceiling division
# squeeze=False guarantees a 2D array of axes, even for a 1x1 grid
fig, ax_all = plt.subplots(
    ncols=ncols, nrows=nrows, figsize=(18, 3 * nrows), squeeze=False
)
axes_flat = ax_all.flatten()

# loop through all features, drawing each one against the regression target
for (_, x), ax in zip(X_regression.items(), axes_flat):
    sd = SingleDistribution(feature=x, ax=ax, target=y_regression)
    sd()

# hide the trailing axes that have no feature to display
for unused_ax in axes_flat[n_features:]:
    unused_ax.set_visible(False)

# set figure layout
fig.tight_layout()

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

_images/9c6e1778573278c03c32c4a2a0ec163b9691ce3238942236a25cc0596f540ca4.png

# show the summary of the last SingleDistribution created by the loop
# (the final feature that was iterated) as a dictionary
sd.to_dict()

{'feature_name': 'time stamp',
 'feature_dtype': dtype('<M8[ns]'),
 'feature_score': 0.033686967797483,
 'feature_score_type': 'Inter-decile skew',
 'feature_transform': None,
 'feature_nunique': 442,
 'feature_missing_proportion': 0,
 'target_name': 'target',
 'target_dtype': Int64Dtype(),
 'target_score': 0,
 'target_score_type': 'PPS'}

Demonstrate with multiclass classification

# build the analysis for the iris features and the named class target
eda = ExploratoryDataAnalysis(
    target=y_multiclass_classification,
    data=X_multiclass_classification,
    axes_height=4,
    figure_width=18,
    ncols=4,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

_images/f1019f918e024c0c83b75c65b1aa5df5cec9fb0d577ec4e1248036d3b750e4e7.png

# display the per-feature summary table for the analysis that was just run
eda.summary()

	feature_name	feature_dtype	feature_score	feature_score_type	feature_transform	feature_nunique	target_name	target_dtype	target_score	target_score_type
0	sepal length (cm)	Float64	0.047619	Inter-decile skew	None	35	target	string	0.471649	PPS
1	sepal width (cm)	Float64	0.099099	Inter-decile skew	None	23	target	string	0.156915	PPS
2	petal length (cm)	Float64	0.340909	Inter-decile skew	None	43	target	string	0.884812	PPS
3	petal width (cm)	Float64	0.100000	Inter-decile skew	None	22	target	string	0.927652	PPS

# build the analysis on the features with the class labels appended as an
# extra column; no separate target is passed and the full prediction matrix
# is requested
eda = ExploratoryDataAnalysis(
    data=pd.concat(
        [X_multiclass_classification, y_multiclass_classification], axis="columns"
    ),
    prediction_matrix_full=True,
    axes_height=4,
    figure_width=18,
    ncols=5,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

_images/7ea90e95c97ab01b31b065b9a3ccfd5d4cae1ae1004372fb04e70b6f164cef44.png

# draw the prediction score heatmap on a fresh single-axes figure
fig, ax = plt.subplots(nrows=1, ncols=1)
eda.prediction_score_plot(ax=ax)

# tidy the figure layout
fig.tight_layout()

_images/bc80e1a0e45b39aa90acff2cf0c651226ddec64eef93358ebe2f29510f2e5195.png

Demonstrate with imbalanced binary classification

# build the analysis for the diabetes features and the imbalanced boolean target
eda = ExploratoryDataAnalysis(
    target=y_regression_class,
    data=X_regression,
    axes_height=4,
    figure_width=18,
    ncols=4,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

# show the prediction matrix as the cell output
eda.prediction_matrix

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

	x	y	ppscore	case	is_valid_score	metric	baseline_score	model_score	model
0	age	target	0.000000	classification	True	weighted F1	0.929311	0.927036	DecisionTreeClassifier()
1	sex	target	0.000248	classification	True	weighted F1	0.929311	0.929329	DecisionTreeClassifier()
2	bmi	target	0.000000	classification	True	weighted F1	0.929311	0.923644	DecisionTreeClassifier()
3	bp	target	0.000000	classification	True	weighted F1	0.929311	0.925905	DecisionTreeClassifier()
4	s1	target	0.000000	classification	True	weighted F1	0.929311	0.907689	DecisionTreeClassifier()
5	s2	target	0.000000	classification	True	weighted F1	0.929311	0.911806	DecisionTreeClassifier()
6	s3	target	0.000000	classification	True	weighted F1	0.929311	0.921330	DecisionTreeClassifier()
7	s4	target	0.000000	classification	True	weighted F1	0.929311	0.928198	DecisionTreeClassifier()
8	s5	target	0.000000	classification	True	weighted F1	0.929311	0.910487	DecisionTreeClassifier()
9	s6	target	0.000248	classification	True	weighted F1	0.929311	0.929329	DecisionTreeClassifier()
10	time stamp	target	0.150660	classification	True	weighted F1	0.929311	0.939961	DecisionTreeClassifier()

_images/12dfbc07e7230944bac8e0a39f830689646563b19b0e795a6bd574cb748d2822.png

# build the analysis for the same features and imbalanced target, this time
# with target rebalancing switched on
eda = ExploratoryDataAnalysis(
    target=y_regression_class,
    target_rebalance=True,
    data=X_regression,
    axes_height=4,
    figure_width=18,
    ncols=4,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

# show the prediction matrix as the cell output
eda.prediction_matrix

/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:908: RuntimeWarning: invalid value encountered in cast
  base = data.astype(np.int64)
/home/docs/checkouts/readthedocs.org/user_builds/datavizml/envs/latest/lib/python3.11/site-packages/pandas/core/arrays/timedeltas.py:912: RuntimeWarning: invalid value encountered in cast
  data = (base * m + (frac * m).astype(np.int64)).view("timedelta64[ns]")

	x	y	ppscore	case	is_valid_score	metric	baseline_score	model_score	model
0	age	target	0.616447	classification	True	weighted F1	0.505351	0.810276	DecisionTreeClassifier()
1	sex	target	0.027742	classification	True	weighted F1	0.505351	0.519073	DecisionTreeClassifier()
2	bmi	target	0.827911	classification	True	weighted F1	0.505351	0.914876	DecisionTreeClassifier()
3	bp	target	0.644764	classification	True	weighted F1	0.505351	0.824283	DecisionTreeClassifier()
4	s1	target	0.820698	classification	True	weighted F1	0.505351	0.911308	DecisionTreeClassifier()
5	s2	target	0.920560	classification	True	weighted F1	0.505351	0.960705	DecisionTreeClassifier()
6	s3	target	0.661226	classification	True	weighted F1	0.505351	0.832426	DecisionTreeClassifier()
7	s4	target	0.553270	classification	True	weighted F1	0.505351	0.779025	DecisionTreeClassifier()
8	s5	target	0.879351	classification	True	weighted F1	0.505351	0.940321	DecisionTreeClassifier()
9	s6	target	0.534821	classification	True	weighted F1	0.505351	0.769900	DecisionTreeClassifier()
10	time stamp	target	0.968737	classification	True	weighted F1	0.505351	0.984536	DecisionTreeClassifier()

_images/de5f7d8c1df0f2dd267059a4f6c337f691d806fcefcc78a9abb37a6c8cf5a6cf.png

Demonstrate transformation options

# base values: the integers 1 to 20, with the whole run repeated 100 times
raw = list(range(1, 21)) * 100

# each derived column is named after the transform that maps it back to the
# base values, so it holds the inverse of that transform applied to "raw"
inverse_transforms = {
    "square": np.sqrt,
    "square-root": np.square,
    "log-2": np.exp2,
    "exp-2": np.log2,
}
data_transform = pd.DataFrame(
    {"raw": raw, **{name: func(raw) for name, func in inverse_transforms.items()}}
)

# build the analysis for the transformed columns with no target
eda = ExploratoryDataAnalysis(
    data=data_transform,
    axes_height=3,
    figure_width=18,
    ncols=5,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

_images/03ba58fa391530a2e9f5c07304f331b0f19b1f3e5f3177cbe348d47c1bc9258c.png

# build the analysis for the transformed columns with no target, passing an
# explicit list of deskew options
eda = ExploratoryDataAnalysis(
    data=data_transform,
    data_deskew=["square", "square-root"],
    axes_height=3,
    figure_width=18,
    ncols=5,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

_images/9081468bb8a59c4adbc92ced68ebe6d4017c8177c54df2723276f90932c22f7f.png

# build the analysis for the transformed columns with no target, with
# data_deskew enabled and the metric set to "count"
eda = ExploratoryDataAnalysis(
    data=data_transform,
    data_deskew=True,
    metric="count",
    axes_height=3,
    figure_width=18,
    ncols=5,
)

# run the analysis and keep the returned figure
fig = eda()

# tidy the spacing between the subplots
fig.tight_layout()

_images/b52343f0f1f88f2a36022cc8b94dc5e877f39e61f2831074d0ae0759a7155ca9.png

# display the per-feature summary table for the analysis that was just run
eda.summary()

	feature_name	feature_dtype	feature_score	feature_score_type	feature_transform	feature_nunique	target_name	target_dtype	target_score	target_score_type
0	raw	Int64	9.349247e-15	Inter-decile skew	None	20	None	None	NaN	N/A
1	square	Float64	9.115515e-15	Inter-decile skew	square	20	None	None	NaN	N/A
2	square-root	Int64	9.349247e-15	Inter-decile skew	square-root	20	None	None	NaN	N/A
3	log-2	Int64	9.349247e-15	Inter-decile skew	log-2	20	None	None	NaN	N/A
4	exp-2	Float64	9.576338e-15	Inter-decile skew	exp-2	20	None	None	NaN	N/A