# Source code for datavizml.exploratorydataanalysis

from typing import Any, Dict, Optional, Union

import matplotlib
import numpy as np
import pandas as pd
import ppscore as pps
import seaborn as sns
from matplotlib import pyplot as plt

from datavizml import singledistribution as sd
from datavizml import utils


class ExploratoryDataAnalysis:
    """A graphical summary of all given features and their relationship to a target

    :param data: Features to be analysed
    :type data: pandas Series or pandas DataFrame
    :param ncols: Number of columns to use in figure, must be at least 1
    :type ncols: int
    :param data_deskew: Reduce data skew, trialling: squaring, rooting, logging, exponents and Yeo-Johnson
    :type data_deskew: bool for all features or string or list of string for selective features, optional
    :param target: Target to be predicted
    :type target: pandas Series, optional
    :param target_rebalance: Rebalance target
    :type target_rebalance: bool, optional
    :param metric: Metric used for prevalence, "count" or "prop" (default)
    :type metric: string, optional
    :param prediction_matrix_full: Full or reduced prediction matrix
    :type prediction_matrix_full: bool, optional
    :param figure_width: Width of figure
    :type figure_width: int or float, optional
    :param axes_height: Height of axes
    :type axes_height: int or float, optional
    :raises ValueError: If ``ncols`` is less than 1, or if the features and the target have a different number of rows
    """

    FIGURE_WIDTH = 18  # default width of figure
    AXES_HEIGHT = 3  # default height of each row of axes

    def __init__(
        self,
        data: Any,
        ncols: int,
        data_deskew: Union[bool, list, str] = False,
        target: Optional[Any] = None,
        target_rebalance: bool = False,
        metric: str = "prop",
        prediction_matrix_full: bool = False,
        figure_width: Union[int, float] = FIGURE_WIDTH,
        axes_height: Union[int, float] = AXES_HEIGHT,
    ) -> None:
        """Constructor method"""
        # fail fast: a non-positive column count would otherwise surface later as
        # a ZeroDivisionError or as an invalid subplot grid
        if ncols < 1:
            raise ValueError(f"ncols must be at least 1 but {ncols} was given")

        # input variables
        self.data = data
        self.__data_deskew = data_deskew
        # the target setter reads this flag, so it must be set before the target
        self.__has_target = target is not None
        if self.__has_target:
            self.target = target
            self.__target_rebalance = target_rebalance
        self.__ncols = ncols
        self.__prediction_matrix_full = prediction_matrix_full
        self.__figure_width = figure_width
        self.__axes_height = axes_height
        self.__metric = metric

        # check input
        if self.__has_target and self.data.shape[0] != self.target.shape[0]:
            raise ValueError(
                f"Dimension mismatch, features have {self.data.shape[0]} elements but the target has {self.target.shape[0]}"
            )

        # calculate general use variables
        # ceiling division: enough rows to give every feature its own axes
        self.__nrows = -(-(self.data.shape[1]) // self.__ncols)

        # classify inputs (the last element returned by classify_type is the dtype)
        self.__data_dtypes = {
            utils.classify_type(x)[3] for _, x in self.data.items()
        }
        if self.__has_target:
            (
                self.__target_is_bool,
                self.__target_is_numeric,
                _,
                self.__target_dtype,
            ) = utils.classify_type(self.target)
            # a numeric target that is not boolean is treated as a regression problem
            if self.__target_is_numeric and not self.__target_is_bool:
                self.__target_type = "regression"
            else:
                self.__target_type = "classification"

        # initialise figure and axes
        self.__init_figure()

        # calculate prediction matrix
        self.__calculate_prediction_matrix()

        # initialise a distribution plot for each feature
        self.__init_single_distributions()

    def __str__(self) -> str:
        """Returns a string representation of the instance

        :return: A string containing: feature names and data types; and target name and data type if a target was provided
        :rtype: str
        """
        # conditional strings
        feature_names = ", ".join(self.data.columns)
        feature_dtypes = ", ".join(sorted(str(x) for x in self.__data_dtypes))
        target_val = (
            f"{self.target.name} ({self.__target_dtype})"
            if self.__has_target
            else "no target provided"
        )

        # attribute related strings
        feature_str = f"features: {feature_names} ({feature_dtypes})"
        target_str = f"target: {target_val}"

        return "\n".join([feature_str, target_str])

    def __getitem__(self, ind: int) -> "sd.SingleDistribution":
        """Get the distribution plot at the given index

        :param ind: The index of the distribution plot to retrieve
        :type ind: int

        :return: The SingleDistribution object at the given index
        :rtype: SingleDistribution
        :raises IndexError: If the index is out of range
        """
        return self.single_distributions[ind]

    def __call__(self) -> "matplotlib.figure.Figure":
        """Generates and decorates the plots for each feature

        :return: A figure with the plots for each feature
        :rtype: matplotlib.figure.Figure
        """
        # call the plot for each object
        for plot in self.single_distributions:
            plot()

        return self.fig

    # initialise figure
    def __init_figure(self) -> None:
        """Initialise a figure with the required size and axes for the exploratory data analysis"""
        # create figure of required size with the required axes
        figsize = (self.__figure_width, self.__axes_height * self.__nrows)
        # squeeze=False keeps the axes two dimensional even for a single row or column
        fig, ax = plt.subplots(
            nrows=self.__nrows, ncols=self.__ncols, squeeze=False, figsize=figsize
        )

        # assign to object
        self.fig: "matplotlib.figure.Figure" = fig
        self.ax = ax

    # calculate prediction matrix
    def __calculate_prediction_matrix(self) -> None:
        """Calculate prediction matrix for specified combinations of features/targets"""
        # combine feature and target
        if self.__has_target:
            # rebalance classes
            if self.__target_type == "classification" and self.__target_rebalance:
                x_balanced, y_balanced = utils.class_rebalance(self.data, self.target)
                df = pd.concat([x_balanced, y_balanced], axis=1)
            else:
                df = pd.concat([self.data, self.target], axis=1)
        else:
            df = self.data

        if self.__prediction_matrix_full:
            # calculate full matrix
            self.__prediction_matrix = pps.matrix(
                df=df,
                sample=None,
                invalid_score=np.nan,
            )
        elif self.__has_target:
            # calculate reduced matrix: every feature against the target only
            self.__prediction_matrix = pps.predictors(
                df=df,
                y=self.target.name,
                sorted=False,
                sample=None,
                invalid_score=np.nan,
            )
        else:
            # a reduced matrix needs a target to predict
            self.__prediction_matrix = None

    # decide whether a feature is deskewed
    def __feature_deskew(self, name: Any) -> bool:
        """Decide whether the named feature should have its skew reduced

        :param name: Name of the feature
        :return: True if the feature was selected for deskewing
        :rtype: bool
        """
        deskew = self.__data_deskew
        if isinstance(deskew, (bool, np.bool_)):
            # a flag applies to every feature
            return bool(deskew)
        if isinstance(deskew, str):
            # a bare string names exactly one feature; a membership test against
            # the string itself would also match any substring of it
            return bool(name == deskew)
        return bool(name in deskew)

    # initialise distribution plot
    def __init_single_distributions(self) -> None:
        """Initialise a single distribution object for each feature"""
        # the score lookup table is identical for every feature so build it once
        if self.__has_target:
            target = self.target
            scores = self.prediction_matrix.pivot(
                index="x", columns="y", values="ppscore"
            )
        else:
            target = None
            scores = None

        # initialise all single distribution objects, one per axes in reading order
        self.single_distributions = []
        for (_, feature), ax in zip(self.data.items(), self.ax.flatten()):
            target_score = (
                scores.loc[feature.name, target.name] if scores is not None else None
            )
            self.single_distributions.append(
                sd.SingleDistribution(
                    feature=feature,
                    ax=ax,
                    feature_deskew=self.__feature_deskew(feature.name),
                    target=target,
                    target_score=target_score,
                    metric=self.__metric,
                )
            )

    # create summary dataframe
    def summary(self) -> pd.DataFrame:
        """Summarise analysis

        :return: A dataframe with one row per feature, built from the dictionary form of each distribution
        :rtype: pd.DataFrame
        """
        rows = [dist.to_dict() for dist in self.single_distributions]
        return pd.DataFrame(data=rows)

    # create prediction power plot
    def prediction_score_plot(
        self, ax: "matplotlib.axes.Axes"
    ) -> "matplotlib.axes.Axes":
        """Plot the prediction scores as a heatmap

        :param ax: Axes to plot on
        :type ax: matplotlib Axes

        :return: The heatmap plot
        :rtype: matplotlib Axes
        :raises TypeError: If no prediction matrix was calculated
        """
        matrix = self.prediction_matrix
        if matrix is None:
            raise TypeError(
                "No appropriate matrix is present. This most likely is because a reduced dataframe was calculated with no target"
            )

        # extract data and plot heatmap
        labelled = matrix.rename(
            columns={"x": "x (predictor)", "y": "y (predictee)"}
        )
        scores = labelled.pivot(
            index="x (predictor)", columns="y (predictee)", values="ppscore"
        )
        sns.heatmap(
            data=scores,
            vmin=0,
            vmax=1,
            cmap="GnBu",
            annot=True,
            fmt=".2f",
            ax=ax,
        )

        return ax

    # data getter
    @property
    def data(self) -> Union[pd.Series, pd.DataFrame]:
        """The feature data"""
        return self.__data

    # data setter
    @data.setter
    def data(self, data: Any) -> None:
        # hasattr goes through the getter, which only succeeds once data is stored
        if hasattr(self, "data"):
            # do not allow changing of data
            raise AttributeError("This attribute has already been set")

        # convert to frame and set
        self.__data = utils.to_frame(data)

    # target getter
    @property
    def target(self) -> pd.Series:
        """The target data"""
        return self.__target

    # target setter
    @target.setter
    def target(self, target: Any) -> None:
        # also rejected when the instance was constructed without a target
        if hasattr(self, "target") or not self.__has_target:
            # do not allow changing of data
            raise AttributeError("This attribute has already been set")

        # convert to series and set
        self.__target = utils.to_series(target)

    # prediction matrix getter
    @property
    def prediction_matrix(self) -> Optional[pd.DataFrame]:
        """The prediction matrix data, or None if no matrix was calculated"""
        return self.__prediction_matrix