# Source code for eda_report.bivariate

import logging
from collections.abc import Iterable
from itertools import combinations
from textwrap import indent
from typing import List

from pandas import DataFrame

from eda_report._validate import _validate_dataset


def _compute_correlation(dataframe: DataFrame) -> List:
    """Get the Pearson correlation coefficients for numeric variables.

    Args:
        dataframe (pandas.DataFrame): A 2D array of numeric data.

    Returns:
        Optional[List]: A list of column pairs and their Pearson's correlation
        coefficients; sorted by magnitude in descending order.
    """
    if dataframe is None:
        return None

    numeric_data = dataframe.select_dtypes("number")
    if numeric_data.shape[1] < 2:
        return None
    else:
        correlation_df = numeric_data.corr(method="pearson")
        unique_pairs = list(combinations(correlation_df.columns, r=2))
        correlation_info = [
            (pair, correlation_df.at[pair]) for pair in unique_pairs
        ]
        return sorted(correlation_info, key=lambda x: -abs(x[1]))


def _describe_correlation(corr_value: float) -> str:
    """Explain the nature and magnitude of correlation.

    Args:
        corr_value (str): Pearson's correlation coefficient.

    Returns:
        str: Brief description of correlation type.
    """
    nature = " positive" if corr_value > 0 else " negative"
    value = abs(corr_value)
    if value >= 0.8:
        strength = "very strong"
    elif value >= 0.6:
        strength = "strong"
    elif value >= 0.4:
        strength = "moderate"
    elif value >= 0.2:
        strength = "weak"
    elif value >= 0.05:
        strength = "very weak"
    else:
        strength = "virtually no"
        nature = ""
    return f"{strength}{ nature} correlation ({corr_value:.2f})"



# [docs]
class Dataset:
    """Analyze two-dimensional datasets to obtain descriptive statistics
    and correlation information.

    Input data is stored as a :class:`pandas.DataFrame` in order to leverage
    pandas_' built-in statistical methods.

    .. _pandas: https://pandas.pydata.org/

    Args:
        data (Iterable): The data to analyze.

    Attributes:
        data: The validated input, as returned by ``_validate_dataset``.
        _numeric_stats: Summary statistics for numeric columns, or ``None``
            if no column qualifies as numeric.
        _categorical_stats: Summary statistics for the remaining columns, or
            ``None`` if there are none.
        _correlation_values: Output of ``_compute_correlation``; ``None`` when
            bivariate analysis is skipped.
        _correlation_descriptions: Mapping of column pair to a textual
            description of their correlation. Only set when
            ``_correlation_values`` is not ``None``.

    Example:
        .. literalinclude:: examples.txt
           :lines: 79-101
    """

    def __init__(self, data: Iterable) -> None:
        self.data = _validate_dataset(data)
        self._get_summary_statistics()
        self._get_bivariate_analysis()

    def __repr__(self) -> str:
        """Get the string representation for a `Dataset`.

        The representation consists of up to three titled sections: numeric
        summary statistics, categorical summary statistics and the first 20
        correlation descriptions. Sections without content are left blank.

        Returns:
            str: The string representation of the `Dataset` instance.
        """
        if self._numeric_stats is None:
            numeric_stats = ""
        else:
            numeric_stats_title = (
                "Summary Statistics for Numeric features "
                f"({self._numeric_stats.shape[0]})"
            )
            numeric_stats = "\n".join(
                [
                    f"\n\t\t  {numeric_stats_title}",
                    f"\t\t  {'-' * len(numeric_stats_title)}",
                    indent(f"{self._numeric_stats}\n", "  "),
                ]
            )

        if self._categorical_stats is None:
            categorical_stats = ""
        else:
            categorical_stats_title = (
                "Summary Statistics for Categorical features "
                f"({self._categorical_stats.shape[0]})"
            )
            categorical_stats = "\n".join(
                [
                    f"\t{categorical_stats_title}",
                    f"\t{'-' * len(categorical_stats_title)}",
                    indent(f"{self._categorical_stats}\n", " " * 4),
                ]
            )

        # The attribute only exists when bivariate analysis was performed.
        if hasattr(self, "_correlation_descriptions"):
            # Slicing already copes with fewer than 20 entries.
            top_20 = list(self._correlation_descriptions.items())[:20]
            corr_repr = "\n".join(
                # Column labels are passed through str() so that non-string
                # labels (e.g. integers) do not break the concatenation.
                f"{' & '.join(map(str, var_pair)):>32} -> "
                f"{corr_description}"
                for var_pair, corr_description in top_20
            )
            correlation_description = "\n".join(
                [
                    "\n\t\t\tPearson's Correlation (Top 20)",
                    f"\t\t\t{'-' * 30}",
                    f"{corr_repr}",
                ]
            )
        else:
            correlation_description = ""

        return "\n".join(
            [
                f"{numeric_stats}",
                indent(f"{categorical_stats}", "\t"),
                f"{correlation_description}",
                "\t",
            ]
        )

    def _get_summary_statistics(self) -> None:
        """Compute descriptive statistics.

        Sets ``_numeric_stats`` and ``_categorical_stats``. Numeric columns
        with fewer than 11 unique values are summarized as categorical.
        """
        # No defensive copy of ``self.data`` is needed here: it is only read,
        # and the one frame that gets modified below is copied explicitly.
        numeric_data = self.data.select_dtypes("number")
        # Consider numeric columns with < 11 unique values as categorical
        categorical_with_numbers = [
            col for col in numeric_data if numeric_data[col].nunique() < 11
        ]
        numeric_data = numeric_data.drop(columns=categorical_with_numbers)
        if numeric_data.shape[1] < 1:
            self._numeric_stats = None
        else:
            numeric_stats = numeric_data.describe().T
            numeric_stats["count"] = numeric_stats["count"].astype("int")
            numeric_stats = numeric_stats.rename(
                columns={"mean": "avg", "std": "stddev"}
            )
            numeric_stats["skewness"] = numeric_data.skew(numeric_only=True)
            numeric_stats["kurtosis"] = numeric_data.kurt(numeric_only=True)
            self._numeric_stats = numeric_stats.round(4)

        # Everything not summarized as numeric is treated as categorical.
        categorical_data = self.data.drop(columns=numeric_data.columns).copy()
        if categorical_data.shape[1] < 1:
            self._categorical_stats = None
        else:
            for col in categorical_data:
                # Convert categorical columns with "unique ratio" < 0.3 to
                # categorical dtype, which would consume much less memory.
                unique_ratio = categorical_data[col].nunique() / len(
                    categorical_data
                )
                dtype = "category" if unique_ratio < 0.3 else "string"
                categorical_data[col] = categorical_data[col].astype(dtype)

            categorical_stats = categorical_data.describe().T
            # Frequency of the most common value as a share of all rows.
            categorical_stats["relative freq"] = (
                categorical_stats["freq"] / len(self.data)
            ).apply(lambda x: f"{x:.2%}")
            self._categorical_stats = categorical_stats

    def _get_bivariate_analysis(self) -> None:
        """Compare numeric column pairs.

        Stores the result of ``_compute_correlation`` in
        ``_correlation_values``. If it is ``None`` a warning is logged;
        otherwise the correlation descriptions are computed as well.
        """
        self._correlation_values = _compute_correlation(self.data)
        if self._correlation_values is None:
            logging.warning(
                "Skipped Bivariate Analysis: There are less than 2 numeric "
                "variables."
            )
        else:
            self._get_correlation_descriptions()

    def _get_correlation_descriptions(self) -> None:
        """Get brief descriptions of the nature of correlation between numeric
        column pairs.

        Sets ``_correlation_descriptions``, a dict keyed by column pair, in
        the same order as ``_correlation_values``.
        """
        self._correlation_descriptions = {
            pair: _describe_correlation(corr_value)
            for pair, corr_value in self._correlation_values
        }