# Source code for eda_report.univariate

from collections.abc import Iterable
from textwrap import shorten
from typing import Dict, Optional, Tuple

import numpy as np
from pandas import DataFrame, Series
from pandas.api.types import (
    is_bool_dtype,
    is_datetime64_any_dtype,
    is_numeric_dtype,
)
from scipy import stats

from eda_report._validate import _validate_univariate_input



# [docs]
class Variable:
    """Obtain summary statistics and properties such as data type, missing
    value info & cardinality from one-dimensional datasets.

    Args:
        data (Iterable): The data to analyze.
        name (str, optional): The name to assign the variable. Defaults to
            None.

    Examples:
        .. literalinclude:: examples.txt
           :lines: 6-32
        .. literalinclude:: examples.txt
           :lines: 36-50
        .. literalinclude:: examples.txt
           :lines: 54-71
    """

    def __init__(self, data: Iterable, *, name: Optional[str] = None) -> None:
        # NOTE(review): the validator is defined in another module; the code
        # below relies on it returning a pandas.Series — confirm.
        data = _validate_univariate_input(data, name=name)

        #: str: The variable's *name*. If no name is specified, the name will
        #: be set to the value of the ``name`` attribute of the input data,
        #: or ``None``.
        self.name = data.name

        #: str: The type of variable — one of *"boolean"*, *"categorical"*,
        #: *"datetime"*, *"numeric"* or *"numeric (<=10 levels)"*.
        self.var_type = self._get_variable_type(data)

        #: int: The *number of unique values* present in the variable.
        self.num_unique = data.nunique()

        #: numpy.ndarray: The *unique values* present in the variable.
        self.unique_values = self._get_unique_values(data)

        #: str: The number of *missing values* in the form
        #: ``number (% of total count)`` e.g "4 (16.67%)".
        self.missing = self._get_missing_values_info(data)

        #: dict: Descriptive statistics
        self.summary_stats = self._get_summary_statistics(data)

        self._num_non_null = len(data.dropna())
        self._normality_test_results = self._test_for_normality(data)
        self._most_common_categories = self._get_most_common_categories(data)

    def __repr__(self) -> str:
        """Define the string representation of a `Variable`.

        The layout depends on the variable type: numeric variables show
        summary statistics and normality test results, datetime variables
        show summary statistics only, and every other type shows the mode
        plus the most common items.

        Returns:
            str: Variable summary.
        """
        sample_values = shorten(
            f"{self.num_unique} -> {self.unique_values}",
            width=60,
            placeholder=" ... ]",
        )
        basic_details = "\n".join(
            [
                f"\nName: {self.name}",
                f"Type: {self.var_type}",
                f"Non-null Observations: {self._num_non_null}",
                f"Unique Values: {sample_values}",
                f"Missing Values: {self.missing}",
            ]
        )
        if self.var_type == "numeric":
            summary_stats = "\n".join(
                f"\t{key + ':':21} {value:>15.4f}"
                for key, value in self.summary_stats.items()
            )
            return "\n".join(
                [
                    f"{basic_details}\n",
                    "\t\t  Summary Statistics",
                    "\t\t  ------------------",
                    summary_stats,
                    "\n\t\t  Tests for Normality",
                    "\t\t  -------------------",
                    f"{self._normality_test_results}",
                ]
            )
        if self.var_type == "datetime":
            summary_stats = "\n".join(
                f"\t{key + ':':18} {str(value):>22}"
                for key, value in self.summary_stats.items()
            )
            return "\n".join(
                [
                    f"{basic_details}\n",
                    "\t\t  Summary Statistics",
                    "\t\t  ------------------",
                    summary_stats,
                ]
            )

        # boolean, categorical & low-cardinality numeric variables
        summary_stats = "\n".join(
            f"{key}: {value}" for key, value in self.summary_stats.items()
        )
        most_common = "\n".join(
            f"{str(key):>24}: {value}"
            for key, value in self._most_common_categories.items()
        )
        return "\n".join(
            [
                basic_details,
                summary_stats,
                "\n\t\tMost Common Items",
                "\t\t-----------------",
                most_common,
            ]
        )

    def _get_unique_values(self, data: Series) -> np.ndarray:
        """Get the distinct non-null values, sorted whenever possible.

        Args:
            data (pandas.Series): The data to analyze.

        Returns:
            numpy.ndarray: Sorted unique values, or the unique values in
            order of first appearance if they cannot be compared.
        """
        unique_values = data.dropna().unique()
        try:
            return np.sort(unique_values)
        except TypeError:
            # Values of mutually unorderable types (e.g. ``str`` mixed with
            # ``int``) cannot be sorted; keep the order of first appearance
            # instead of failing the whole analysis.
            return np.asarray(unique_values)

    def _get_variable_type(self, data: Series) -> str:
        """Determine the variable type.

        Args:
            data (pandas.Series): The data to analyze.

        Returns:
            str: The variable type: `boolean`, `categorical`, `datetime`,
            `numeric` or `numeric (<=10 levels)`.
        """
        if is_numeric_dtype(data):
            if is_bool_dtype(data) or set(data.dropna()) == {0, 1}:
                # Consider data consisting of ones and zeros as boolean
                return "boolean"
            if data.nunique() <= 10:
                # Consider numeric data with cardinality <= 10 as categorical
                return "numeric (<=10 levels)"
            return "numeric"

        # Checked before the boolean value sets below so that no set of
        # timestamps has to be built; datetime values never match those sets.
        if is_datetime64_any_dtype(data):
            return "datetime"

        # Accommodate common values for boolean variables
        if set(data.dropna()) in [
            {False, True},
            {"False", "True"},
            {"No", "Yes"},
            {"N", "Y"},
        ]:
            return "boolean"

        return "categorical"

    def _get_missing_values_info(self, data: Series) -> Optional[str]:
        """Get the number of missing values.

        Args:
            data (pandas.Series): The data to analyze.

        Returns:
            Optional[str]: The count and proportion of missing values, e.g.
            "4 (16.67%)", or ``None`` if there are no missing values.
        """
        missing_values = data.isna().sum()
        if missing_values == 0:
            return None
        return f"{missing_values:,} ({missing_values / len(data):.2%})"

    def _get_summary_statistics(self, data: Series) -> Dict:
        """Compute summary statistics for the variable based on data type.

        Relies on ``self.var_type`` having been set beforehand.

        Args:
            data (pandas.Series): The data to analyze.

        Returns:
            Dict: Summary statistics.
        """
        if self.var_type == "numeric":
            # Named ``described`` so as not to shadow the ``scipy.stats``
            # module imported at the top of the file.
            described = data.describe()
            return {
                "Average": described["mean"],
                "Standard Deviation": described["std"],
                "Minimum": described["min"],
                "Lower Quartile": described["25%"],
                "Median": described["50%"],
                "Upper Quartile": described["75%"],
                "Maximum": described["max"],
                "Skewness": data.skew(),
                "Kurtosis": data.kurt(),
            }

        if self.var_type == "datetime":
            # Computed directly instead of through ``describe()``: before
            # pandas 2.0, ``describe()`` summarized datetimes like
            # categorical data by default, so "mean" and the percentiles
            # were missing from its result.
            lower, median, upper = data.quantile([0.25, 0.5, 0.75])
            return {
                "Average": data.mean(),
                "Minimum": data.min(),
                "Lower Quartile": lower,
                "Median": median,
                "Upper Quartile": upper,
                "Maximum": data.max(),
            }

        # ``astype`` returns a new object; the input is not modified.
        described = data.astype("category").describe()
        return {
            "Mode (Most frequent)": described["top"],
            "Maximum frequency": described["freq"],
        }

    def _test_for_normality(
        self, data: Series, alpha: float = 0.05
    ) -> Optional[DataFrame]:
        """Perform the "D'Agostino's K-squared", "Kolmogorov-Smirnov" and
        "Shapiro-Wilk" tests for normality.

        Relies on ``self.var_type`` having been set beforehand.

        Args:
            data (pandas.Series): The data to analyze.
            alpha (float, optional): The level of significance. Defaults to
                0.05.

        Returns:
            Optional[pandas.DataFrame]: Table of results (p-values and
            conclusions) for numeric variables, ``None`` for any other type.
        """
        if self.var_type != "numeric":
            return None

        data = data.dropna()
        # The scipy implementation of the Shapiro-Wilk test reports:
        # "For N > 5000 the W test statistic is accurate but the p-value
        # may not be."
        # NOTE(review): the sample is drawn without a fixed seed, so the
        # Shapiro-Wilk p-value can vary between runs for large inputs.
        shapiro_sample = data.sample(5000) if len(data) > 5000 else data
        tests = [
            "D'Agostino's K-squared test",
            "Kolmogorov-Smirnov test",
            "Shapiro-Wilk test",
        ]
        # NOTE(review): ``kstest(data, "norm")`` compares against the
        # *standard* normal distribution; the data is not standardized
        # first. Confirm that this is the intended null hypothesis.
        p_values = [
            stats.normaltest(data).pvalue,
            stats.kstest(data, "norm", N=200).pvalue,
            stats.shapiro(shapiro_sample).pvalue,
        ]
        results = DataFrame(index=tests)
        results["p-value"] = [f"{x:.7f}" for x in p_values]
        results[f"Conclusion at α = {alpha}"] = [
            "Possibly normal" if p_value > alpha else "Unlikely to be normal"
            for p_value in p_values
        ]
        return results

    def _get_most_common_categories(self, data: Series) -> Optional[Dict]:
        """Get the top 10 frequently occurring categories.

        Relies on ``self.var_type`` having been set beforehand.

        Args:
            data (pandas.Series): The data to analyze.

        Returns:
            Optional[Dict]: Top 10 categories mapped to their frequency info
            in the form ``count (% of non-null values)``, or ``None`` for
            numeric and datetime variables.
        """
        if self.var_type in {"numeric", "datetime"}:
            return None

        data = data.dropna()
        top_10 = data.value_counts().nlargest(10)
        return {
            key: f"{val} ({val / len(data):.2%})"
            for key, val in top_10.items()
        }

    def rename(self, name: str) -> None:
        """Update the variable's name.

        Args:
            name (str): New name.
        """
        self.name = name




def _analyze_univariate(name_and_data: Tuple) -> Tuple[str, Variable]:
    """Helper function to concurrently analyze data with multiprocessing.

    Takes a single tuple argument rather than two separate arguments.

    Args:
        name_and_data (Tuple): A ``(name, data)`` pair.

    Returns:
        Tuple[str, Variable]: The name, paired with the `Variable` instance
        created from the data.
    """
    name, data = name_and_data
    return name, Variable(data, name=name)