import logging
import math
from collections.abc import Iterable
from itertools import combinations
from textwrap import indent
from typing import List, Optional

from pandas import DataFrame

from eda_report._validate import _validate_dataset


def _compute_correlation(dataframe: DataFrame) -> Optional[List]:
    """Get the Pearson correlation coefficients for numeric variables.

    Args:
        dataframe (pandas.DataFrame): A 2D array of numeric data.

    Returns:
        Optional[List]: A list of ``(column_pair, coefficient)`` tuples sorted
        by magnitude in descending order, or ``None`` if ``dataframe`` is
        ``None`` or has fewer than two numeric columns. Pairs whose
        coefficient is NaN are kept, but placed after all defined values.
    """
    if dataframe is None:
        return None

    numeric_data = dataframe.select_dtypes("number")
    if numeric_data.shape[1] < 2:
        return None

    correlation_df = numeric_data.corr(method="pearson")
    correlation_info = [
        # `.at[(row, col)]` looks up a single cell of the correlation matrix.
        (pair, correlation_df.at[pair])
        for pair in combinations(correlation_df.columns, r=2)
    ]

    def _by_descending_magnitude(item):
        # NaN compares False against everything, so using it directly as a
        # sort key leaves the result order unreliable. Rank NaN pairs last and
        # give them a constant secondary key so the (stable) sort keeps their
        # original relative order.
        coefficient = item[1]
        if math.isnan(coefficient):
            return (1, 0.0)
        return (0, -abs(coefficient))

    return sorted(correlation_info, key=_by_descending_magnitude)


def _describe_correlation(corr_value: float) -> str:
    """Explain the nature and magnitude of correlation.

    Args:
        corr_value (float): Pearson's correlation coefficient.

    Returns:
        str: Brief description of correlation type, e.g.
        ``"strong positive correlation (0.70)"``. Magnitudes below 0.05
        (and NaN, which fails every threshold comparison) are reported as
        ``"virtually no correlation (...)"`` without a direction.
    """
    magnitude = abs(corr_value)

    if magnitude >= 0.8:
        strength = "very strong"
    elif magnitude >= 0.6:
        strength = "strong"
    elif magnitude >= 0.4:
        strength = "moderate"
    elif magnitude >= 0.2:
        strength = "weak"
    elif magnitude >= 0.05:
        strength = "very weak"
    else:
        # Too small to meaningfully call positive or negative.
        return f"virtually no correlation ({corr_value:.2f})"

    nature = "positive" if corr_value > 0 else "negative"
    return f"{strength} {nature} correlation ({corr_value:.2f})"
class Dataset:
    """Analyze two-dimensional datasets to obtain descriptive statistics and
    correlation information.

    Input data is stored as a :class:`pandas.DataFrame` in order to leverage
    pandas_' built-in statistical methods.

    .. _pandas: https://pandas.pydata.org/

    Args:
        data (Iterable): The data to analyze.

    Example:
        .. literalinclude:: examples.txt
           :lines: 79-101
    """

    def __init__(self, data: Iterable) -> None:
        self.data = _validate_dataset(data)
        self._get_summary_statistics()
        self._get_bivariate_analysis()

    def __repr__(self) -> str:
        """Get the string representation for a `Dataset`.

        The text has up to three sections: numeric summary statistics,
        categorical summary statistics, and the first 20 correlation
        descriptions. Sections without data are rendered as empty strings.

        Returns:
            str: The string representation of the `Dataset` instance.
        """
        if self._numeric_stats is None:
            numeric_stats = ""
        else:
            numeric_stats_title = (
                "Summary Statistics for Numeric features "
                f"({self._numeric_stats.shape[0]})"
            )
            numeric_stats = "\n".join(
                [
                    f"\n\t\t{numeric_stats_title}",
                    f"\t\t{'-' * len(numeric_stats_title)}",
                    indent(f"{self._numeric_stats}\n", " "),
                ]
            )

        if self._categorical_stats is None:
            categorical_stats = ""
        else:
            categorical_stats_title = (
                "Summary Statistics for Categorical features "
                f"({self._categorical_stats.shape[0]})"
            )
            categorical_stats = "\n".join(
                [
                    f"\t{categorical_stats_title}",
                    f"\t{'-' * len(categorical_stats_title)}",
                    indent(f"{self._categorical_stats}\n", " " * 4),
                ]
            )

        # `_correlation_descriptions` only exists when bivariate analysis ran
        # (i.e. there were at least two numeric columns).
        if hasattr(self, "_correlation_descriptions"):
            # Slicing already copes with fewer than 20 entries.
            top_20 = list(self._correlation_descriptions.items())[:20]
            corr_lines = []
            for (first, second), corr_description in top_20:
                # Format the labels instead of concatenating them so that
                # non-string column labels don't raise a TypeError.
                pair_label = f"{first} & {second}"
                corr_lines.append(f"{pair_label:>32} -> {corr_description}")
            corr_repr = "\n".join(corr_lines)
            correlation_description = "\n".join(
                [
                    "\n\t\t\tPearson's Correlation (Top 20)",
                    f"\t\t\t{'-' * 30}",
                    f"{corr_repr}",
                ]
            )
        else:
            correlation_description = ""

        return "\n".join(
            [
                f"{numeric_stats}",
                indent(f"{categorical_stats}", "\t"),
                f"{correlation_description}",
                "\t",
            ]
        )

    def _get_summary_statistics(self) -> None:
        """Compute descriptive statistics.

        Sets ``self._numeric_stats`` and ``self._categorical_stats``; each is
        a summary table with one row per column, or ``None`` when there are
        no columns of that kind.
        """
        # No defensive copy of `self.data` is needed: `select_dtypes` and
        # `drop` return new objects, and the only frame mutated below
        # (`categorical_data`) is copied explicitly.
        numeric_data = self.data.select_dtypes("number")

        # Consider numeric columns with < 11 unique values as categorical
        categorical_with_numbers = [
            col for col in numeric_data if numeric_data[col].nunique() < 11
        ]
        numeric_data = numeric_data.drop(columns=categorical_with_numbers)

        if numeric_data.shape[1] < 1:
            self._numeric_stats = None
        else:
            numeric_stats = numeric_data.describe().T
            numeric_stats["count"] = numeric_stats["count"].astype("int")
            numeric_stats = numeric_stats.rename(
                columns={"mean": "avg", "std": "stddev"}
            )
            numeric_stats["skewness"] = numeric_data.skew(numeric_only=True)
            numeric_stats["kurtosis"] = numeric_data.kurt(numeric_only=True)
            self._numeric_stats = numeric_stats.round(4)

        # Everything not summarized as numeric is summarized as categorical.
        categorical_data = self.data.drop(columns=numeric_data.columns).copy()

        if categorical_data.shape[1] < 1:
            self._categorical_stats = None
        else:
            for col in categorical_data:
                # Convert categorical columns with "unique ratio" < 0.3 to
                # categorical dtype, which would consume much less memory.
                unique_ratio = categorical_data[col].nunique() / len(
                    categorical_data
                )
                if unique_ratio < 0.3:
                    categorical_data[col] = categorical_data[col].astype(
                        "category"
                    )
                else:
                    categorical_data[col] = categorical_data[col].astype(
                        "string"
                    )

            categorical_stats = categorical_data.describe().T
            # Share of all rows taken by each column's most frequent value.
            categorical_stats["relative freq"] = (
                categorical_stats["freq"] / len(self.data)
            ).apply(lambda x: f"{x:.2%}")
            self._categorical_stats = categorical_stats

    def _get_bivariate_analysis(self) -> None:
        """Compare numeric column pairs.

        Stores the result of ``_compute_correlation`` in
        ``self._correlation_values``. If that result is ``None`` a warning is
        logged; otherwise the correlation descriptions are generated.
        """
        self._correlation_values = _compute_correlation(self.data)

        if self._correlation_values is None:
            logging.warning(
                "Skipped Bivariate Analysis: There are less than 2 numeric "
                "variables."
            )
        else:
            self._get_correlation_descriptions()

    def _get_correlation_descriptions(self) -> None:
        """Get brief descriptions of the nature of correlation between numeric
        column pairs.

        Sets ``self._correlation_descriptions``, a dict mapping each column
        pair in ``self._correlation_values`` to its description.
        """
        self._correlation_descriptions = {
            pair: _describe_correlation(corr_value)
            for pair, corr_value in self._correlation_values
        }