Source code for eda_report.document

import logging
from typing import Iterable, Sequence, Union

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Inches, Pt
from docx.text.paragraph import Paragraph
from pandas import DataFrame, Series

from eda_report._content import _ReportContent

# Configure the root logger: INFO and above, rendered as
# "[LEVEL HH:MM:SS.mmm] message".
logging.basicConfig(
    level=logging.INFO,
    datefmt="%H:%M:%S",
    format="[%(levelname)s %(asctime)s.%(msecs)03d] %(message)s",
)
# Only let WARNING and above through from the "matplotlib" logger.
mpl_logger = logging.getLogger("matplotlib")
mpl_logger.setLevel(logging.WARNING)


class ReportDocument(_ReportContent):
    """Creates a report :class:`~docx.document.Document` with analysis results.

    The report consists of 3 main sections:

    #. An **Overview** of the data and its features.
    #. **Univariate Analysis**: Summary statistics and graphs for each
       feature.
    #. **Bivariate Analysis**: Pair-wise comparisons of numerical features.

    Instantiating the class builds the whole document and saves it to
    ``output_filename``.

    Args:
        data (Iterable): The data to analyze.
        title (str, optional): The title to assign the report. Defaults to
            "Exploratory Data Analysis Report".
        graph_color (str, optional): The color to apply to the graphs.
            Defaults to "cyan".
        groupby_variable (Union[str, int], optional): The column to use to
            group values. Defaults to None.
        output_filename (str, optional): The name/path to save the document
            to. Defaults to "eda-report.docx".
        table_style (str, optional): The style to apply to the tables
            created. Defaults to "Table Grid".
    """

    def __init__(
        self,
        data: Iterable,
        *,
        title: str = "Exploratory Data Analysis Report",
        graph_color: str = "cyan",
        groupby_variable: Union[str, int, None] = None,
        output_filename: str = "eda-report.docx",
        table_style: str = "Table Grid",
    ) -> None:
        super().__init__(
            data,
            title=title,
            graph_color=graph_color,
            groupby_variable=groupby_variable,
        )
        self.OUTPUT_FILENAME = output_filename
        self.TABLE_STYLE = table_style
        self.document = Document()

        # Populate the document section by section, then write it to disk.
        self._create_cover_page()
        self._get_univariate_analysis()
        # The bivariate section is skipped when no correlation values exist.
        if self.dataset._correlation_values is not None:
            self._get_bivariate_analysis()

        self._to_file()
        logging.info(f"Done. Results saved as {self.OUTPUT_FILENAME!r}")

    def _create_cover_page(self) -> None:
        """Add a title and overview of the data."""
        self.document.add_heading(self.TITLE, level=0)
        self.document.add_paragraph(self.intro_text)
        self._get_numeric_overview_table()
        self._get_categorical_overview_table()
        self.document.add_page_break()

    def _get_numeric_overview_table(self) -> None:
        """Create a table with an overview of the numeric features present.

        Nothing is added when the dataset has no numeric statistics.
        """
        if self.dataset._numeric_stats is None:
            return

        heading = self.document.add_heading(
            "Overview of Numeric Features", level=1
        )
        self._format_paragraph_spacing(heading)
        # count | avg | stddev | min | 25% | 50% | 75% | max
        self._create_table(
            data=self.dataset._numeric_stats,
            header=True,
            column_widths=(1.2,) + (0.7,) * 8,
            font_size=8.5,
            style="Normal Table",
        )

    def _get_categorical_overview_table(self) -> None:
        """Create a table with an overview of the categorical features
        present.

        Nothing is added when the dataset has no categorical statistics.
        """
        if self.dataset._categorical_stats is None:
            return

        heading = self.document.add_heading(
            "Overview of Categorical Features", level=1
        )
        self._format_paragraph_spacing(heading)
        # column-name | count | unique | top | freq | relative freq
        self._create_table(
            data=self.dataset._categorical_stats,
            header=True,
            column_widths=(1.2,) + (0.9,) * 5,
            font_size=8.5,
            style="Normal Table",
        )

    def _get_univariate_analysis(self) -> None:
        """Get a brief introduction, summary statistics, and graphs for each
        individual variable.
        """
        univariate_heading = self.document.add_heading(
            "1. Univariate Analysis", level=1
        )
        self._format_paragraph_spacing(univariate_heading, before=0, after=0)

        for idx, variable in enumerate(self.variables.values(), start=1):
            var_name = variable.name
            description = self.variable_descriptions[var_name]
            summary_stats = Series(self.univariate_stats[var_name]).to_frame()
            graphs = self.univariate_graphs[var_name]
            # These two are optional: `.get` yields None when absent.
            contingency_table = self.contingency_tables.get(var_name)
            normality_tests = self.normality_tests.get(var_name)

            # Variable's title and brief description
            heading = self.document.add_heading(
                f"1.{idx} {var_name}".title(), level=2
            )
            self._format_paragraph_spacing(heading, before=12, after=5)
            self.document.add_paragraph(description)

            # Summary statistics table
            stats_heading = self.document.add_heading(
                "Summary Statistics", level=4
            )
            stats_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
            self._create_table(summary_stats, column_widths=[2.5, 2])

            # Images of plotted graphs
            for name, image in graphs.items():
                width = 3.3 if name == "prob_plot" else 4.2
                self.document.add_picture(image, width=Inches(width))
                # Center the paragraph that now ends the document, i.e. the
                # one holding the picture just added.
                picture_paragraph = self.document.paragraphs[-1]
                picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

            if contingency_table is not None:
                contingency_table_heading = self.document.add_heading(
                    "Contingency table", level=4
                )
                contingency_table_heading.alignment = (
                    WD_ALIGN_PARAGRAPH.CENTER
                )
                context = self.document.add_paragraph(
                    f"Index = '{var_name}', "
                    f"Columns = '{self.GROUPBY_DATA.name}' "
                )
                context.alignment = WD_ALIGN_PARAGRAPH.CENTER
                context.runs[0].font.size = Pt(8)

                # Share a fixed total width evenly among the data columns;
                # wider tables (more than 5 columns) get a larger budget.
                n_cols = contingency_table.shape[1]
                max_width = 5.2 if n_cols > 5 else 3.2
                col_width = max_width / n_cols
                self._create_table(
                    data=contingency_table,
                    header=True,
                    column_widths=(1.2,) + (col_width,) * n_cols,
                    font_size=8.5,
                )

            if normality_tests is not None:
                norm_test_heading = self.document.add_heading(
                    "Tests for Normality", level=4
                )
                norm_test_heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
                # type | p-value | conclusion
                self._create_table(
                    data=normality_tests,
                    header=True,
                    column_widths=(2.2, 1, 2),
                    font_size=8.5,
                    style="Normal Table",
                )

            # NOTE(review): the original indentation was lost; this break is
            # placed inside the loop (one per variable). Confirm.
            self.document.add_page_break()

    def _get_bivariate_analysis(self) -> None:
        """Get comparisons and regression-plots for pairs of numeric
        variables.
        """
        bivariate_heading = self.document.add_heading(
            "2. Bivariate Analysis", level=1
        )
        self._format_paragraph_spacing(bivariate_heading, before=0)

        overview_heading = self.document.add_heading("2.1 Overview", level=2)
        self._format_paragraph_spacing(overview_heading)
        self.document.add_picture(
            self.bivariate_graphs["correlation_plot"],
            width=Inches(6.4),
        )
        picture_paragraph = self.document.paragraphs[-1]
        picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
        self.document.add_page_break()

        pairwise_heading = self.document.add_heading(
            "2.2 Regression Plots (Top 20)", level=2
        )
        self._format_paragraph_spacing(pairwise_heading, before=0)

        # `bivariate_summaries` is keyed by variable pair; the same key
        # looks up the matching regression plot.
        for idx, var_pair in enumerate(self.bivariate_summaries, start=1):
            heading = self.document.add_heading(
                f"2.2.{idx} {var_pair[0]} vs {var_pair[1]}".title(), level=3
            )
            self._format_paragraph_spacing(heading, before=16, after=5)
            self.document.add_paragraph(self.bivariate_summaries[var_pair])
            self.document.add_picture(
                self.bivariate_graphs["regression_plots"][var_pair],
                width=Inches(3.3),
            )
            picture_paragraph = self.document.paragraphs[-1]
            picture_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

    def _format_paragraph_spacing(
        self, paragraph: Paragraph, before: int = 15, after: int = 7
    ) -> None:
        """Set the spacing above or below a paragraph.

        Args:
            paragraph (docx.text.paragraph.Paragraph): A paragraph.
            before (int, optional): Size of spacing above the paragraph in
                pt. Defaults to 15.
            after (int, optional): Size of spacing below the paragraph in
                pt. Defaults to 7.
        """
        paragraph.paragraph_format.space_before = Pt(before)
        paragraph.paragraph_format.space_after = Pt(after)

    def _create_table(
        self,
        data: DataFrame,
        column_widths: Sequence = (),
        font_face: str = "Courier New",
        font_size: float = 10,
        style: Union[str, None] = None,
        header: bool = False,
    ) -> None:
        """Generates a table for the supplied ``data``.

        Each row of ``data`` becomes one table row whose first cell holds
        the row's index label. Values that accept a float format spec are
        shown with at most 4 decimal places (trailing zeros stripped);
        anything else is shown via its default string form.

        Args:
            data (DataFrame): The data to tabulate.
            column_widths (Sequence, optional): Column dimensions in inches,
                one per table column (index column first). When empty, the
                table gets one column for the index plus one per column of
                ``data`` and no explicit widths are set. Defaults to ().
            font_face (str, optional): Font for cell text. Defaults to
                "Courier New".
            font_size (float, optional): Font size. Defaults to 10.
            style (str, optional): A `Word` table style. Defaults to None,
                in which case the instance's ``TABLE_STYLE`` is used.
            header (bool, optional): Whether or not to include column names.
                Defaults to False.
        """
        # Without explicit widths a zero-column table would silently drop
        # every value, so size the table from the data instead
        # (+1 accounts for the index column).
        n_cols = len(column_widths) or data.shape[1] + 1
        table = self.document.add_table(
            rows=0,
            cols=n_cols,
            style=style or self.document.styles[self.TABLE_STYLE],
        )
        table.alignment = WD_ALIGN_PARAGRAPH.CENTER

        for idx, width in enumerate(column_widths):
            table.columns[idx].width = Inches(width)

        if header:
            cells = table.add_row().cells
            # The leading empty label sits above the index column.
            header_labels = [""] + list(data.columns)
            for cell, value in zip(cells, header_labels):
                cell.text = f"{value}"
                # Font size and type-face have to be set at `run` level
                run = cell.paragraphs[0].runs[0]
                run.bold = True
                run.font.size = Pt(font_size)
                run.font.name = font_face

        # Sequentially add and populate rows
        for row_data in data.itertuples():
            cells = table.add_row().cells
            for idx, (cell, value) in enumerate(zip(cells, row_data)):
                try:
                    # Strip trailing zeros from float values
                    text = f"{value:.4f}".rstrip("0").rstrip(".")
                except (TypeError, ValueError):
                    # str raises ValueError for the "f" code; objects with
                    # no format-spec support (None, tuples, ...) raise
                    # TypeError. Both fall back to plain string form.
                    text = f"{value}"

                cell.text = text
                # Font size and type-face have to be set at `run` level
                run = cell.paragraphs[0].runs[0]
                run.font.size = Pt(font_size)
                run.font.name = font_face
                # Make first column values bold if header is True
                if idx == 0 and header:
                    run.bold = True

        # Add empty paragraph. "Spacing" for docx Table isn't yet implemented
        self.document.add_paragraph()

    def _to_file(self) -> None:
        """Save the report as a file."""
        for section in self.document.sections:
            section.left_margin = Inches(1.2)
            section.right_margin = Inches(1.2)

        self.document.save(self.OUTPUT_FILENAME)