# Source code for pudl.extract.csv

"""Extractor for CSV data."""

from typing import Any

import pandas as pd

import pudl.logging_helpers
from pudl.extract.extractor import GenericExtractor, PartitionSelection


# Module-level logger, named after this module via ``__name__``.
logger = pudl.logging_helpers.get_logger(__name__)




class CsvExtractor(GenericExtractor):
    """Class for extracting dataframes from CSV files.

    The extraction logic is invoked by calling extract() method of this class.
    """

    READ_CSV_KWARGS: dict[str, Any] = {}
    """Keyword arguments that are passed to :func:`pandas.read_csv`.

    These allow customization of the CSV parsing process. For example, you can specify
    the column delimiter, data types, date parsing, etc. This can greatly reduce peak
    memory usage and speed up the extraction process. Unfortunately you must refer to
    the column headers using their original names as they appear in the CSV.

    TODO[zaneselvans] 2024-04-19: it would be useful to be able to specify different CSV
    reading options for different pages within the same dataset. At the moment the same
    arguments will be applied to all pages. This still allows some flexibility because
    some :func:`pandas.read_csv` arguments like ``dtype`` don't raise errors if the
    columns they apply to aren't present.
    """

    def source_filename(self, page: str, **partition: PartitionSelection) -> str:
        """Produce the source CSV file name as it will appear in the archive.

        The name is built from the dataset name and the partition selection only;
        ``page`` is accepted as part of the method signature but is not used here.

        Args:
            page: pudl name for the dataset contents, eg "boiler_generator_assn" or
                "data"
            partition: partition to load. Examples:
                {'year': 2009}
                {'year_month': '2020-08'}

        Returns:
            string name of the CSV file
        """
        partition_selection = self._metadata._get_partition_selection(partition)
        return f"{self._dataset_name}_{partition_selection}.csv"

    def load_source(self, page: str, **partition: PartitionSelection) -> pd.DataFrame:
        """Produce the dataframe object for the given partition.

        Opens the zipfile resource for this dataset and partition, then reads the
        archive member named by :meth:`source_filename` with
        :func:`pandas.read_csv`, passing along :attr:`READ_CSV_KWARGS`.

        Args:
            page: pudl name for the dataset contents, eg "boiler_generator_assn" or
                "data"
            partition: partition to load. Examples:
                {'year': 2009}
                {'year_month': '2020-08'}

        Returns:
            pd.DataFrame instance containing CSV data
        """
        filename = self.source_filename(page, **partition)

        # Both the archive and the member file handle are closed on exit, after the
        # CSV has been read into memory.
        with (
            self.ds.get_zipfile_resource(self._dataset_name, **partition) as zf,
            zf.open(filename) as f,
        ):
            return pd.read_csv(f, **self.READ_CSV_KWARGS)