Source code for pudl.extract.parquet

"""Extractor for Parquet data."""

import io

import pandas as pd

import pudl.logging_helpers
from pudl.extract.extractor import GenericExtractor, PartitionSelection

# Module-level logger, obtained from pudl.logging_helpers.get_logger and keyed
# by this module's __name__. It is not referenced by the code visible in this
# module.
[docs] logger = pudl.logging_helpers.get_logger(__name__)
class ParquetExtractor(GenericExtractor):
    """Extractor that turns archived Parquet files into dataframes.

    Extraction is triggered by calling ``extract()`` on an instance. That
    method is not defined in this class; presumably it is inherited from
    ``GenericExtractor`` -- confirm against the base class.
    """

    def source_filename(self, page: str, **partition: PartitionSelection) -> str:
        """Build the Parquet file name as it will appear in the archive.

        The name is assembled from the dataset name and the partition
        selection only; ``page`` is not used by this implementation.

        Args:
            page: pudl name for the dataset contents, eg
                "boiler_generator_assn" or "data"
            partition: partition to load. Examples:
                {'year': 2009}

        Returns:
            string name of the parquet file
        """
        selection = self._metadata._get_partition_selection(partition)
        return f"{self._dataset_name}-{selection}.parquet"

    def load_source(self, page: str, **partition: PartitionSelection) -> pd.DataFrame:
        """Read the Parquet resource for the given partition into a dataframe.

        This method assumes that the archive includes one unzipped file per
        partition. ``page`` is not used by this implementation; the resource
        is looked up by dataset name and partition alone.

        Args:
            page: pudl name for the dataset contents, eg
                "boiler_generator_assn" or "data"
            partition: partition to load. Examples:
                {'year': 2009}
                {'year_month': '2020-08'}

        Returns:
            pd.DataFrame instance containing the Parquet data
        """
        raw_bytes = self.ds.get_unique_resource(self._dataset_name, **partition)
        # The resource is wrapped in an in-memory binary buffer so pandas can
        # read it like a file.
        buffer = io.BytesIO(raw_bytes)
        return pd.read_parquet(buffer)