Source code for pudl.extract.eia860

"""Retrieve data from EIA Form 860 spreadsheets for analysis.

This modules pulls data from EIA's published Excel spreadsheets.

This code is for use analyzing EIA Form 860 data.
import pandas as pd

import pudl.logging_helpers
from pudl.extract import excel
from pudl.helpers import remove_leading_zeros_from_numeric_strings
from pudl.settings import Eia860Settings

[docs]logger = pudl.logging_helpers.get_logger(__name__)
[docs]class Extractor(excel.GenericExtractor): """Extractor for the excel dataset EIA860.""" def __init__(self, *args, **kwargs): """Initialize the module. Args: ds (:class:datastore.Datastore): Initialized datastore. """ self.METADATA = excel.Metadata("eia860") self.cols_added = [] super().__init__(*args, **kwargs)
[docs] def process_raw(self, df, page, **partition): """Apply necessary pre-processing to the dataframe. * Rename columns based on our compiled spreadsheet metadata * Add report_year if it is missing * Add a flag indicating if record came from EIA 860, or EIA 860M * Fix any generator_id values with leading zeroes. """ df = df.rename(columns=self._metadata.get_column_map(page, **partition)) if "report_year" not in df.columns: df["report_year"] = list(partition.values())[0] self.cols_added = ["report_year"] # Eventually we should probably make this a transform if "generator_id" in df.columns: df = remove_leading_zeros_from_numeric_strings( df=df, col_name="generator_id" ) df = self.add_data_maturity(df, page, **partition) return df
[docs] def extract(self, settings: Eia860Settings = Eia860Settings()): """Extracts dataframes. Returns dict where keys are page names and values are DataFrames containing data across given years. Args: settings: Object containing validated settings relevant to EIA 860. Contains the tables and years to be loaded into PUDL. """ return super().extract(year=settings.years)
[docs] def get_dtypes(page, **partition): """Returns dtypes for plant id columns.""" return { "Plant ID": pd.Int64Dtype(), "Plant Id": pd.Int64Dtype(), }