"""Retrieve data from EIA Form 860M spreadsheets for analysis.This modules pulls data from EIA's published Excel spreadsheets.This code is for use analyzing EIA Form 860M data. EIA 860M is only used inconjunction with EIA 860. This module boths extracts EIA 860M and appendsthe extracted EIA 860M dataframes to the extracted EIA 860 dataframes. Examplesetup with pre-genrated `eia860_raw_dfs` and datastore as `ds`:eia860m_raw_dfs = pudl.extract.eia860m.Extractor(ds).extract( Eia860Settings.eia860m_date)eia860_raw_dfs = pudl.extract.eia860m.append_eia860m( eia860_raw_dfs=eia860_raw_dfs, eia860m_raw_dfs=eia860m_raw_dfs)"""fromdatetimeimportdatetimeimportpandasaspdfromdagsterimportAssetOut,Output,asset,multi_assetimportpudl.logging_helpersfrompudl.extractimportexcelfrompudl.helpersimportremove_leading_zeros_from_numeric_strings
class Extractor(excel.ExcelExtractor):
    """Extractor for the excel dataset EIA860M."""

    def __init__(self, *args, **kwargs):
        """Initialize the extractor.

        All positional and keyword arguments are forwarded unchanged to the
        parent initializer.

        Args:
            ds (:class:datastore.Datastore): Initialized datastore.
        """
        # Both attributes are assigned before the parent initializer runs,
        # matching the original statement order.
        self.METADATA = excel.ExcelMetadata("eia860m")
        self.cols_added = []
        super().__init__(*args, **kwargs)

    def process_raw(self, df: pd.DataFrame, page: str, **partition) -> pd.DataFrame:
        """Rename columns and add the report year, report date and data maturity.

        Args:
            df: Raw dataframe read for a single page and partition.
            page: Name of the page the dataframe belongs to.
            **partition: Partition describing the data; its first value must be
                a ``"%Y-%m"`` year-month string.

        Returns:
            The renamed dataframe with a ``report_year`` column (added only if
            missing), a ``report_date`` column, and whatever
            ``add_data_maturity`` contributes. ``generator_id`` and
            ``boiler_id``, when present, are passed through
            ``remove_leading_zeros_from_numeric_strings``.
        """
        df = df.rename(columns=self._metadata.get_column_map(page, **partition))

        # The first partition value is the year-month string; read it once.
        year_month = list(partition.values())[0]
        if "report_year" not in df.columns:
            df["report_year"] = datetime.strptime(year_month, "%Y-%m").year
        df["report_date"] = pd.to_datetime(year_month, format="%Y-%m", exact=False)
        df = self.add_data_maturity(df, page, **partition)

        # Register each added column once. This method runs for every page and
        # partition, so unconditional appends fill the list with duplicates.
        for added_col in ("report_year", "report_date"):
            if added_col not in self.cols_added:
                self.cols_added.append(added_col)

        # Eventually we should probably make this a transform
        for col in ["generator_id", "boiler_id"]:
            if col in df.columns:
                df = remove_leading_zeros_from_numeric_strings(df=df, col_name=col)
        return df

    @staticmethod
    def get_dtypes(page, **partition):
        """Return the dtypes to use for plant ID columns.

        ``page`` and ``partition`` are accepted but not used; the same mapping
        is returned for every page.
        """
        return {
            "Plant ID": pd.Int64Dtype(),
        }
def append_eia860m(
    eia860_raw_dfs: dict[str, pd.DataFrame], eia860m_raw_dfs: dict[str, pd.DataFrame]
) -> dict[str, pd.DataFrame]:
    """Append each EIA 860M page to the EIA 860 page of the same name.

    Args:
        eia860_raw_dfs: EIA 860 raw tables. Result of
            :meth:`pudl.extract.eia860.Extractor.extract`
        eia860m_raw_dfs: EIA 860M raw tables. Result of
            :meth:`Extractor.extract`

    Return:
        The same ``eia860_raw_dfs`` dict, updated in place: every page listed
        in the EIA 860M metadata has its 860M rows (minus ``report_date``)
        concatenated after the existing 860 rows.
    """
    # page names in 860m and 860 are the same.
    page_names = excel.ExcelMetadata("eia860m").get_all_pages()
    for page_name in page_names:
        annual_df = eia860_raw_dfs[page_name]
        monthly_df = eia860m_raw_dfs[page_name].drop(columns=["report_date"])
        eia860_raw_dfs[page_name] = pd.concat(
            [annual_df, monthly_df],
            ignore_index=True,
            sort=True,
        )
    return eia860_raw_dfs
def raw_eia860m__all_dfs(context):
    """Run the EIA 860M extractor for the configured year-months.

    Args:
        context: Execution context exposing ``resources.dataset_settings`` and
            ``resources.datastore``.

    Returns:
        The result of ``Extractor.extract`` for the year-months in the EIA
        860M settings (a dict of raw dataframes, per the original docstring).
    """
    resources = context.resources
    eia_settings = resources.dataset_settings.eia
    extractor = Extractor(ds=resources.datastore)
    return extractor.extract(year_month=eia_settings.eia860m.year_months)
def extract_eia860m(raw_eia860m__all_dfs: dict[str, pd.DataFrame]):
    """Wrap each raw EIA 860M dataframe in a named ``Output``.

    Args:
        raw_eia860m__all_dfs: Raw dataframes keyed by table name.

    Returns:
        A generator of ``Output`` objects, one per input dataframe, ordered by
        output name. Each output name is the table name prefixed with
        ``raw_eia860m__``.
    """
    # create descriptive table_names
    prefixed_dfs = {}
    for page_name, page_df in raw_eia860m__all_dfs.items():
        prefixed_dfs["raw_eia860m__" + page_name] = page_df

    # Names are unique dict keys, so ordering by name alone matches ordering
    # the (name, dataframe) pairs.
    ordered_names = sorted(prefixed_dfs)
    return (
        Output(output_name=name, value=prefixed_dfs[name]) for name in ordered_names
    )