def core_eia860m__changelog_generators(
    raw_eia860m__generator_proposed,
    raw_eia860m__generator_existing,
    raw_eia860m__generator_retired,
) -> pd.DataFrame:
    """Changelog of EIA-860M Generators based on operating status.

    The monthly reported EIA-860M tables include existing, proposed and retired
    generators. This table combines all monthly reported data and preserves the
    first reported record when any new information about the generator was
    reported.

    We are not putting this table through PUDL's standard normalization process
    for EIA tables (see :func:`pudl.transform.eia.harvest_entity_tables`).
    EIA-860M includes provisional data reported monthly so it changes frequently
    compared to the more stable annually reported EIA data. If we fed all of the
    EIA-860M data into the harvesting process, we would get failures because the
    records from EIA-860M are too inconsistent for our thresholds for harvesting
    canonical values for entities.

    A ramification of this table not being harvested is that if there are any
    entities (generators, plants, utilities) that were only ever reported in an
    older EIA-860M file, there will be no record of it in the PUDL entity or SCD
    tables. Therefore, this asset cannot have foreign key relationships with the
    rest of the core EIA tables.

    Args:
        raw_eia860m__generator_proposed: Raw EIA-860M proposed generators table.
        raw_eia860m__generator_existing: Raw EIA-860M existing generators table.
        raw_eia860m__generator_retired: Raw EIA-860M retired generators table.

    Returns:
        One row per generator for each month in which a newly reported set of
        values first appears, with ``report_date_max`` (the latest report date
        for that generator) and ``valid_until_date`` columns added.

    Raises:
        ValueError: If, after removing the one known duplicate generator, any
            records still share the same ``plant_id_eia``, ``generator_id`` and
            ``report_date``.
    """
    # Compile all of the columns so the 860M tables have everything the 860
    # transform expects.
    eia860_columns = pudl.helpers.dedupe_n_flatten_list_of_lists(
        [
            pudl.extract.excel.ExcelMetadata("eia860").get_all_columns(gen_table)
            for gen_table in [
                "generator_proposed",
                "generator_existing",
                "generator_retired",
                "generator",
            ]
        ]
    )
    eia860m_all = pudl.transform.eia860._core_eia860__generators(
        raw_eia860__generator_proposed=raw_eia860m__generator_proposed,
        raw_eia860__generator_existing=raw_eia860m__generator_existing,
        # The retired table gets an all-null operational_status_code column
        # (presumably because the 860 transform expects it -- TODO confirm).
        raw_eia860__generator_retired=raw_eia860m__generator_retired.assign(
            operational_status_code=pd.NA
        ),
        # Pass an empty generator df here. 860 old years had one big gens tab
        # but 860m doesn't. We do this just to enable us to run the 860
        # transform function. We add all of the columns to it so we don't have
        # any errors from missing columns.
        raw_eia860__generator=pd.DataFrame(
            columns=list(eia860_columns)
        ).convert_dtypes(),
    ).assign(
        # In order to be able to compare the values in this table to those
        # reported elsewhere, we need to translate these categories to the
        # associated codes, as the strings associated with the codes vary from
        # table to table. See the core_eia__codes_sector_consolidated table for
        # the canonical definition of the codes.
        sector_id_eia=lambda df: df["sector_name_eia"].map(
            {
                "Electric Utility": 1,
                "IPP Non-CHP": 2,
                "IPP CHP": 3,
                "Commercial Non-CHP": 4,
                "Commercial CHP": 5,
                "Industrial Non-CHP": 6,
                "Industrial CHP": 7,
            }
        )
    )
    # Drop all columns that aren't part of EIA-860M prior to deduplication.
    # valid_until_date is excluded because it is only computed further below.
    eia860m_all = eia860m_all[
        [
            field.name
            for field in pudl.metadata.classes.Resource.from_id(
                "core_eia860m__changelog_generators"
            ).schema.fields
            if field.name != "valid_until_date"
        ]
    ]
    # There is one plant/gen that has duplicate values. Only that generator is
    # de-duplicated on the key columns; everything else is passed through so
    # that unexpected duplicates are caught by the check below.
    gens_idx = ["plant_id_eia", "generator_id", "report_date"]
    dupe_mask = (eia860m_all.plant_id_eia == 56032) & (
        eia860m_all.generator_id == "1"
    )
    deduped = eia860m_all[dupe_mask].drop_duplicates(subset=gens_idx, keep="first")
    without_known_dupes = eia860m_all[~dupe_mask]
    eia860m_deduped = pd.concat([without_known_dupes, deduped])
    # Check whether we have truly deduplicated the dataframe.
    remaining_dupes = eia860m_deduped[
        eia860m_deduped.duplicated(subset=gens_idx, keep=False)
    ]
    if not remaining_dupes.empty:
        raise ValueError(
            f"Duplicate generator records found in 860m table: {remaining_dupes}"
        )

    gen_idx_no_date = [c for c in gens_idx if c != "report_date"]
    eia860m_all = pudl.helpers.expand_timeseries(
        df=eia860m_deduped,
        key_cols=gen_idx_no_date,
        date_col="report_date",
        freq="MS",
        fill_through_freq="month",
    )
    # Assign a max report_date column for use in the valid_until_date column.
    eia860m_all["report_date_max"] = eia860m_all.groupby(gen_idx_no_date)[
        "report_date"
    ].transform("max")
    # Drop duplicates after sorting by date so we get the first appearance of
    # each distinct set of reported values.
    # NOTE(review): duplicates are detected across all dates, not only between
    # consecutive months, so a generator that reverts to previously reported
    # values appears not to get a new changelog row -- confirm this is intended.
    eia860m_changelog = eia860m_all.sort_values(
        by=["report_date"], ascending=True
    ).drop_duplicates(
        subset=[c for c in eia860m_all if c != "report_date"],
        keep="first",
    )
    # With records in descending date order, shifting within each generator
    # hands every record the report_date of the next-later changelog record.
    # The latest record of each generator has nothing after it, so it is filled
    # with that generator's report_date_max. Finally the value is nulled out
    # wherever the record's own report_date already equals report_date_max.
    eia860m_changelog["valid_until_date"] = (
        eia860m_changelog.sort_values(gens_idx, ascending=False)
        .groupby(gen_idx_no_date)["report_date"]
        .transform("shift")
        .fillna(eia860m_changelog.report_date_max)
        .where(
            eia860m_changelog["report_date"] != eia860m_changelog["report_date_max"]
        )
    )
    return eia860m_changelog