Source code for pudl.transform.eia_bulk_elec

"""Clean and normalize EIA bulk electricity data.

EIA's bulk electricity data contains 680,000 timeseries. These timeseries contain a
variety of measures (fuel amount and cost are just two) across multiple levels of
aggregation, from individual plants to national averages.

The data is formatted as a single 1.1GB text file of line-delimited JSON with one line
per timeseries. Each JSON structure has two nested levels: the top level contains
metadata describing the series and the second level (under the "data" heading) contains
an array of timestamp/value pairs. This structure leads to a natural normalization into
two tables: one of metadata and one of timeseries. That is the format delivered by the
extract module.

The transform module parses a compound primary key out of long string IDs ("series_id").
The rest of the metadata is not very valuable, so it is not transformed or returned.

The EIA aggregates are related to their component categories via a set of association
tables defined in pudl.metadata.dfs. For example, the "all_coal" fuel aggregate is
linked to all the coal-related energy_source_code values: BIT, SUB, LIG, and WC. Similar
relationships are defined for aggregates over fuel, sector, geography, and time.
"""

import pandas as pd


def _extract_keys_from_series_id(raw_df: pd.DataFrame) -> pd.DataFrame:
    """Split each EIA series_id into its component key codes.

    The series_id string is split on "." and "-" into separate columns. The
    first piece (a constant value of "ELEC") is discarded, and the remaining
    five pieces form the compound primary key that uniquely identifies a data
    series: (metric, fuel, region, sector, frequency).

    Args:
        raw_df: dataframe with a string column named "series_id".

    Returns:
        A dataframe sharing the index of ``raw_df`` with the columns
        "series_code", "fuel_agg", "geo_agg", "sector_agg" and "temporal_agg".
    """
    key_names = ["series_code", "fuel_agg", "geo_agg", "sector_agg", "temporal_agg"]
    series_ids = raw_df.loc[:, "series_id"]
    id_parts = series_ids.str.split(r"[\.-]", expand=True, regex=True)
    # Column 0 holds the leading "ELEC" token, which carries no information.
    keys = id_parts.drop(columns=0)
    keys.columns = pd.Index(key_names)
    return keys
def _map_key_codes_to_readable_values(compound_keys: pd.DataFrame) -> pd.DataFrame:
    """Translate terse EIA key codes into readable category names.

    The "fuel_agg", "sector_agg" and "temporal_agg" columns are mapped through
    fixed lookup tables; any other columns are passed through unchanged. Every
    column of the result is then converted to the pandas "category" dtype. The
    input dataframe is not modified.

    Args:
        compound_keys: dataframe of key codes containing at least the columns
            "fuel_agg", "sector_agg" and "temporal_agg".

    Returns:
        A copy of ``compound_keys`` with the three mapped columns replaced by
        their readable values and all columns cast to "category".

    Raises:
        AssertionError: if any of the three mapped columns contains a value
            (including a null) that has no entry in its lookup table.
    """
    mappings = {
        "fuel_agg": {  # match values in pudl.metadata.dfs.py:EIA_FUEL_AGGREGATE_ASSN
            "BIT": "bituminous_coal",
            "SUB": "sub_bituminous_coal",
            "LIG": "lignite_coal",
            "COW": "all_coal",
            "NG": "natural_gas",
            "PC": "petroleum_coke",
            "PEL": "petroleum_liquids",
        },
        "sector_agg": {  # match values in pudl.metadata.dfs.py:EIA_SECTOR_AGGREGATE_ASSN
            "1": "electric_utility",
            "2": "ipp_non_cogen",
            "3": "ipp_cogen",
            "4": "commercial_non_cogen",
            "5": "commercial_cogen",
            "6": "industrial_non_cogen",
            "7": "industrial_cogen",
            "94": "all_ipp",
            "96": "all_commercial",
            "97": "all_industrial",
            "98": "all_electric_power",  # all_IPP + regulated utilities
            "99": "all_sectors",
        },
        "temporal_agg": {
            "M": "monthly",
            "Q": "quarterly",
            "A": "annual",
        },
    }
    keys = compound_keys.copy()
    for col_name, mapping in mappings.items():
        mapped = keys[col_name].map(mapping)
        unmapped = mapped.isna()
        # Raise explicitly rather than using an ``assert`` statement so the
        # check still runs under ``python -O``. AssertionError is kept so that
        # existing callers see the same exception type as before.
        if unmapped.any():
            bad_codes = keys.loc[unmapped, col_name].unique().tolist()
            raise AssertionError(
                f"{col_name} contains an unmapped category. "
                f"Unmapped codes: {bad_codes}"
            )
        keys[col_name] = mapped
    return keys.astype("category")
def _transform_timeseries(raw_ts: pd.DataFrame) -> pd.DataFrame:
    """Reshape the raw timeseries into a tidy, readable table.

    The opaque series_id is replaced by a readable compound primary key, and
    the table is pivoted so that each series code becomes its own value column.

    Args:
        raw_ts: raw timeseries dataframe containing a "series_id" column, a
            "date" column and the observed values.

    Returns:
        A dataframe with compound key ("fuel_agg", "geo_agg", "sector_agg",
        "temporal_agg", "report_date") and two value columns:
        "fuel_received_mmbtu", "fuel_cost_per_mmbtu"
    """
    key_codes = _extract_keys_from_series_id(raw_ts)
    readable_keys = _map_key_codes_to_readable_values(key_codes)
    observations = raw_ts.drop(columns="series_id")
    long_format = pd.concat([readable_keys, observations], axis=1)

    key_columns = ["fuel_agg", "geo_agg", "sector_agg", "temporal_agg", "date"]
    wide = long_format.pivot(index=key_columns, columns="series_code")
    # The pivot leaves the value column's name as an unnamed outer column
    # level; level=None selects that unnamed level so only the series codes
    # remain as column labels.
    wide.columns = wide.columns.droplevel(level=None)
    # "series_code" no longer describes the columns once they are flattened.
    wide.columns.name = None
    wide = wide.reset_index(drop=False)

    # convert units from billion BTU to MMBTU for consistency with other PUDL tables
    wide.loc[:, "RECEIPTS_BTU"] *= 1000

    readable_names = {
        "RECEIPTS_BTU": "fuel_received_mmbtu",
        "COST_BTU": "fuel_cost_per_mmbtu",
        "date": "report_date",
    }
    return wide.rename(columns=readable_names)
# TODO (bendnorman): Are we planning on extracting multiple dataframes from the EIA API?
def transform(raw_dfs: dict[str, pd.DataFrame]) -> pd.DataFrame:
    """Transform raw EIA bulk electricity aggregates.

    Only the "timeseries" entry of ``raw_dfs`` is used. The "metadata" entry is
    mostly useless once the keys have been joined into the timeseries, so it is
    neither transformed nor returned.

    Args:
        raw_dfs: raw dataframes keyed by name; must contain "timeseries".

    Returns:
        Transformed timeseries dataframe with compound key:
        ("fuel_agg", "geo_agg", "sector_agg", "temporal_agg", "report_date")
        and two value columns: "fuel_received_mmbtu", "fuel_cost_per_mmbtu"
    """
    raw_timeseries = raw_dfs["timeseries"]
    return _transform_timeseries(raw_timeseries)