Source code for pudl.output.eia860

"""Functions for pulling data primarily from the EIA's Form 860."""

import pandas as pd
import sqlalchemy as sa

import pudl
import pudl.constants as pc


[docs]def utilities_eia860(pudl_engine, start_date=None, end_date=None): """Pulls all fields from the EIA860 Utilities table. Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing all the fields of the EIA 860 Utilities table. """ pt = pudl.output.pudltabl.get_table_meta(pudl_engine) # grab the entity table utils_eia_tbl = pt['utilities_entity_eia'] utils_eia_select = sa.sql.select([utils_eia_tbl]) utils_eia_df = pd.read_sql(utils_eia_select, pudl_engine) # grab the annual eia entity table utils_eia860_tbl = pt['utilities_eia860'] utils_eia860_select = sa.sql.select([utils_eia860_tbl]) if start_date is not None: start_date = pd.to_datetime(start_date) utils_eia860_select = utils_eia860_select.where( utils_eia860_tbl.c.report_date >= start_date ) if end_date is not None: end_date = pd.to_datetime(end_date) utils_eia860_select = utils_eia860_select.where( utils_eia860_tbl.c.report_date <= end_date ) utils_eia860_df = pd.read_sql(utils_eia860_select, pudl_engine) # grab the glue table for the utility_id_pudl utils_g_eia_tbl = pt['utilities_eia'] utils_g_eia_select = sa.sql.select([ utils_g_eia_tbl.c.utility_id_eia, utils_g_eia_tbl.c.utility_id_pudl, ]) utils_g_eia_df = pd.read_sql(utils_g_eia_select, pudl_engine) out_df = pd.merge(utils_eia_df, utils_eia860_df, how='left', on=['utility_id_eia', ]) out_df = pd.merge(out_df, utils_g_eia_df, how='left', on=['utility_id_eia', ]) out_df = out_df.drop(['id'], axis='columns') first_cols = [ 'report_date', 'utility_id_eia', 'utility_id_pudl', 'utility_name_eia', ] out_df = pudl.helpers.organize_cols(out_df, first_cols) out_df = pudl.helpers.extend_annual( out_df, start_date=start_date, end_date=end_date) return out_df
[docs]def plants_eia860(pudl_engine, start_date=None, end_date=None): """Pulls all fields from the EIA Plants tables. Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing all the fields of the EIA 860 Plants table. """ pt = pudl.output.pudltabl.get_table_meta(pudl_engine) # grab the entity table plants_eia_tbl = pt['plants_entity_eia'] plants_eia_select = sa.sql.select([plants_eia_tbl]) plants_eia_df = pd.read_sql(plants_eia_select, pudl_engine) # grab the annual table select plants_eia860_tbl = pt['plants_eia860'] plants_eia860_select = sa.sql.select([plants_eia860_tbl]) if start_date is not None: start_date = pd.to_datetime(start_date) plants_eia860_select = plants_eia860_select.where( plants_eia860_tbl.c.report_date >= start_date ) if end_date is not None: end_date = pd.to_datetime(end_date) plants_eia860_select = plants_eia860_select.where( plants_eia860_tbl.c.report_date <= end_date ) plants_eia860_df = pd.read_sql(plants_eia860_select, pudl_engine) plants_eia860_df['report_date'] = \ pd.to_datetime(plants_eia860_df['report_date']) # plant glue table plants_g_eia_tbl = pt['plants_eia'] plants_g_eia_select = sa.sql.select([ plants_g_eia_tbl.c.plant_id_eia, plants_g_eia_tbl.c.plant_id_pudl, ]) plants_g_eia_df = pd.read_sql(plants_g_eia_select, pudl_engine) out_df = pd.merge(plants_eia_df, plants_eia860_df, how='left', on=['plant_id_eia', ]) out_df = pd.merge(out_df, plants_g_eia_df, how='left', on=['plant_id_eia', ]) utils_eia_tbl = pt['utilities_eia'] utils_eia_select = sa.sql.select([ utils_eia_tbl.c.utility_id_eia, utils_eia_tbl.c.utility_id_pudl, ]) utils_eia_df = pd.read_sql(utils_eia_select, pudl_engine) out_df = pd.merge(out_df, utils_eia_df, how='left', on=['utility_id_eia', ]) out_df = out_df.drop(['id'], axis='columns') return out_df
[docs]def plants_utils_eia860(pudl_engine, start_date=None, end_date=None): """Create a dataframe of plant and utility IDs and names from EIA 860. Returns a pandas dataframe with the following columns: - report_date (in which data was reported) - plant_name_eia (from EIA entity) - plant_id_eia (from EIA entity) - plant_id_pudl - utility_id_eia (from EIA860) - utility_name_eia (from EIA860) - utility_id_pudl Note: EIA 860 data has only been integrated for 2011-2016. If earlier or later years are requested, they will be filled in with data from the first or last years. Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing plant and utility IDs and names from EIA 860. """ # Contains the one-to-one mapping of EIA plants to their operators, but # we only have the 860 data integrated for 2011 forward right now. plants_eia = plants_eia860(pudl_engine, start_date=start_date, end_date=end_date) utils_eia = utilities_eia860(pudl_engine, start_date=start_date, end_date=end_date) # to avoid duplicate columns on the merge... plants_eia = plants_eia.drop(['utility_id_pudl', 'city', 'state', 'zip_code', 'street_address'], axis='columns') out_df = pd.merge(plants_eia, utils_eia, how='left', on=['report_date', 'utility_id_eia']) cols_to_keep = ['report_date', 'plant_id_eia', 'plant_name_eia', 'plant_id_pudl', 'utility_id_eia', 'utility_name_eia', 'utility_id_pudl' ] out_df = out_df[cols_to_keep] out_df = out_df.dropna() out_df.plant_id_pudl = out_df.plant_id_pudl.astype(int) out_df.utility_id_eia = out_df.utility_id_eia.astype(int) out_df.utility_id_pudl = out_df.utility_id_pudl.astype(int) return out_df
[docs]def generators_eia860(pudl_engine, start_date=None, end_date=None): """Pull all fields reported in the generators_eia860 table. Merge in other useful fields including the latitude & longitude of the plant that the generators are part of, canonical plant & operator names and the PUDL IDs of the plant and operator, for merging with other PUDL data sources. Fill in data for adjacent years if requested, but never fill in earlier than the earliest working year of data for EIA923, and never add more than one year on after the reported data (since there should at most be a one year lag between EIA923 and EIA860 reporting) Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing all the fields of the EIA 860 Generators table. """ pt = pudl.output.pudltabl.get_table_meta(pudl_engine) # Almost all the info we need will come from here. gens_eia860_tbl = pt['generators_eia860'] gens_eia860_select = sa.sql.select([gens_eia860_tbl, ]) # To get the Lat/Lon coordinates plants_entity_eia_tbl = pt['plants_entity_eia'] plants_entity_eia_select = sa.sql.select([ plants_entity_eia_tbl.c.plant_id_eia, plants_entity_eia_tbl.c.plant_name_eia, plants_entity_eia_tbl.c.latitude, plants_entity_eia_tbl.c.longitude, plants_entity_eia_tbl.c.state, plants_entity_eia_tbl.c.balancing_authority_code, plants_entity_eia_tbl.c.balancing_authority_name, plants_entity_eia_tbl.c.iso_rto_name, plants_entity_eia_tbl.c.iso_rto_code, ]) if start_date is not None: start_date = pd.to_datetime(start_date) # We don't want to get too crazy with the date extensions... # start_date shouldn't go back before the earliest working year of # EIA 923 eia923_start_date = \ pd.to_datetime('{}-01-01'.format( min(pc.working_years['eia923']))) if start_date < eia923_start_date: raise AssertionError(f""" EIA 860 generators start_date ({start_date}) is before the earliest EIA 923 data is available ({eia923_start_date}). That's too much backfilling.""") gens_eia860_select = gens_eia860_select.where( gens_eia860_tbl.c.report_date >= start_date ) if end_date is not None: end_date = pd.to_datetime(end_date) # end_date shouldn't be more than one year ahead of the most recent # year for which we have EIA 860 data: eia860_end_date = \ pd.to_datetime('{}-12-31'.format( max(pc.working_years['eia860']))) if end_date > eia860_end_date + pd.DateOffset(years=1): raise AssertionError(f""" EIA 860 end_date ({end_date}) is more than a year after the most recent EIA 860 data available ({eia860_end_date}). That's too much forward filling.""") gens_eia860_select = gens_eia860_select.where( gens_eia860_tbl.c.report_date <= end_date ) gens_eia860 = pd.read_sql(gens_eia860_select, pudl_engine) plants_entity_eia_df = pd.read_sql(plants_entity_eia_select, pudl_engine) out_df = pd.merge(gens_eia860, plants_entity_eia_df, how='left', on=['plant_id_eia']) out_df.report_date = pd.to_datetime(out_df.report_date) # Bring in some generic plant & utility information: pu_eia = plants_utils_eia860(pudl_engine, start_date=start_date, end_date=end_date) out_df = pd.merge(out_df, pu_eia, on=['report_date', 'plant_id_eia', 'plant_name_eia', 'utility_id_eia']) # Drop a few extraneous fields... out_df = out_df.drop(['id'], axis='columns') # In order to be able to differentiate betweet single and multi-fuel # plants, we need to count how many different simple energy sources there # are associated with plant's generators. This allows us to do the simple # lumping of an entire plant's fuel & generation if its primary fuels # are homogeneous, and split out fuel & generation by fuel if it is # hetereogeneous. ft_count = out_df[['plant_id_eia', 'fuel_type_code_pudl', 'report_date']].\ drop_duplicates().groupby(['plant_id_eia', 'report_date']).count() ft_count = ft_count.reset_index() ft_count = ft_count.rename( columns={'fuel_type_code_pudl': 'fuel_type_count'}) out_df = pd.merge(out_df, ft_count, how='left', on=['plant_id_eia', 'report_date']) first_cols = [ 'report_date', 'plant_id_eia', 'plant_id_pudl', 'plant_name_eia', 'utility_id_eia', 'utility_id_pudl', 'utility_name_eia', 'generator_id', ] # Re-arrange the columns for easier readability: out_df = pudl.helpers.organize_cols(out_df, first_cols) out_df = pudl.helpers.extend_annual( out_df, start_date=start_date, end_date=end_date) out_df = out_df.sort_values(['report_date', 'plant_id_eia', 'generator_id']) return out_df
[docs]def boiler_generator_assn_eia860(pudl_engine, start_date=None, end_date=None): """Pull all fields from the EIA 860 boiler generator association table. Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing all the fields from the EIA 860 boiler generator association table. """ pt = pudl.output.pudltabl.get_table_meta(pudl_engine) bga_eia860_tbl = pt['boiler_generator_assn_eia860'] bga_eia860_select = sa.sql.select([bga_eia860_tbl]) if start_date is not None: start_date = pd.to_datetime(start_date) bga_eia860_select = bga_eia860_select.where( bga_eia860_tbl.c.report_date >= start_date ) if end_date is not None: end_date = pd.to_datetime(end_date) bga_eia860_select = bga_eia860_select.where( bga_eia860_tbl.c.report_date <= end_date ) bga_eia860_df = pd.read_sql(bga_eia860_select, pudl_engine) bga_eia860_df = bga_eia860_df.drop(['id'], axis='columns') out_df = pudl.helpers.extend_annual(bga_eia860_df, start_date=start_date, end_date=end_date) return out_df
[docs]def ownership_eia860(pudl_engine, start_date=None, end_date=None): """Pull a useful set of fields related to ownership_eia860 table. Args: pudl_engine (sqlalchemy.engine.Engine): SQLAlchemy connection engine for the PUDL DB. start_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. end_date (date-like): date-like object, including a string of the form 'YYYY-MM-DD' which will be used to specify the date range of records to be pulled. Dates are inclusive. Returns: pandas.DataFrame: A DataFrame containing a useful set of fields related to the EIA 860 Ownership table. """ o_df = pd.read_sql("ownership_eia860", pudl_engine) pu_eia = plants_utils_eia860(pudl_engine, start_date=start_date, end_date=end_date) pu_eia = pu_eia[['plant_id_eia', 'plant_id_pudl', 'plant_name_eia', 'utility_name_eia', 'utility_id_pudl', 'report_date']] o_df['report_date'] = pd.to_datetime(o_df.report_date) out_df = pd.merge(o_df, pu_eia, how='left', on=['report_date', 'plant_id_eia']) out_df = out_df.drop(['id'], axis='columns') out_df = out_df.dropna(subset=[ 'plant_id_eia', 'plant_id_pudl', 'utility_id_eia', 'utility_id_pudl', 'generator_id', 'owner_utility_id_eia', ]) first_cols = [ 'report_date', 'plant_id_eia', 'plant_id_pudl', 'plant_name_eia', 'utility_id_eia', 'utility_id_pudl', 'utility_name_eia', 'generator_id', 'owner_utility_id_eia', 'owner_name', ] # Re-arrange the columns for easier readability: out_df = pudl.helpers.organize_cols(out_df, first_cols) out_df['plant_id_pudl'] = out_df.plant_id_pudl.astype(int) out_df['utility_id_pudl'] = out_df.utility_id_pudl.astype(int) out_df = pudl.helpers.extend_annual( out_df, start_date=start_date, end_date=end_date) return out_df