"""
Retrieve data from EPA's Integrated Planning Model (IPM) v6.
Unlike most of the PUDL data sources, IPM is not an annual timeseries. This
file assumes that only v6 will be used as an input, so there are a limited
number of files.
This module was written by :user:`gschivley`
"""
import io
import logging
from pathlib import Path
from typing import Any, Dict, List, NamedTuple
import pandas as pd
from pudl.workspace.datastore import Datastore
# $ from pudl.workspace import datastore as datastore
logger = logging.getLogger(__name__)
class TableSettings(NamedTuple):
    """Contains information for how to access and load EpaIpm dataframes."""

    # Identifier of the table; EpaIpmDatastore.get_table_settings matches on it.
    table_name: str
    # Name of the resource file that holds the table. Its suffix (.xlsx or
    # .csv) decides which pandas reader EpaIpmDatastore.get_dataframe uses.
    file: str
    # Options used when the file is read with pandas.read_excel; not consulted
    # for CSV files. NOTE: the default dict is a single object shared by every
    # instance that omits this field, so it must be treated as read-only.
    excel_settings: Dict[str, Any] = {}
class EpaIpmDatastore:
    """Helper for extracting EpaIpm dataframes from Datastore."""

    # One entry per table that get_dataframe can load. Two entries may point
    # at the same workbook and differ only in the sheet they select.
    SETTINGS = (
        TableSettings(
            table_name="transmission_single_epaipm",
            file="table_3-21_annual_transmission_capabilities_of_u.s._model_regions_in_epa_platform_v6_-_2021.xlsx",
            excel_settings=dict(
                skiprows=3,
                usecols='B:F',
                index_col=[0, 1],
            ),
        ),
        TableSettings(
            table_name="transmission_joint_epaipm",
            file="table_3-5_transmission_joint_ipm.csv",
        ),
        TableSettings(
            table_name="load_curves_epaipm",
            file="table_2-2_load_duration_curves_used_in_epa_platform_v6.xlsx",
            excel_settings=dict(
                skiprows=3,
                usecols='B:AB',
            ),
        ),
        TableSettings(
            table_name="plant_region_map_epaipm_active",
            file="needs_v6_november_2018_reference_case_0.xlsx",
            excel_settings=dict(
                sheet_name='NEEDS v6_Active',
                usecols='C,I',
            ),
        ),
        TableSettings(
            table_name="plant_region_map_epaipm_retired",
            file="needs_v6_november_2018_reference_case_0.xlsx",
            excel_settings=dict(
                sheet_name='NEEDS v6_Retired_Through2021',
                usecols='C,I',
            ),
        ),
    )

    def __init__(self, datastore: Datastore):
        """Creates new instance of EpaIpmDatastore wrapper."""
        self.datastore = datastore

    def get_table_settings(self, table_name: str) -> TableSettings:
        """
        Returns TableSettings for a given table_name.

        Args:
            table_name: the table_name of one of the entries in SETTINGS.

        Returns:
            The first entry of SETTINGS whose table_name matches.

        Raises:
            ValueError: if no entry of SETTINGS has the given table_name.
        """
        for s in self.SETTINGS:
            if s.table_name == table_name:
                return s
        raise ValueError(
            f"Table {table_name} not found in EpaIpmDatastore.SETTINGS.")

    def get_dataframe(self, table_name: str) -> pd.DataFrame:
        """
        Retrieve the specified table from the epaipm archive.

        The file backing the table is fetched from the datastore and parsed
        with the pandas reader matching its extension.

        Args:
            table_name: the table_name of one of the entries in SETTINGS.

        Returns:
            Pandas dataframe of EPA IPM data.

        Raises:
            ValueError: if table_name is not in SETTINGS, or if the file
                extension is neither .xlsx nor .csv.
        """
        table_settings = self.get_table_settings(table_name)
        # Wrap the fetched resource in an in-memory binary buffer so the
        # pandas readers can consume it like a file.
        f = io.BytesIO(self.datastore.get_unique_resource(
            "epaipm",
            name=table_settings.file))
        path = Path(table_settings.file)
        if path.suffix == ".xlsx":
            # The settings must be unpacked as keyword arguments: passed
            # positionally, the whole dict would be taken as sheet_name and
            # skiprows/usecols/index_col would never be applied.
            return pd.read_excel(f, **table_settings.excel_settings)
        if path.suffix == ".csv":
            return pd.read_csv(f)
        raise ValueError(f"{path.suffix}: unknown file format for {path}")