"""Generic extractor for all FERC XBRL data."""
import io
from datetime import date
from pathlib import Path
from dagster import Field, Noneable, op
from ferc_xbrl_extractor.cli import run_main
import pudl
from pudl.settings import FercGenericXbrlToSqliteSettings, XbrlFormNumber
from pudl.workspace.datastore import Datastore
from pudl.workspace.setup import PudlPaths
# Module-level logger, named after this module via PUDL's logging helper.
logger = pudl.logging_helpers.get_logger(__name__)
class FercXbrlDatastore:
    """Thin wrapper around a datastore for fetching FERC XBRL resources.

    Every request is delegated to the wrapped datastore's
    ``get_unique_resource`` method; the raw bytes it returns are wrapped in
    :class:`io.BytesIO` objects before being handed back.
    """

    # Maps a filing year to the date of the taxonomy used for that year. The
    # date's year selects the taxonomy archive and its ISO form appears in the
    # entry point path built by ``get_taxonomy``.
    _TAXONOMY_DATES = {2021: date(2022, 1, 1), 2022: date(2022, 1, 1)}

    def __init__(self, datastore: Datastore):
        """Store the datastore that resource requests are delegated to."""
        self.datastore = datastore

    def get_taxonomy(self, year: int, form: XbrlFormNumber) -> tuple[io.BytesIO, str]:
        """Return the taxonomy archive and the entry point path inside it.

        Args:
            year: Filing year whose taxonomy is wanted.
            form: FERC form; its ``value`` is used to build the dataset name
                and the entry point path.

        Returns:
            A tuple of the taxonomy archive wrapped in a ``BytesIO`` and the
            path of the taxonomy entry point (an ``.xsd`` file) within it.

        Raises:
            KeyError: If no taxonomy date is configured for ``year``.
        """
        try:
            taxonomy_date = self._TAXONOMY_DATES[year]
        except KeyError:
            # Still a KeyError (as before), but say which year was missing
            # and which ones are available instead of surfacing a bare key.
            raise KeyError(
                f"No XBRL taxonomy date is configured for year {year}; "
                f"supported years: {sorted(self._TAXONOMY_DATES)}"
            ) from None

        form_number = form.value
        raw_archive = self.datastore.get_unique_resource(
            f"ferc{form_number}",
            year=taxonomy_date.year,
            data_format="xbrl_taxonomy",
        )

        # str(date) and date.isoformat() produce the same text, so a single
        # ISO string is used for both occurrences in the path.
        iso_date = taxonomy_date.isoformat()
        taxonomy_entry_point = (
            f"taxonomy/form{form_number}/{iso_date}/form/form{form_number}/"
            f"form-{form_number}_{iso_date}.xsd"
        )
        return io.BytesIO(raw_archive), taxonomy_entry_point

    def get_filings(self, year: int, form: XbrlFormNumber) -> io.BytesIO:
        """Return the archive of XBRL filings for ``form`` and ``year``.

        Args:
            year: Filing year to fetch.
            form: FERC form; its ``value`` is used to build the dataset name.

        Returns:
            The filings archive wrapped in a ``BytesIO``.
        """
        raw_archive = self.datastore.get_unique_resource(
            f"ferc{form.value}", year=year, data_format="xbrl"
        )
        return io.BytesIO(raw_archive)
@op(
    config_schema={
        "clobber": Field(
            bool, description="Clobber existing ferc1 database.", default_value=False
        ),
        "workers": Field(
            Noneable(int),
            description="Specify number of worker processes for parsing XBRL filings.",
            default_value=None,
        ),
        "batch_size": Field(
            int,
            description="Specify number of XBRL instances to be processed at a time (defaults to 50)",
            default_value=50,
        ),
    },
    required_resource_keys={"ferc_to_sqlite_settings", "datastore"},
)
def xbrl2sqlite(context) -> None:
    """Convert each configured FERC XBRL form into its own SQLite database.

    Iterates over every member of ``XbrlFormNumber``. Forms that have no
    settings, or whose settings are marked ``disabled``, are skipped. For the
    remaining forms the target database path is resolved, an existing file is
    removed when ``clobber`` is set, and the conversion is delegated to
    ``convert_form``.

    Args:
        context: Dagster op context. ``op_config`` supplies ``clobber``,
            ``batch_size`` and ``workers``; ``resources`` supplies
            ``ferc_to_sqlite_settings`` and ``datastore``.

    Raises:
        RuntimeError: If a form's database already exists and ``clobber`` is
            False. The check runs per form inside the loop, so forms handled
            earlier in the iteration have already been converted by then.
    """
    output_path = PudlPaths().output_dir
    clobber = context.op_config["clobber"]
    batch_size = context.op_config["batch_size"]
    workers = context.op_config["workers"]

    ferc_to_sqlite_settings = context.resources.ferc_to_sqlite_settings
    xbrl_datastore = FercXbrlDatastore(context.resources.datastore)

    # Loop through every XBRL form and convert the ones that are configured.
    for form in XbrlFormNumber:
        settings = ferc_to_sqlite_settings.get_xbrl_dataset_settings(form)

        # No settings for this form means there is nothing to convert.
        if settings is None:
            continue

        if settings.disabled:
            # Use form.value so the logged name matches the ferc{N}_xbrl
            # database name built below.
            logger.info(f"Dataset ferc{form.value}_xbrl is disabled, skipping")
            continue

        sql_path = PudlPaths().sqlite_db_path(f"ferc{form.value}_xbrl")

        if sql_path.exists():
            if not clobber:
                raise RuntimeError(
                    f"Found existing DB at {sql_path} and clobber was set to False. Aborting."
                )
            sql_path.unlink()

        convert_form(
            settings,
            form,
            xbrl_datastore,
            output_path=output_path,
            sql_path=sql_path,
            batch_size=batch_size,
            workers=workers,
        )