Source code for pudl.analysis.ml_tools.models

"""Provides tooling for developing/tracking ml models within PUDL.

The main interface from this module is the :func:`pudl_model` decorator, which
is meant to be applied to a dagster `graph`. This decorator will handle finding all
configuration for a model/passing configuration to dagster, creating an
:class:`ExperimentTracker` for the model, and ultimately will return a `graph_asset`
from the model.

There are a few different ways to provide configuration for a PUDL model. First, configuration will come from default values for any dagster `Config`'s which are associated
with `op`'s which make up the model `graph`. For more info on dagster configuration,
see https://docs.dagster.io/concepts/configuration/config-schema. The next way to
provide configuration is through the yaml file: `pudl.package_data.settings.pudl_models.yml`.
Any configuration in this file should follow dagster's config-schema formatting,
see the `ferc_to_ferc` entry as an example. Configuration provided this way will
override any default values. The final way to provide configuration is through the
dagster UI. To provide configuration this way, click `Open Launchpad` in the UI and
edit the values there. This configuration will override both default values and
yaml configuration, but will only be used for a single run.
"""

import importlib

import yaml
from dagster import (
    AssetIn,
    AssetsDefinition,
    GraphDefinition,
    OpDefinition,
    graph_asset,
)

import pudl

from . import experiment_tracking

# Logger obtained from PUDL's logging helpers for this module's name.
logger = pudl.logging_helpers.get_logger(__name__)

# Registry of model configuration keyed by asset name. Entries are written by the
# ``pudl_model`` decorator each time it is applied to a model graph.
MODEL_CONFIGURATION: dict[str, dict] = {}
def get_yml_config(experiment_name: str) -> dict:
    """Load the configuration for one model from the packaged yaml settings file.

    Reads ``pudl_models.yml`` from the ``pudl.package_data.settings`` package and
    extracts the top-level entry named ``experiment_name``.

    Args:
        experiment_name: Top-level key to look up in the yaml file.

    Returns:
        A single-item dictionary mapping ``experiment_name`` to its configuration.

    Raises:
        RuntimeError: If the file has no entry for ``experiment_name`` (including
            when the file is empty) or the entry itself is empty.
    """
    # ``import importlib`` alone does not guarantee that the ``resources``
    # submodule has been loaded, so import it explicitly before using it.
    import importlib.resources

    config_file = (
        importlib.resources.files("pudl.package_data.settings") / "pudl_models.yml"
    )
    # Use a context manager so the file handle is closed deterministically.
    with config_file.open("r", encoding="utf-8") as yml_file:
        # An empty yaml document parses to None; treat it as having no entries so
        # the lookup below raises the intended RuntimeError.
        config = yaml.safe_load(yml_file) or {}

    if not (model_config := config.get(experiment_name)):
        raise RuntimeError(f"No {experiment_name} entry in {config_file}")

    return {experiment_name: model_config}
def get_default_config(model_graph: GraphDefinition) -> dict:
    """Assemble the default configuration dictionary for a model graph.

    Args:
        model_graph: The graph whose nodes are inspected for default config.

    Returns:
        A dictionary with two entries: one keyed by the graph's name, holding the
        nested defaults collected from its nodes, and one keyed by
        ``"<graph name>_tracker"``, holding a dumped default
        ``ExperimentTrackerConfig`` under ``"config"``.
    """

    def _get_default_from_ops(node: OpDefinition | GraphDefinition):
        """Return defaults for ``node``, recursing into the children of a graph."""
        if not isinstance(node, GraphDefinition):
            # Leaf node: use the schema's default if it has one, otherwise None.
            schema = node.config_schema
            default = schema.default_value if schema.default_provided else None
            return {"config": default}

        # Graph node: collect the defaults of every child under "ops".
        child_defaults = {}
        for child_node in node.node_defs:
            child_defaults[child_node.name] = _get_default_from_ops(child_node)
        return {"ops": child_defaults}

    graph_name = model_graph.name
    return {
        graph_name: _get_default_from_ops(model_graph),
        f"{graph_name}_tracker": {
            "config": experiment_tracking.ExperimentTrackerConfig().model_dump()
        },
    }
def pudl_model(asset_name: str, config_from_yaml: bool = False) -> AssetsDefinition:
    """Build a decorator that turns a model graph into a configured graph asset.

    Args:
        asset_name: Name given to the resulting graph asset, and the key under
            which the model's configuration is stored in ``MODEL_CONFIGURATION``.
        config_from_yaml: If True, the entry returned by :func:`get_yml_config`
            for the graph's name is merged over the default configuration.

    Returns:
        A decorator that accepts a ``GraphDefinition`` and returns the graph asset
        built from it. The return annotation names the result of applying that
        decorator.
    """

    def _decorator(model_graph: GraphDefinition):
        # Start from the defaults, then let yaml entries replace matching keys.
        resolved_config = get_default_config(model_graph)
        if config_from_yaml:
            resolved_config.update(get_yml_config(model_graph.name))

        MODEL_CONFIGURATION[asset_name] = {"ops": resolved_config}

        # Inputs should come from assets except experiment tracker
        asset_inputs = {}
        for input_name in model_graph.input_dict:
            if input_name == "experiment_tracker":
                continue
            asset_inputs[input_name] = AssetIn(input_name)

        # NOTE(review): deliberately left without a docstring; dagster may use a
        # decorated function's docstring as the asset description -- confirm
        # before adding one.
        @graph_asset(name=asset_name, ins=asset_inputs)
        def model_asset(**kwargs):
            # Build the tracker from the factory, invoke it, and hand the result
            # to the model graph as its first argument.
            make_tracker = experiment_tracking.experiment_tracker_factory(
                experiment_name=model_graph.name,
                model_config=resolved_config,
            )
            experiment_tracker = make_tracker()
            return model_graph(experiment_tracker, **kwargs)

        return model_asset

    return _decorator