Source code for pudl.workspace.datastore_cli

"""
A CLI for fetching public utility data from reporting agency servers.

This script will generate a datastore on a datastore directory. By default, the
directory will end up in wherever you have designated "PUDL_IN" in the settings
file $HOME/.pudl.yml. You can use this script to specific only specific datasets
to download, only specific years or states but by default, it will grab
everything. A populated datastore is required to use other PUDL tools, like the
ETL script (`pudl_etl`) and all of the post-ETL processes.
"""

import argparse
import logging
import pathlib
import sys
import warnings

import coloredlogs

import pudl
import pudl.constants as pc
from pudl.workspace import datastore


def parse_command_line(argv):
    """
    Parse command line arguments. See the -h option for more details.

    Args:
        argv (list): Command line arguments, which must include the caller
            filename as the first element.

    Returns:
        argparse.Namespace: Parsed command line arguments and their values.

    """
    parser = argparse.ArgumentParser(description=__doc__)

    # This is necessary because on the Windows console "None" doesn't end up
    # getting encoded correctly to print at the console. Somehow.
    default_pudl_in = pudl.workspace.setup.get_defaults()["pudl_in"]
    if default_pudl_in is None:
        default_pudl_in = "None"

    parser.add_argument(
        '-q',
        '--quiet',
        dest='verbose',
        action='store_false',
        help="Quiet mode. Suppress download progress indicators and warnings.",
        default=True
    )
    parser.add_argument(
        '-z',
        '--zip',
        dest='unzip',
        action='store_false',
        help="Do not unzip downloaded data files.",
        default=True
    )
    parser.add_argument(
        '-c',
        '--clobber',
        action='store_true',
        help="Clobber existing zipfiles in the datastore if they exist.",
        default=False
    )
    parser.add_argument(
        '-d',
        '--datastore_dir',
        type=str,
        help="""Directory where the datastore should be located.
        (default: %(default)s).""",
        default=default_pudl_in,
    )
    parser.add_argument(
        '-s',
        '--sources',
        nargs='+',
        choices=pc.data_sources,
        help="""List of data sources which should be downloaded.
        (default: %(default)s).""",
        default=pc.data_sources
    )
    parser.add_argument(
        '-y',
        '--years',
        dest='years',
        nargs='+',
        help="""List of years for which data should be downloaded. Different
        data sources have different valid years. If data is not available for
        a specified year and data source, it will be ignored. If no years are
        specified, all available data will be downloaded for all requested
        data sources.""",
        default=[]
    )
    parser.add_argument(
        '--no_download',
        '-n',
        action='store_false',
        dest='download',
        help="""Do not attempt to download fresh data from the original
        sources. Instead assume that the zipfiles or other original data are
        already present, and organize them locally.""",
        default=True
    )
    parser.add_argument(
        '-t',
        '--states',
        nargs='+',
        choices=pc.cems_states.keys(),
        type=str.upper,
        help="""List of two-letter US state abbreviations indicating which
        states' data should be downloaded. Currently only applicable to the
        EPA's CEMS dataset.""",
        default=pc.cems_states.keys()
    )
    arguments = parser.parse_args(argv[1:])
    return arguments
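# Illustrative sketch, not part of the original module: parse_command_line()
# expects the full argv list (program name included, since it slices argv[1:]),
# so it can be exercised with a synthetic argv, e.g. in a test. The source
# name "eia860" is an example and assumes it appears in pc.data_sources.
def _example_parse_command_line():
    argv = ["datastore_cli", "--sources", "eia860", "--years", "2016", "2017"]
    args = parse_command_line(argv)
    # Note: years come back as strings here; main() converts them with int().
    return args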
def main():
    """Manage and update the PUDL datastore."""
    # Display logged output from the PUDL package:
    logger = logging.getLogger(pudl.__name__)
    log_format = '%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s'
    coloredlogs.install(fmt=log_format, level='INFO', logger=logger)

    args = parse_command_line(sys.argv)

    # Generate a list of valid years of data to download for each data source.
    # If no years were specified, use the full set of valid years. If years
    # were specified, keep only the years which are valid for that data
    # source, and optionally output a message saying which years are being
    # ignored because they aren't valid.
    years_by_source = {}
    for source in args.sources:
        if not args.years:
            years_by_source[source] = pc.data_years[source]
        else:
            years_by_source[source] = [
                int(year) for year in args.years
                if int(year) in pc.data_years[source]
            ]
            if source == "epaipm":
                years_by_source[source] = pc.data_years[source]
                continue
            bad_years = [
                int(year) for year in args.years
                if int(year) not in pc.data_years[source]
            ]
            if args.verbose and bad_years:
                warnings.warn(f"Invalid {source} years ignored: {bad_years}.")

    datastore.parallel_update(
        sources=args.sources,
        years_by_source=years_by_source,
        states=args.states,
        data_dir=str(pathlib.Path(args.datastore_dir, "data")),
        clobber=args.clobber,
        unzip=args.unzip,
        dl=args.download,
    )
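# Minimal sketch of the year-filtering step in main(), using a hypothetical
# stand-in for pc.data_years so the behavior is visible in isolation:
# requested years a source doesn't cover are dropped and reported separately.
def _example_year_filter():
    data_years = {"eia923": tuple(range(2009, 2019))}  # hypothetical subset
    requested = ["2017", "2018", "2025"]
    valid = [int(y) for y in requested if int(y) in data_years["eia923"]]
    bad = [int(y) for y in requested if int(y) not in data_years["eia923"]]
    return valid, bad  # -> ([2017, 2018], [2025])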
if __name__ == '__main__':
    sys.exit(main())