Source code for pudl.output.export

"""Routines for exporting data from PUDL for use elsewhere.

Function names should be indicative of the format of the thing that's being
exported (e.g. CSV, Excel spreadsheets, parquet files, HDF5).
"""

import logging

import pandas as pd

import pudl

logger = logging.getLogger(__name__)


[docs]def annotated_xlsx(df, notes_dict, tags_dict, first_cols, sheet_name, xlsx_writer): """Outputs an annotated spreadsheet workbook based on compiled dataframes. Creates annotation tab and header rows for EIA 860, EIA 923, and FERC 1 fields in a dataframe. This is done using an Excel Writer object, which must be created and saved outside the function, thereby allowing multiple sheets and associated annotations to be compiled in the same Excel file. Args: df (pandas.DataFrame): The dataframe for which annotations are being created notes_dict (dict): dictionary with column names as keys and long annotations as values tags_dict (dict): dictionary of dictionaries with tag categories as keys for outer dictionary and values are dictionaries with column names as keys and values are tag within the tag category first_cols (list): ordered list of columns that should come first in outfile sheet_name (string): name of data sheet in output spreadsheet xlsx_writer (pandas.ExcelWriter): this is an ExcelWriter object used to accumulate multiple tabs, which must be created outside of function, before calling the first time e.g. "xlsx_writer = pd.ExcelWriter('outfile.xlsx')" Returns: xlsx_writer (pandas.ExcelWriter): which must be called outside the function, after final use of function, for writing out to excel: "xlsx_writer.save()" """ first_cols = [c for c in first_cols if c in df.columns] df = pudl.helpers.organize_cols(df, first_cols) # Transpose the original dataframe to easily add and map tags as columns dfnew = df.transpose() # For loop where tag is metadata field (e.g. data_source or data_origin) & # column is a nested dictionary of column name & value; maps tags_dict to # columns in df and creates a new column for each tag category for tag, column_dict in tags_dict.items(): dfnew[tag] = dfnew.index.to_series().map(column_dict) # Take the new columns that were created for each tag category and add them # to the index for tag, column_dict in tags_dict.items(): dfnew = dfnew.set_index([tag], append=True) # Transpose to return data fields to columns dfnew = dfnew.transpose() # Create an excel sheet for the data frame dfnew.to_excel(xlsx_writer, sheet_name=str(sheet_name), na_rep='NA') # Convert notes dictionary into a pandas series notes = pd.Series(notes_dict, name='notes') notes.index.name = 'field_name' # Create an excel sheet of the notes_dict notes.to_excel(xlsx_writer, sheet_name=str(sheet_name) + '_notes', na_rep='NA') # ZS: Don't think we *need* to return the xlsx_writer object here, because # any alternations we've made are stored within the object -- and its scope # exists beyond this function (since it was passed in). # If we *are* going to return the xlsx_writer, then we should probably be # making copy of it up front and not alter the one that's passed in. Either # idiom is common, but the mix of both might be confusing. # Return the xlsx_writer object, which can be written out, outside of # function, with 'xlsx_writer.save()' return xlsx_writer