From 1cdbc49653dd86dfc8d94b0cdadc94214c14d812 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Tue, 17 Mar 2026 18:20:25 +0000 Subject: [PATCH 1/9] #206: Updated `debug` mode --- README.md | 2 +- config.toml | 27 ++-- main.py | 14 +- python/ase.py | 20 ++- python/employment.py | 12 +- python/hh_characteristics.py | 21 ++- python/hs_hh.py | 13 +- python/parsers.py | 241 +++++++++++------------------------ python/pop_type.py | 20 ++- python/staging.py | 7 +- python/startup.py | 10 +- python/utils.py | 11 +- 12 files changed, 175 insertions(+), 223 deletions(-) diff --git a/README.md b/README.md index 1dda0a3..9c8fc1d 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ server = "" # SQL instance containing GIS database database = "" # database within instance containing GIS datasets (GQ/LUDU) [sql] -staging = "" # unconditional network folder path visible to SQL instance for BULK INSERT +staging = '' # unconditional network folder path visible to SQL instance for BULK INSERT ``` ## Running diff --git a/config.toml b/config.toml index f195bbc..f6c7e18 100644 --- a/config.toml +++ b/config.toml @@ -5,28 +5,19 @@ # The `run` section contains configuration for running every module of the Estimates # Program for a specified set of years [run] -enabled = true +enabled = false mgra = "mgra15" start_year = 2020 end_year = 2024 version = "1.1.1-dev" comments = "Example comment" -# The `debug` section contains configuration for running a subset of modules of the -# Estimates Program for a given set of years. All parameters must be provided except for -# `run_id` and `comments`. If `run_id` is `null`, then a new `run_id` will be -# automatically created, similar to `run` mode +# The `debug` section contains configuration for running a single module for a single +# year based on the input data of an existing complete Estimates run. The input module +# string can be any of startup, housing_and_households, population, population_by_ase, +# household_characteristics, employment, or staging [debug] -enabled = false -run_id = -1 # -1 is interpreted as None -start_year = 2022 -end_year = 2023 -version = "1.1.1-dev" -comments = "" -startup = false -housing_and_households = false -population = false -population_by_ase = false -household_characteristics = false -employment = false -staging = false +enabled = true +run_id = 82 # The run_id for the released v24 Estimates +year = 2022 +module = "staging" diff --git a/main.py b/main.py index be83321..d91e08d 100644 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ if utils.RUN_INSTRUCTIONS["startup"]: utils.display_ascii_art("data/welcome.txt") logger.info("Running Startup module...\n") - startup.run_startup() + startup.run_startup(debug=utils.DEBUG) # Loop through the years first for year in utils.RUN_INSTRUCTIONS["years"]: @@ -41,27 +41,27 @@ # Housing and Households module if utils.RUN_INSTRUCTIONS["housing_and_households"]: logger.info("Running Housing and Households module...") - hs_hh.run_hs_hh(year) + hs_hh.run_hs_hh(year, debug=utils.DEBUG) # Population module if utils.RUN_INSTRUCTIONS["population"]: logger.info("Running Population module...") - pop.run_pop(year) + pop.run_pop(year, debug=utils.DEBUG) # Population by Age/Sex/Ethnicity module if utils.RUN_INSTRUCTIONS["population_by_ase"]: logger.info("Running Population by Age/Sex/Ethnicity module...") - ase.run_ase(year) + ase.run_ase(year, debug=utils.DEBUG) # Household Characteristics module if utils.RUN_INSTRUCTIONS["household_characteristics"]: logger.info("Running Household Characteristics module...") - hh_characteristics.run_hh_characteristics(year) + hh_characteristics.run_hh_characteristics(year, debug=utils.DEBUG) # Employment module if utils.RUN_INSTRUCTIONS["employment"]: logger.info("Running Employment module...") - employment.run_employment(year) + employment.run_employment(year, debug=utils.DEBUG) # Diagnostic print for this year logger.info(f"Finished running {year}\n") @@ -70,7 +70,7 @@ # [metadata].[run] table if utils.RUN_INSTRUCTIONS["staging"]: logger.info("Running Staging module...") - staging.run_staging() + staging.run_staging(debug=utils.DEBUG) # Final print for completion logger.info("Completed") diff --git a/python/ase.py b/python/ase.py index 246bac0..e2a15a5 100644 --- a/python/ase.py +++ b/python/ase.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -def run_ase(year: int) -> None: +def run_ase(year: int, debug: bool) -> None: """Orchestrator function for age/sex/ethnicity population by type. Creates regional age/sex/ethnicity controls by population type. Then @@ -69,7 +69,7 @@ def run_ase(year: int) -> None: controls_outputs = _create_controls(controls_inputs) _validate_controls_outputs(controls_outputs) - _insert_controls(controls_outputs) + _insert_controls(controls_outputs, debug) # Calculate MGRA age/sex/ethnicity population by population type ase_inputs = _get_ase_inputs(year) @@ -78,7 +78,7 @@ def run_ase(year: int) -> None: ase_outputs = _create_ase(year, ase_inputs) _validate_ase_outputs(ase_outputs) - _insert_ase(ase_outputs) + _insert_ase(ase_outputs, debug) @functools.lru_cache(maxsize=1) @@ -245,8 +245,13 @@ def _validate_controls_outputs(controls_outputs: pd.DataFrame) -> None: ) -def _insert_controls(controls_outputs: pd.DataFrame) -> None: +def _insert_controls(controls_outputs: pd.DataFrame, debug: bool) -> None: """Insert regional age/sex/ethnicity controls to database.""" + + # Skip insertion if running in debug mode + if debug: + return + with utils.ESTIMATES_ENGINE.connect() as con: controls_outputs.to_sql( name="controls_ase", @@ -915,8 +920,13 @@ def _validate_ase_outputs(ase_outputs: dict[str, pd.DataFrame]) -> None: ) -def _insert_ase(ase_outputs: dict[str, pd.DataFrame]) -> None: +def _insert_ase(ase_outputs: dict[str, pd.DataFrame], debug: bool) -> None: """Insert age/sex/ethnicity population by type to database.""" + + # Skip insertion if running in debug mode + if debug: + return + for pop_type, output in ase_outputs.items(): logger.info("Loading Estimates for " + pop_type) diff --git a/python/employment.py b/python/employment.py index 49b3394..adc9edb 100644 --- a/python/employment.py +++ b/python/employment.py @@ -11,7 +11,7 @@ generator = np.random.default_rng(utils.RANDOM_SEED) -def run_employment(year: int): +def run_employment(year: int, debug: bool): """Control function to create jobs data by naics_code (NAICS) at the MGRA level. Get the LEHD LODES data, aggregate to the MGRA level using the block to MGRA @@ -37,7 +37,7 @@ def run_employment(year: int): jobs_outputs = _create_jobs_output(jobs_inputs) _validate_jobs_outputs(jobs_outputs) - _insert_jobs(jobs_inputs, jobs_outputs) + _insert_jobs(jobs_inputs, jobs_outputs, debug) def _get_lodes_data(year: int) -> pd.DataFrame: @@ -268,10 +268,16 @@ def _validate_jobs_outputs(jobs_outputs: dict[str, pd.DataFrame]) -> None: def _insert_jobs( - jobs_inputs: dict[str, pd.DataFrame], jobs_outputs: dict[str, pd.DataFrame] + jobs_inputs: dict[str, pd.DataFrame], + jobs_outputs: dict[str, pd.DataFrame], + debug: bool, ) -> None: """Insert input and output data related to jobs to the database.""" + # Skip insertion if running in debug mode + if debug: + return + # Insert input and output data to database with utils.ESTIMATES_ENGINE.connect() as con: diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py index 05834a2..87c3400 100644 --- a/python/hh_characteristics.py +++ b/python/hh_characteristics.py @@ -12,7 +12,7 @@ generator = np.random.default_rng(utils.RANDOM_SEED) -def run_hh_characteristics(year: int) -> None: +def run_hh_characteristics(year: int, debug: bool) -> None: """Orchestrator function to calculate and insert household characteristics. The exact household characteristics created are: @@ -51,7 +51,7 @@ def run_hh_characteristics(year: int) -> None: hh_income_outputs = _create_hh_income(hh_income_inputs) _validate_hh_income_outputs(hh_income_outputs) - _insert_hh_income(hh_income_inputs, hh_income_outputs) + _insert_hh_income(hh_income_inputs, hh_income_outputs, debug) # Then do households by size hh_size_inputs = _get_hh_size_inputs(year) @@ -60,7 +60,7 @@ def run_hh_characteristics(year: int) -> None: hh_size_outputs = _create_hh_size(hh_size_inputs) _validate_hh_size_outputs(hh_size_outputs) - _insert_hh_size(hh_size_inputs, hh_size_outputs) + _insert_hh_size(hh_size_inputs, hh_size_outputs, debug) def _get_hh_income_inputs(year: int) -> dict[str, pd.DataFrame]: @@ -417,8 +417,14 @@ def _validate_hh_size_outputs(hh_size_outputs: dict[str, pd.DataFrame]) -> None: def _insert_hh_income( hh_income_inputs: dict[str, pd.DataFrame], hh_income_outputs: dict[str, pd.DataFrame], + debug: bool, ) -> None: """Insert hh characteristics and tract level controls to database""" + + # Skip insertion if running in debug mode + if debug: + return + with utils.ESTIMATES_ENGINE.connect() as con: hh_income_inputs["hh_income_tract_controls"][ ["run_id", "year", "tract", "income_category", "value"] @@ -446,9 +452,16 @@ def _insert_hh_income( def _insert_hh_size( - hh_size_inputs: dict[str, pd.DataFrame], hh_size_outputs: dict[str, pd.DataFrame] + hh_size_inputs: dict[str, pd.DataFrame], + hh_size_outputs: dict[str, pd.DataFrame], + debug: bool, ) -> None: """Insert hh characteristics and tract level controls to database""" + + # Skip insertion if running in debug mode + if debug: + return + with utils.ESTIMATES_ENGINE.connect() as con: hh_size_inputs["hhs_tract_controls"].rename( columns={"household_size": "metric"} diff --git a/python/hs_hh.py b/python/hs_hh.py index 10e0a43..7e4fa2f 100644 --- a/python/hs_hh.py +++ b/python/hs_hh.py @@ -12,7 +12,7 @@ generator = np.random.default_rng(utils.RANDOM_SEED) -def run_hs_hh(year: int) -> None: +def run_hs_hh(year: int, debug: bool) -> None: """Orchestrator function to calculate and insert housing stock and households. Inserts housing stock by MGRA from SANDAG's LUDU database for a given year @@ -48,7 +48,7 @@ def run_hs_hh(year: int) -> None: hs_hh_outputs = _create_hs_hh(hs_hh_inputs) _validate_hs_hh_outputs(hs_hh_outputs) - _insert_hs_hh(hs_hh_inputs, hs_hh_outputs) + _insert_hs_hh(hs_hh_inputs, hs_hh_outputs, debug) def _calculate_hh_adjustment(households: int, housing_stock: int) -> int: @@ -239,9 +239,16 @@ def _validate_hs_hh_outputs(hs_hh_outputs: dict[str, pd.DataFrame]) -> None: def _insert_hs_hh( - hs_hh_inputs: dict[str, pd.DataFrame], hs_hh_outputs: dict[str, pd.DataFrame] + hs_hh_inputs: dict[str, pd.DataFrame], + hs_hh_outputs: dict[str, pd.DataFrame], + debug: bool, ) -> None: """Insert occupancy controls and households results to database.""" + + # Skip insertion if running in debug mode + if debug: + return + with utils.ESTIMATES_ENGINE.connect() as con: hs_hh_inputs["hs"].drop(columns=["tract", "city"]).to_sql( name="hs", diff --git a/python/parsers.py b/python/parsers.py index be8abc3..32ebc93 100644 --- a/python/parsers.py +++ b/python/parsers.py @@ -1,6 +1,16 @@ import cerberus import sqlalchemy as sql +_MODULES = [ + "startup", + "housing_and_households", + "population", + "population_by_ase", + "household_characteristics", + "employment", + "staging", +] + class InputParser: """A class to parse and validate input configurations. @@ -45,6 +55,7 @@ def __init__(self, config: dict, engine: sql.Engine) -> None: self.run_instructions = {} self.run_id = None self.mgra_version = None + self.debug = False def parse_config(self) -> None: """Control flow to parse the runtime configuration @@ -60,10 +71,6 @@ def parse_config(self) -> None: Returns: None """ - # Convert -1 to None for run_id (TOML doesn't support null/None) - if self._config.get("debug", {}).get("run_id") == -1: - self._config["debug"]["run_id"] = None - self._validate_config() self.run_id = self._parse_run_id() self.mgra_version = self._parse_mgra_version() @@ -77,42 +84,37 @@ def parse_config(self) -> None: self._config["run"]["end_year"] + 1, ) ) - for key in [ - "startup", - "housing_and_households", - "population", - "population_by_ase", - "household_characteristics", - "employment", - "staging", - ]: + for key in _MODULES: self.run_instructions[key] = True elif self._config["debug"]["enabled"]: - self.run_instructions["years"] = list( - range( - self._start_year, - self._end_year + 1, - ) - ) - for key in [ - "startup", - "housing_and_households", - "population", - "population_by_ase", - "household_characteristics", - "employment", - "staging", - ]: - self.run_instructions[key] = self._config["debug"][key] - - def _check_run_id(self, run_id: int) -> None: - """Check if supplied run id exists in the database.""" + self.debug = True + self.run_instructions["years"] = [self._start_year] + for key in _MODULES: + self.run_instructions[key] = key == self._config["debug"]["module"] + + def _check_run_id(self, run_id: int, complete: bool = False) -> None: + """Check if supplied run id exists in the database. + + Args: + run_id: The [run_id] to check for + complete: Default False. If True, then only check for [run_id]s marked as + [complete] = 1. If False, don't check for [complete] status + + Return: + None + + Raises: + ValueError: If [run_id] does not exist in the database + """ with self._engine.connect() as con: # Ensure supplied run id exists in the database query = sql.text( - """ + f""" SELECT CASE WHEN EXISTS ( - SELECT [run_id] FROM [metadata].[run] WHERE run_id = :run_id + SELECT [run_id] + FROM [metadata].[run] + WHERE [run_id] = :run_id + {"AND [complete] = 1" if complete else ""} ) THEN 1 ELSE 0 END """ ) @@ -153,22 +155,16 @@ def _validate_config(self) -> None: "type": "dict", "schema": { "enabled": {"type": "boolean"}, - "run_id": {"type": "integer", "nullable": True}, - "start_year": {"type": "integer", "min": min_max_years[0]}, - "end_year": {"type": "integer", "max": min_max_years[1]}, - "version": { + "run_id": {"type": "integer"}, + "year": { + "type": "integer", + "min": min_max_years[0], + "max": min_max_years[1], + }, + "module": { "type": "string", - "allowed": versions, - "nullable": True, + "allowed": _MODULES, }, - "comments": {"type": "string", "nullable": True}, - "startup": {"type": "boolean"}, - "housing_and_households": {"type": "boolean"}, - "population": {"type": "boolean"}, - "population_by_ase": {"type": "boolean"}, - "household_characteristics": {"type": "boolean"}, - "employment": {"type": "boolean"}, - "staging": {"type": "boolean"}, }, }, } @@ -177,33 +173,19 @@ def _validate_config(self) -> None: raise ValueError(validator.errors) # Make sure our years are not travelling backwards in time - for run_type in ["run", "debug"]: - if self._config[run_type]["enabled"] and ( - self._config[run_type]["start_year"] - > self._config[run_type]["end_year"] - ): - raise ValueError( - f"Key 'start year' cannot be greater than key 'end year' in '{run_type}' settings" - ) - - # Check that if we are in debug mode and trying to re-use a 'run_id'... - if ( - self._config["debug"]["enabled"] - and self._config["debug"]["run_id"] is not None + if self._config["run"]["enabled"] and ( + self._config["run"]["start_year"] > self._config["run"]["end_year"] ): + raise ValueError( + f"Key 'start year' cannot be greater than key 'end year' in 'run' settings" + ) + + # Check that if we are in debug mode... + if self._config["debug"]["enabled"]: # That the provided 'run_id' is valid - self._check_run_id(self._config["debug"]["run_id"]) - - # That 'version', and 'comments' are null - for key in ["version", "comments"]: - if self._config["debug"][key] is not None: - raise ValueError( - f"If a debug 'run_id' is provided, then the debug key of " - f"'{key}' must be null" - ) - - # That the 'start_year' and 'end_year' values, conform with those already - # in [metadata].[run] + self._check_run_id(self._config["debug"]["run_id"], complete=True) + + # That the 'year' value conforms with those already in [metadata].[run] with self._engine.connect() as con: existing_start_year = con.execute( sql.text( @@ -211,14 +193,14 @@ def _validate_config(self) -> None: ), {"run_id": self._config["debug"]["run_id"]}, ).scalar() - if self._config["debug"]["start_year"] < existing_start_year: + if self._config["debug"]["year"] < existing_start_year: raise ValueError( - f"The provided debug 'start_year' of {self._config['debug']['start_year']} " + f"The provided debug 'year' of {self._config['debug']['year']} " f"is less than the [metadata].[run] 'start_year' of " f"{existing_start_year} for 'run_id' {self._config["debug"]["run_id"]}" ) else: - self._start_year = self._config["debug"]["start_year"] + self._start_year = self._config["debug"]["year"] with self._engine.connect() as con: existing_end_year = con.execute( @@ -227,79 +209,14 @@ def _validate_config(self) -> None: ), {"run_id": self._config["debug"]["run_id"]}, ).scalar() - if self._config["debug"]["end_year"] > existing_end_year: + if self._config["debug"]["year"] > existing_end_year: raise ValueError( - f"The provided debug 'end_year' of {self._config['debug']['end_year']} " + f"The provided debug 'year' of {self._config['debug']['year']} " f"is greater than the [metadata].[run] 'end_year' of " f"{existing_end_year} for 'run_id' {self._config["debug"]["run_id"]}" ) else: - self._end_year = self._config["debug"]["end_year"] - - # Check that in debug mode, if no 'run_id' is provided... - if self._config["debug"]["enabled"] and self._config["debug"]["run_id"] is None: - # That all of 'start_year', 'end_year', and 'version' are provided. Note - # that 'comments' can still be null - for key in ["start_year", "end_year", "version"]: - if self._config["debug"][key] is None: - raise ValueError( - f"If a debug 'run_id' is not provided, then the debug key of " - f"'{key}' must be provided" - ) - - # That the dependency chain of modules is correct - if self._config["debug"]["staging"]: - for key in [ - "startup", - "housing_and_households", - "population", - "population_by_ase", - "household_characteristics", - "employment", - ]: - if not self._config["debug"][key]: - raise ValueError( - f"Because debug key 'staging' is enabled, debug key " - f"'{key}' must also be enabled" - ) - if self._config["debug"]["household_characteristics"]: - for key in [ - "startup", - "housing_and_households", - "population", - "population_by_ase", - ]: - if not self._config["debug"][key]: - raise ValueError( - f"Because debug key 'household_characteristics' is " - f"enabled, debug key '{key}' must also be enabled" - ) - if self._config["debug"]["population_by_ase"]: - for key in ["startup", "housing_and_households", "population"]: - if not self._config["debug"][key]: - raise ValueError( - f"Because debug key 'population_by_ase' is enabled, " - f"debug key '{key}' must also be enabled" - ) - if self._config["debug"]["population"]: - for key in ["startup", "housing_and_households"]: - if not self._config["debug"][key]: - raise ValueError( - f"Because debug key 'population' is enabled, debug key " - f"'{key}' must also be enabled" - ) - if self._config["debug"]["housing_and_households"]: - if not self._config["debug"]["startup"]: - raise ValueError( - "Because debug key 'housing_and_households' is enabled, " - "debug key 'startup' must also be enabled" - ) - if self._config["debug"]["employment"]: - if not self._config["debug"]["startup"]: - raise ValueError( - "Because debug key 'employment' is enabled, " - "debug key 'startup' must also be enabled" - ) + self._end_year = self._config["debug"]["year"] def _parse_run_id(self) -> int: """Parse the run id from the configuration file. @@ -315,22 +232,12 @@ def _parse_run_id(self) -> int: Raises: ValueError: If any of the configuration values are invalid. """ - # Create a new run id if standard run mode is enabled, or if we are running a - # subset of Estimates via debug mode - if self._config["run"]["enabled"] or ( - self._config["debug"]["enabled"] and self._config["debug"]["run_id"] is None - ): + # Create a new run id if standard run mode is enabled + if self._config["run"]["enabled"]: with self._engine.connect() as con: - # Override default arguments if debug mode is enabled - if self._config["debug"]["enabled"]: - comments = self._config["debug"]["comments"] - self._start_year = self._config["debug"]["start_year"] - self._end_year = self._config["debug"]["end_year"] - else: - comments = self._config["run"]["comments"] - self._start_year = self._config["run"]["start_year"] - self._end_year = self._config["run"]["end_year"] + self._start_year = self._config["run"]["start_year"] + self._end_year = self._config["run"]["end_year"] # Create run id from the most recent run id in the database run_id = con.execute( @@ -372,23 +279,19 @@ def _parse_run_id(self) -> int: "start_year": self._start_year, "end_year": self._end_year, "version": self._config["run"]["version"], - "comments": comments, + "comments": self._config["run"]["comments"], }, ) # Commit the transaction con.commit() - # Return the valid 'run_id' - return run_id + # For debug mode, simply return the pre-selected [run_id] + else: + run_id = self._config["debug"]["run_id"] - # Use the supplied 'run_id' if debug mode is enabled. Note the existence of the - # 'run_id' has already been checked - if ( - self._config["debug"]["enabled"] - and self._config["debug"]["run_id"] is not None - ): - return self._config["debug"]["run_id"] + # Return the [run_id] this Estimates Program run is using + return run_id def _parse_mgra_version(self) -> str: """Parse the MGRA version from the configuration file.""" @@ -399,7 +302,7 @@ def _parse_mgra_version(self) -> str: # Get mgra version from database if debug mode is enabled elif self._config["debug"]["enabled"]: # Ensure run id exists in the database - self._check_run_id(run_id=self.run_id) + self._check_run_id(run_id=self.run_id, complete=True) with self._engine.connect() as con: query = sql.text( diff --git a/python/pop_type.py b/python/pop_type.py index 08ea6f6..106c74c 100644 --- a/python/pop_type.py +++ b/python/pop_type.py @@ -11,7 +11,7 @@ generator = np.random.default_rng(utils.RANDOM_SEED) -def run_pop(year: int): +def run_pop(year: int, debug: bool): """Control function to create population by type (GQ and HHP) data Get MGRA group quarters input data, create the output data, then load both into the @@ -51,7 +51,7 @@ def run_pop(year: int): gq_outputs = _create_gq_outputs(gq_inputs) _validate_gq_outputs(gq_outputs) - _insert_gq(gq_inputs, gq_outputs) + _insert_gq(gq_inputs, gq_outputs, debug) # Then do Household Population hhp_inputs = _get_hhp_inputs(year) @@ -60,7 +60,7 @@ def run_pop(year: int): hhp_outputs = _create_hhp_outputs(hhp_inputs) _validate_hhp_outputs(hhp_outputs) - _insert_hhp(hhp_inputs, hhp_outputs) + _insert_hhp(hhp_inputs, hhp_outputs, debug) def _get_gq_inputs(year: int) -> dict[str, pd.DataFrame]: @@ -165,10 +165,14 @@ def _validate_gq_outputs(gq_outputs: dict[str, pd.DataFrame]) -> None: def _insert_gq( - gq_inputs: dict[str, pd.DataFrame], gq_outputs: dict[str, pd.DataFrame] + gq_inputs: dict[str, pd.DataFrame], gq_outputs: dict[str, pd.DataFrame], debug: bool ) -> None: """Insert both input and output data for MGRA group quarters""" + # Skip insertion if running in debug mode + if debug: + return + # Insert controls and group quarters results to database with utils.ESTIMATES_ENGINE.connect() as con: gq_inputs["city_controls"].to_sql( @@ -382,10 +386,16 @@ def _validate_hhp_outputs(hhp_outputs: dict[str, pd.DataFrame]) -> None: def _insert_hhp( - hhp_inputs: dict[str, pd.DataFrame], hhp_outputs: dict[str, pd.DataFrame] + hhp_inputs: dict[str, pd.DataFrame], + hhp_outputs: dict[str, pd.DataFrame], + debug: bool, ) -> None: """Insert input and output data related to household population""" + # Skip insertion if running in debug mode + if debug: + return + # Insert input and output data to database with utils.ESTIMATES_ENGINE.connect() as con: hhp_inputs["city_controls"].to_sql( diff --git a/python/staging.py b/python/staging.py index ff4efd2..ddd458b 100644 --- a/python/staging.py +++ b/python/staging.py @@ -6,11 +6,16 @@ import python.utils as utils -def run_staging() -> None: +def run_staging(debug: bool) -> None: """Orchestrator function for the staging module Mark the run as being completed and update the end date. """ + + # Skip UPDATE if running in debug mode + if debug: + return + with utils.ESTIMATES_ENGINE.connect() as con: script = sql.text( f"UPDATE [metadata].[run] " diff --git a/python/startup.py b/python/startup.py index 3fe1d9f..63eec96 100644 --- a/python/startup.py +++ b/python/startup.py @@ -5,16 +5,20 @@ import python.utils as utils -def run_startup(): +def run_startup(debug: bool): """Control function to call the correct functions in the correct order""" # Startup requires no input data # Startup requires no processing of input data - _insert_outputs() + _insert_outputs(debug) -def _insert_outputs(): +def _insert_outputs(debug: bool): """Insert output data related to the Startup module""" + # Skip insertion if running in debug mode + if debug: + return + # Insert the MGRA geography with utils.ESTIMATES_ENGINE.connect() as con: with open(utils.SQL_FOLDER / "insert_mgra.sql") as file: diff --git a/python/utils.py b/python/utils.py index 7e8ac5b..160021b 100644 --- a/python/utils.py +++ b/python/utils.py @@ -104,16 +104,19 @@ RUN_INSTRUCTIONS = input_parser.run_instructions RUN_ID = input_parser.run_id MGRA_VERSION = input_parser.mgra_version +DEBUG = input_parser.debug -logger.info( - f"RUN_ID: {RUN_ID}, MGRA_VERSION: {MGRA_VERSION}, YEARS: {RUN_INSTRUCTIONS["years"]}" -) +logger.info(f"RUN_ID: {RUN_ID}") +logger.info(f"MGRA_VERSION: {MGRA_VERSION}") +logger.info(f"DEBUG: {DEBUG}") +logger.info(f"RUN_INSTRUCTIONS: {RUN_INSTRUCTIONS}") ############################## # UTILITY LISTS AND MAPPINGS # ############################## -RANDOM_SEED = 42 # Seed for random number generation +# For deterministic random number generation +RANDOM_SEED = 42 HOUSEHOLD_SIZES = list(range(1, 8)) From e0eada461814ab1028e6ae25a654ae4638c09d87 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Tue, 17 Mar 2026 20:51:31 +0000 Subject: [PATCH 2/9] #206: Implemented local output for `debug` mode --- .gitignore | 113 +++++++++++++++++++++++++------- config.toml | 8 +-- python/ase.py | 119 ++++++++++++++++++---------------- python/employment.py | 36 +++++++---- python/hh_characteristics.py | 122 ++++++++++++++++++++--------------- python/hs_hh.py | 75 +++++++++++---------- python/parsers.py | 9 ++- python/pop_type.py | 94 ++++++++++++++++----------- python/utils.py | 7 ++ 9 files changed, 360 insertions(+), 223 deletions(-) diff --git a/.gitignore b/.gitignore index 2e22898..ef21d49 100644 --- a/.gitignore +++ b/.gitignore @@ -1,10 +1,6 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so +############################### +# Project specific .gitignore # +############################### # Secrets file secrets.toml @@ -12,8 +8,22 @@ secrets.toml # Log file log.txt -# Testing Python files -test.py +# Debug output file +debug_output/ + +################################################################## +# Python .gitignore # +# https://github.com/github/gitignore/blob/main/Python.gitignore # +# Retrieved 2026-03-17 # +################################################################## + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so # Distribution / packaging .Python @@ -36,8 +46,8 @@ share/python-wheels/ MANIFEST # PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. *.manifest *.spec @@ -55,7 +65,7 @@ htmlcov/ nosetests.xml coverage.xml *.cover -*.py,cover +*.py.cover .hypothesis/ .pytest_cache/ cover/ @@ -101,22 +111,37 @@ ipython_config.py # However, in case of collaboration, if having platform-specific dependencies or dependencies # having no cross-platform support, pipenv may install dependencies that don't work, or not # install all needed dependencies. -#Pipfile.lock +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +uv.lock # poetry # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. # This is especially recommended for binary packages to ensure reproducibility, and is more # commonly ignored for libraries. # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock +# poetry.lock +# poetry.toml # pdm # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm __pypackages__/ @@ -125,11 +150,25 @@ __pypackages__/ celerybeat-schedule celerybeat.pid +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + # SageMath parsed files *.sage.py # Environments .env +.envrc .venv env/ venv/ @@ -162,11 +201,35 @@ dmypy.json cython_debug/ # PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. .idea/ -# VSCode settings -.vscode \ No newline at end of file +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +.vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/config.toml b/config.toml index f6c7e18..627f104 100644 --- a/config.toml +++ b/config.toml @@ -5,7 +5,7 @@ # The `run` section contains configuration for running every module of the Estimates # Program for a specified set of years [run] -enabled = false +enabled = true mgra = "mgra15" start_year = 2020 end_year = 2024 @@ -17,7 +17,7 @@ comments = "Example comment" # string can be any of startup, housing_and_households, population, population_by_ase, # household_characteristics, employment, or staging [debug] -enabled = true +enabled = false run_id = 82 # The run_id for the released v24 Estimates -year = 2022 -module = "staging" +year = 2020 +module = "" diff --git a/python/ase.py b/python/ase.py index e2a15a5..5cd9de6 100644 --- a/python/ase.py +++ b/python/ase.py @@ -248,19 +248,23 @@ def _validate_controls_outputs(controls_outputs: pd.DataFrame) -> None: def _insert_controls(controls_outputs: pd.DataFrame, debug: bool) -> None: """Insert regional age/sex/ethnicity controls to database.""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return - - with utils.ESTIMATES_ENGINE.connect() as con: - controls_outputs.to_sql( - name="controls_ase", - con=con, - schema="inputs", - if_exists="append", - index=False, + controls_outputs.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"ase_inputs_controls_ase.csv", index=False ) + # Otherwise, insert into the database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + controls_outputs.to_sql( + name="controls_ase", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) + def _get_seed_inputs(year: int) -> dict[str, pd.DataFrame]: """Get inputs required to generate census tract age/sex/ethnicity seed data.""" @@ -923,53 +927,58 @@ def _validate_ase_outputs(ase_outputs: dict[str, pd.DataFrame]) -> None: def _insert_ase(ase_outputs: dict[str, pd.DataFrame], debug: bool) -> None: """Insert age/sex/ethnicity population by type to database.""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return - - for pop_type, output in ase_outputs.items(): - logger.info("Loading Estimates for " + pop_type) - - # Write the DataFrame to a CSV file - csv_temp_location = utils.BULK_INSERT_STAGING / (pop_type + ".txt") - ( - output.loc[lambda df: df["value"] != 0][ - [ - "run_id", - "year", - "mgra", - "pop_type", - "age_group", - "sex", - "ethnicity", - "value", - ] - ].to_csv( - csv_temp_location, - header=False, - index=False, - sep="|", - quoting=csv.QUOTE_NONE, + for name, data in ase_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"ase_outputs_{name}.csv", index=False ) - ) - # Bulk insert the CSV file into the production database - with utils.ESTIMATES_ENGINE.connect() as con: - query = sql.text( - f""" - BULK INSERT [outputs].[ase] - FROM '{csv_temp_location.as_posix()}' - WITH ( - TABLOCK, - MAXERRORS=0, - FIELDTERMINATOR = '|', - ROWTERMINATOR = '0x0A', - CHECK_CONSTRAINTS - ) - """ + # Otherwise, load to database + else: + for pop_type, output in ase_outputs.items(): + logger.info("Loading Estimates for " + pop_type) + + # Write the DataFrame to a CSV file + csv_temp_location = utils.BULK_INSERT_STAGING / (pop_type + ".txt") + ( + output.loc[lambda df: df["value"] != 0][ + [ + "run_id", + "year", + "mgra", + "pop_type", + "age_group", + "sex", + "ethnicity", + "value", + ] + ].to_csv( + csv_temp_location, + header=False, + index=False, + sep="|", + quoting=csv.QUOTE_NONE, + ) ) - con.execute(query) - con.commit() - # Remove the temporary CSV file - csv_temp_location.unlink() + # Bulk insert the CSV file into the production database + with utils.ESTIMATES_ENGINE.connect() as con: + query = sql.text( + f""" + BULK INSERT [outputs].[ase] + FROM '{csv_temp_location.as_posix()}' + WITH ( + TABLOCK, + MAXERRORS=0, + FIELDTERMINATOR = '|', + ROWTERMINATOR = '0x0A', + CHECK_CONSTRAINTS + ) + """ + ) + con.execute(query) + con.commit() + + # Remove the temporary CSV file + csv_temp_location.unlink() diff --git a/python/employment.py b/python/employment.py index adc9edb..1aa92e3 100644 --- a/python/employment.py +++ b/python/employment.py @@ -274,21 +274,29 @@ def _insert_jobs( ) -> None: """Insert input and output data related to jobs to the database.""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in jobs_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"emp_inputs_{name}.csv", index=False + ) + for name, data in jobs_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"emp_outputs_{name}.csv", index=False + ) - # Insert input and output data to database - with utils.ESTIMATES_ENGINE.connect() as con: + # Otherwise, insert to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: - jobs_inputs["control_totals"].to_sql( - name="controls_jobs", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) + jobs_inputs["control_totals"].to_sql( + name="controls_jobs", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) - jobs_outputs["results"].to_sql( - name="jobs", con=con, schema="outputs", if_exists="append", index=False - ) + jobs_outputs["results"].to_sql( + name="jobs", con=con, schema="outputs", if_exists="append", index=False + ) diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py index 87c3400..924b641 100644 --- a/python/hh_characteristics.py +++ b/python/hh_characteristics.py @@ -421,34 +421,43 @@ def _insert_hh_income( ) -> None: """Insert hh characteristics and tract level controls to database""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in hh_income_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hh_char_inputs_{name}.csv", index=False + ) + for name, data in hh_income_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hh_char_outputs_{name}.csv", index=False + ) - with utils.ESTIMATES_ENGINE.connect() as con: - hh_income_inputs["hh_income_tract_controls"][ - ["run_id", "year", "tract", "income_category", "value"] - ].rename(columns={"income_category": "metric"}).assign( - metric=lambda df: "Income Category - " + df["metric"] - ).to_sql( - schema="inputs", - name="controls_tract", - if_exists="append", - con=con, - index=False, - ) + # Otherwise, load to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + hh_income_inputs["hh_income_tract_controls"][ + ["run_id", "year", "tract", "income_category", "value"] + ].rename(columns={"income_category": "metric"}).assign( + metric=lambda df: "Income Category - " + df["metric"] + ).to_sql( + schema="inputs", + name="controls_tract", + if_exists="append", + con=con, + index=False, + ) - hh_income_outputs["hh_income"][ - ["run_id", "year", "mgra", "income_category", "hh"] - ].rename(columns={"income_category": "metric", "hh": "value"}).assign( - metric=lambda df: "Income Category - " + df["metric"] - ).to_sql( - schema="outputs", - name="hh_characteristics", - if_exists="append", - con=con, - index=False, - ) + hh_income_outputs["hh_income"][ + ["run_id", "year", "mgra", "income_category", "hh"] + ].rename(columns={"income_category": "metric", "hh": "value"}).assign( + metric=lambda df: "Income Category - " + df["metric"] + ).to_sql( + schema="outputs", + name="hh_characteristics", + if_exists="append", + con=con, + index=False, + ) def _insert_hh_size( @@ -458,33 +467,42 @@ def _insert_hh_size( ) -> None: """Insert hh characteristics and tract level controls to database""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in hh_size_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hh_char_inputs_{name}.csv", index=False + ) + for name, data in hh_size_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hh_char_outputs_{name}.csv", index=False + ) - with utils.ESTIMATES_ENGINE.connect() as con: - hh_size_inputs["hhs_tract_controls"].rename( - columns={"household_size": "metric"} - ).assign( - metric=lambda df: "Household Size - " - + df["metric"].astype(str).replace("7", "7+") - ).to_sql( - schema="inputs", - name="controls_tract", - if_exists="append", - con=con, - index=False, - ) + # Otherwise, load to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + hh_size_inputs["hhs_tract_controls"].rename( + columns={"household_size": "metric"} + ).assign( + metric=lambda df: "Household Size - " + + df["metric"].astype(str).replace("7", "7+") + ).to_sql( + schema="inputs", + name="controls_tract", + if_exists="append", + con=con, + index=False, + ) - hh_size_outputs["hh_size"][ - ["run_id", "year", "mgra", "household_size", "hh"] - ].rename(columns={"household_size": "metric", "hh": "value"}).assign( - metric=lambda df: "Household Size - " - + df["metric"].astype(str).replace("7", "7+") - ).to_sql( - schema="outputs", - name="hh_characteristics", - if_exists="append", - con=con, - index=False, - ) + hh_size_outputs["hh_size"][ + ["run_id", "year", "mgra", "household_size", "hh"] + ].rename(columns={"household_size": "metric", "hh": "value"}).assign( + metric=lambda df: "Household Size - " + + df["metric"].astype(str).replace("7", "7+") + ).to_sql( + schema="outputs", + name="hh_characteristics", + if_exists="append", + con=con, + index=False, + ) diff --git a/python/hs_hh.py b/python/hs_hh.py index 7e4fa2f..5d9c30e 100644 --- a/python/hs_hh.py +++ b/python/hs_hh.py @@ -245,38 +245,47 @@ def _insert_hs_hh( ) -> None: """Insert occupancy controls and households results to database.""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in hs_hh_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hs_hh_inputs_{name}.csv", index=False + ) + for name, data in hs_hh_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"hs_hh_outputs_{name}.csv", index=False + ) - with utils.ESTIMATES_ENGINE.connect() as con: - hs_hh_inputs["hs"].drop(columns=["tract", "city"]).to_sql( - name="hs", - con=con, - schema="outputs", - if_exists="append", - index=False, - ) - hs_hh_inputs["city_controls"].to_sql( - name="controls_city", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) - hs_hh_inputs["tract_controls"].assign( - metric=lambda x: "Occupancy Rate - " + x["structure_type"] - ).drop(columns="structure_type").to_sql( - name="controls_tract", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) - hs_hh_outputs["hh"].to_sql( - name="hh", - con=con, - schema="outputs", - if_exists="append", - index=False, - ) + # Otherwise, load to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + hs_hh_inputs["hs"].drop(columns=["tract", "city"]).to_sql( + name="hs", + con=con, + schema="outputs", + if_exists="append", + index=False, + ) + hs_hh_inputs["city_controls"].to_sql( + name="controls_city", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) + hs_hh_inputs["tract_controls"].assign( + metric=lambda x: "Occupancy Rate - " + x["structure_type"] + ).drop(columns="structure_type").to_sql( + name="controls_tract", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) + hs_hh_outputs["hh"].to_sql( + name="hh", + con=con, + schema="outputs", + if_exists="append", + index=False, + ) diff --git a/python/parsers.py b/python/parsers.py index 32ebc93..3b633b7 100644 --- a/python/parsers.py +++ b/python/parsers.py @@ -163,7 +163,7 @@ def _validate_config(self) -> None: }, "module": { "type": "string", - "allowed": _MODULES, + "allowed": _MODULES + [""], }, }, }, @@ -185,6 +185,13 @@ def _validate_config(self) -> None: # That the provided 'run_id' is valid self._check_run_id(self._config["debug"]["run_id"], complete=True) + # That a valid module was provided + if self._config["debug"]["module"] not in _MODULES: + raise ValueError( + f"Debug key 'module' must be one of {', '.join(_MODULES)}. " + f"Instead, \"{self._config['debug']['module']}\" was provided." + ) + # That the 'year' value conforms with those already in [metadata].[run] with self._engine.connect() as con: existing_start_year = con.execute( diff --git a/python/pop_type.py b/python/pop_type.py index 106c74c..9890c08 100644 --- a/python/pop_type.py +++ b/python/pop_type.py @@ -169,26 +169,34 @@ def _insert_gq( ) -> None: """Insert both input and output data for MGRA group quarters""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in gq_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"pop_inputs_{name}.csv", index=False + ) + for name, data in gq_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"pop_outputs_{name}.csv", index=False + ) - # Insert controls and group quarters results to database - with utils.ESTIMATES_ENGINE.connect() as con: - gq_inputs["city_controls"].to_sql( - name="controls_city", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) - gq_outputs["gq"].drop(columns="city").to_sql( - name="gq", - con=con, - schema="outputs", - if_exists="append", - index=False, - ) + # Otherwise, insert controls and group quarters results to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + gq_inputs["city_controls"].to_sql( + name="controls_city", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) + gq_outputs["gq"].drop(columns="city").to_sql( + name="gq", + con=con, + schema="outputs", + if_exists="append", + index=False, + ) def _get_hhp_inputs(year: int) -> dict[str, pd.DataFrame]: @@ -392,27 +400,35 @@ def _insert_hhp( ) -> None: """Insert input and output data related to household population""" - # Skip insertion if running in debug mode + # Save locally if in debug mode if debug: - return + for name, data in hhp_inputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"pop_inputs_{name}.csv", index=False + ) + for name, data in hhp_outputs.items(): + data.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"pop_outputs_{name}.csv", index=False + ) - # Insert input and output data to database - with utils.ESTIMATES_ENGINE.connect() as con: - hhp_inputs["city_controls"].to_sql( - name="controls_city", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) + # Otherwise, insert to database + else: + with utils.ESTIMATES_ENGINE.connect() as con: + hhp_inputs["city_controls"].to_sql( + name="controls_city", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) - hhp_inputs["tract_controls"].assign(metric="Household Size").to_sql( - name="controls_tract", - con=con, - schema="inputs", - if_exists="append", - index=False, - ) - hhp_outputs["hhp"].to_sql( - name="hhp", con=con, schema="outputs", if_exists="append", index=False - ) + hhp_inputs["tract_controls"].assign(metric="Household Size").to_sql( + name="controls_tract", + con=con, + schema="inputs", + if_exists="append", + index=False, + ) + hhp_outputs["hhp"].to_sql( + name="hhp", con=con, schema="outputs", if_exists="append", index=False + ) diff --git a/python/utils.py b/python/utils.py index 160021b..b483c72 100644 --- a/python/utils.py +++ b/python/utils.py @@ -111,6 +111,13 @@ logger.info(f"DEBUG: {DEBUG}") logger.info(f"RUN_INSTRUCTIONS: {RUN_INSTRUCTIONS}") +# Additionally, if we are in debug mode, we don't load any data to database since the +# [run_id] we are re-using is already complete. Instead, we save data locally into an +# ignored folder +DEBUG_OUTPUT_FOLDER = ROOT_FOLDER / "debug_output" +if DEBUG: + DEBUG_OUTPUT_FOLDER.mkdir(parents=False, exist_ok=True) + ############################## # UTILITY LISTS AND MAPPINGS # ############################## From 2bbd7aba8ccd5b5b99a81235122c6e0af1a6fd9e Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 15:37:41 +0000 Subject: [PATCH 3/9] #206: Documentation update --- README.md | 66 ++++++++++++++--------------------------------------- config.toml | 4 ++-- 2 files changed, 19 insertions(+), 51 deletions(-) diff --git a/README.md b/README.md index 9c8fc1d..be36b9e 100644 --- a/README.md +++ b/README.md @@ -47,68 +47,36 @@ mgra = "mgra15" start_year = 2020 # The last year inclusive to end running with -end_year = 2023 +end_year = 2024 # The code version -version = "0.0.0-dev" +version = "1.1.1-dev" # Additional notes on this run comments = "Example comment" -# The 'debug' section contains configuration for running a subset of modules of the -# Estimates Program for a given set of years. All parameters must be provided except for -# 'run_id', 'version', and 'comments'. If 'run_id' is -1, then a new 'run_id' will -# be automatically created, similar to 'run' mode +# The `debug` section contains configuration for running a single module for a single +# year based on the input data of an existing complete Estimates run. Output data is not +# written to database, but is instead saved to a local folder debug_output\, which is +# ignored by .gitignore [debug] # Whether to use the 'debug' section. Mutually exclusive with 'run' mode enabled = false -# (Optional) If provided, then most parameters in the 'debug' section will be pulled -# from '[run].[metadata]'. If not provided, then a new 'run_id' will be automatically -# created. Use -1 to indicate no run_id (TOML doesn't support null) -run_id = -1 +# The [run_id] of a fully [complete] Estimates Program run. Input data for debugging +# will be pulled from this [run_id] +run_id = 82 # The run_id for the released v24 Estimates -# The first year inclusive and last year inclusive to run. In the case that... -# * The value of 'run_id' is -1, the values will be loaded into [metadata].[run] -# and will be used as is -# * The value of 'run_id' is not -1, the values will be checked against the values -# already in '[run].[metadata]' -start_year = 2020 -end_year = 2023 - -# (Optional) The code version. If provided, then 'run_id' must be -1 -version = "0.0.0-dev" - -# (Optional) Additional notes on this run. If provided, then 'run_id' must be -1 -comments = "" - -# Whether to run the 'startup' module -startup = false - -# Whether to run the 'housing_and_households' module. If enabled, then any above -# modules must all be enabled due to module dependencies -housing_and_households = false - -# Whether to run the 'population' module. If enabled, then any above modules must all -# be enabled due to module dependencies -population = false - -# Whether to run the 'population_by_ase' module. If enabled, then any above modules -# must all be enabled due to module dependencies -population_by_ase = false - -# Whether to run the 'household_characteristics' module. If enabled, then any above -# modules must all be enabled due to module dependencies -household_characteristics = false - -# Whether to run the 'employment' module. If enabled, then startup module must also be -# enabled due to module dependencies -employment = false +# The year of the Estimates Program to run. This year must be consistent with the stored +# [start_year] and [end_year] associated with the above [run_id] in [metadata].[run] +year = 2020 -# Whether to run the 'staging' module. If enabled, then any above modules must all be -# enabled due to module dependencies -staging = false +# The module of the Estimates Program to run. Since only [complete] [run_id]s are +# allowed, this can be any Estimates Program module. Explicitly, the valid inputs +# are "startup", "housing_and_households", "population", "population_by_ase", +# "household_characteristics", "employment", or "staging" +module = "" ``` ### Production Database Schema diff --git a/config.toml b/config.toml index 627f104..8c3b1a4 100644 --- a/config.toml +++ b/config.toml @@ -14,8 +14,8 @@ comments = "Example comment" # The `debug` section contains configuration for running a single module for a single # year based on the input data of an existing complete Estimates run. The input module -# string can be any of startup, housing_and_households, population, population_by_ase, -# household_characteristics, employment, or staging +# string can be any of "startup", "housing_and_households", "population", +# "population_by_ase", "household_characteristics", "employment", or "staging" [debug] enabled = false run_id = 82 # The run_id for the released v24 Estimates From 24ca466db109f8230b1c19b8f9dca2d097643061 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 15:51:34 +0000 Subject: [PATCH 4/9] #206: Documentation update --- python/parsers.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/parsers.py b/python/parsers.py index 3b633b7..2b070ad 100644 --- a/python/parsers.py +++ b/python/parsers.py @@ -19,9 +19,7 @@ class InputParser: function is: * (Class variable 'run_instructions') Explicit instructions on which modules to run on which years - * (Class variable 'run_id') The value of 'run_id' to use. If standard run mode is - enabled or if no 'run_id' was provided in 'debug' mode, then a new 'run_id' was - created and inserted into '[run].[metadata]' + * (Class variable 'run_id') The value of 'run_id' to use * (Class variable 'mgra_version') The MGRA version to run on Attributes: From c4d70542367975ba46a2c87e9b9268f33055dd6e Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 19:48:41 +0000 Subject: [PATCH 5/9] #207: Added temporary testing file --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index ef21d49..12ec231 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,9 @@ log.txt # Debug output file debug_output/ +# Temporary testing files +test.py + ################################################################## # Python .gitignore # # https://github.com/github/gitignore/blob/main/Python.gitignore # From 597d9144d6bdf35150679194cfeefcf449cf1957 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 20:13:25 +0000 Subject: [PATCH 6/9] #206: Aligned debug output with actual table names --- python/ase.py | 55 +++++++++++----------- python/employment.py | 14 +++--- python/hh_characteristics.py | 89 +++++++++++++++++++++--------------- python/hs_hh.py | 22 +++++---- python/pop_type.py | 31 ++++++------- 5 files changed, 115 insertions(+), 96 deletions(-) diff --git a/python/ase.py b/python/ase.py index 5cd9de6..6599e60 100644 --- a/python/ase.py +++ b/python/ase.py @@ -251,7 +251,7 @@ def _insert_controls(controls_outputs: pd.DataFrame, debug: bool) -> None: # Save locally if in debug mode if debug: controls_outputs.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"ase_inputs_controls_ase.csv", index=False + utils.DEBUG_OUTPUT_FOLDER / f"inputs_controls_ase.csv", index=False ) # Otherwise, insert into the database @@ -927,33 +927,36 @@ def _validate_ase_outputs(ase_outputs: dict[str, pd.DataFrame]) -> None: def _insert_ase(ase_outputs: dict[str, pd.DataFrame], debug: bool) -> None: """Insert age/sex/ethnicity population by type to database.""" - # Save locally if in debug mode - if debug: - for name, data in ase_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"ase_outputs_{name}.csv", index=False - ) + for pop_type, pop_type_data in ase_outputs.items(): + logger.info("Loading Estimates for " + pop_type) + + # For loading speed, remove all data which is zero. See GitHub for more details: + # https://github.com/SANDAG/Estimates-Program/pull/184 + pop_type_data_no_zero = pop_type_data.loc[lambda df: df["value"] != 0][ + [ + "run_id", + "year", + "mgra", + "pop_type", + "age_group", + "sex", + "ethnicity", + "value", + ] + ] - # Otherwise, load to database - else: - for pop_type, output in ase_outputs.items(): - logger.info("Loading Estimates for " + pop_type) + # Write the data locally if in debug mode + if debug: + pop_type_data_no_zero.to_csv( + utils.DEBUG_OUTPUT_FOLDER / f"outputs_ase_{pop_type}.csv", index=False + ) - # Write the DataFrame to a CSV file + # Otherwise, load to database + else: + # First, write the DataFrame to a CSV file in the network location csv_temp_location = utils.BULK_INSERT_STAGING / (pop_type + ".txt") ( - output.loc[lambda df: df["value"] != 0][ - [ - "run_id", - "year", - "mgra", - "pop_type", - "age_group", - "sex", - "ethnicity", - "value", - ] - ].to_csv( + pop_type_data_no_zero.to_csv( csv_temp_location, header=False, index=False, @@ -962,7 +965,7 @@ def _insert_ase(ase_outputs: dict[str, pd.DataFrame], debug: bool) -> None: ) ) - # Bulk insert the CSV file into the production database + # Then, bulk insert the CSV file into the production database with utils.ESTIMATES_ENGINE.connect() as con: query = sql.text( f""" @@ -980,5 +983,5 @@ def _insert_ase(ase_outputs: dict[str, pd.DataFrame], debug: bool) -> None: con.execute(query) con.commit() - # Remove the temporary CSV file + # Finally, remove the temporary CSV file csv_temp_location.unlink() diff --git a/python/employment.py b/python/employment.py index 1aa92e3..8bd13a2 100644 --- a/python/employment.py +++ b/python/employment.py @@ -276,14 +276,12 @@ def _insert_jobs( # Save locally if in debug mode if debug: - for name, data in jobs_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"emp_inputs_{name}.csv", index=False - ) - for name, data in jobs_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"emp_outputs_{name}.csv", index=False - ) + jobs_inputs["control_totals"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_jobs.csv", index=False + ) + jobs_outputs["results"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_jobs.csv", index=False + ) # Otherwise, insert to database else: diff --git a/python/hh_characteristics.py b/python/hh_characteristics.py index 924b641..101891a 100644 --- a/python/hh_characteristics.py +++ b/python/hh_characteristics.py @@ -421,25 +421,36 @@ def _insert_hh_income( ) -> None: """Insert hh characteristics and tract level controls to database""" + inputs_controls_tract = ( + hh_income_inputs["hh_income_tract_controls"][ + ["run_id", "year", "tract", "income_category", "value"] + ] + .rename(columns={"income_category": "metric"}) + .assign(metric=lambda df: "Income Category - " + df["metric"]) + ) + outputs_hh_characteristics = ( + hh_income_outputs["hh_income"][ + ["run_id", "year", "mgra", "income_category", "hh"] + ] + .rename(columns={"income_category": "metric", "hh": "value"}) + .assign(metric=lambda df: "Income Category - " + df["metric"]) + ) + # Save locally if in debug mode if debug: - for name, data in hh_income_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hh_char_inputs_{name}.csv", index=False - ) - for name, data in hh_income_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hh_char_outputs_{name}.csv", index=False - ) + inputs_controls_tract.to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_tract_hh_income.csv", + index=False, + ) + outputs_hh_characteristics.to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_hh_characteristics_hh_income.csv", + index=False, + ) # Otherwise, load to database else: with utils.ESTIMATES_ENGINE.connect() as con: - hh_income_inputs["hh_income_tract_controls"][ - ["run_id", "year", "tract", "income_category", "value"] - ].rename(columns={"income_category": "metric"}).assign( - metric=lambda df: "Income Category - " + df["metric"] - ).to_sql( + inputs_controls_tract.to_sql( schema="inputs", name="controls_tract", if_exists="append", @@ -447,11 +458,7 @@ def _insert_hh_income( index=False, ) - hh_income_outputs["hh_income"][ - ["run_id", "year", "mgra", "income_category", "hh"] - ].rename(columns={"income_category": "metric", "hh": "value"}).assign( - metric=lambda df: "Income Category - " + df["metric"] - ).to_sql( + outputs_hh_characteristics.to_sql( schema="outputs", name="hh_characteristics", if_exists="append", @@ -467,26 +474,37 @@ def _insert_hh_size( ) -> None: """Insert hh characteristics and tract level controls to database""" + inputs_controls_tract = ( + hh_size_inputs["hhs_tract_controls"] + .rename(columns={"household_size": "metric"}) + .assign( + metric=lambda df: "Household Size - " + + df["metric"].astype(str).replace("7", "7+") + ) + ) + outputs_hh_characteristics = ( + hh_size_outputs["hh_size"][["run_id", "year", "mgra", "household_size", "hh"]] + .rename(columns={"household_size": "metric", "hh": "value"}) + .assign( + metric=lambda df: "Household Size - " + + df["metric"].astype(str).replace("7", "7+") + ) + ) + # Save locally if in debug mode if debug: - for name, data in hh_size_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hh_char_inputs_{name}.csv", index=False - ) - for name, data in hh_size_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hh_char_outputs_{name}.csv", index=False - ) + inputs_controls_tract.to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_tract_hh_size.csv", index=False + ) + outputs_hh_characteristics.to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_hh_characteristics_hh_size.csv", + index=False, + ) # Otherwise, load to database else: with utils.ESTIMATES_ENGINE.connect() as con: - hh_size_inputs["hhs_tract_controls"].rename( - columns={"household_size": "metric"} - ).assign( - metric=lambda df: "Household Size - " - + df["metric"].astype(str).replace("7", "7+") - ).to_sql( + inputs_controls_tract.to_sql( schema="inputs", name="controls_tract", if_exists="append", @@ -494,12 +512,7 @@ def _insert_hh_size( index=False, ) - hh_size_outputs["hh_size"][ - ["run_id", "year", "mgra", "household_size", "hh"] - ].rename(columns={"household_size": "metric", "hh": "value"}).assign( - metric=lambda df: "Household Size - " - + df["metric"].astype(str).replace("7", "7+") - ).to_sql( + outputs_hh_characteristics.to_sql( schema="outputs", name="hh_characteristics", if_exists="append", diff --git a/python/hs_hh.py b/python/hs_hh.py index 5d9c30e..5a63121 100644 --- a/python/hs_hh.py +++ b/python/hs_hh.py @@ -247,14 +247,20 @@ def _insert_hs_hh( # Save locally if in debug mode if debug: - for name, data in hs_hh_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hs_hh_inputs_{name}.csv", index=False - ) - for name, data in hs_hh_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"hs_hh_outputs_{name}.csv", index=False - ) + hs_hh_inputs["hs"].drop(columns=["tract", "city"]).to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_hs.csv", index=False + ) + hs_hh_inputs["city_controls"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_city.csv", index=False + ) + hs_hh_inputs["tract_controls"].assign( + metric=lambda x: "Occupancy Rate - " + x["structure_type"] + ).drop(columns="structure_type").to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_tract.csv", index=False + ) + hs_hh_outputs["hh"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_hh.csv", index=False + ) # Otherwise, load to database else: diff --git a/python/pop_type.py b/python/pop_type.py index 9890c08..a9913da 100644 --- a/python/pop_type.py +++ b/python/pop_type.py @@ -171,14 +171,12 @@ def _insert_gq( # Save locally if in debug mode if debug: - for name, data in gq_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"pop_inputs_{name}.csv", index=False - ) - for name, data in gq_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"pop_outputs_{name}.csv", index=False - ) + gq_inputs["city_controls"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_city_gq.csv", index=False + ) + gq_outputs["gq"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_gq.csv", index=False + ) # Otherwise, insert controls and group quarters results to database else: @@ -402,14 +400,15 @@ def _insert_hhp( # Save locally if in debug mode if debug: - for name, data in hhp_inputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"pop_inputs_{name}.csv", index=False - ) - for name, data in hhp_outputs.items(): - data.to_csv( - utils.DEBUG_OUTPUT_FOLDER / f"pop_outputs_{name}.csv", index=False - ) + hhp_inputs["city_controls"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_city_pop.csv", index=False + ) + hhp_inputs["tract_controls"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_tract.csv", index=False + ) + hhp_outputs["hhp"].to_csv( + utils.DEBUG_OUTPUT_FOLDER / "outputs_hhp.csv", index=False + ) # Otherwise, insert to database else: From 2280a2ed1358f7070c3f50eddd6d5eb3f93683b7 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 20:13:58 +0000 Subject: [PATCH 7/9] #206: Simplified code --- python/parsers.py | 70 ++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 41 deletions(-) diff --git a/python/parsers.py b/python/parsers.py index 2b070ad..f8e777e 100644 --- a/python/parsers.py +++ b/python/parsers.py @@ -90,36 +90,27 @@ def parse_config(self) -> None: for key in _MODULES: self.run_instructions[key] = key == self._config["debug"]["module"] - def _check_run_id(self, run_id: int, complete: bool = False) -> None: - """Check if supplied run id exists in the database. - - Args: - run_id: The [run_id] to check for - complete: Default False. If True, then only check for [run_id]s marked as - [complete] = 1. If False, don't check for [complete] status - - Return: - None - - Raises: - ValueError: If [run_id] does not exist in the database - """ + def _check_run_id(self, run_id: int) -> None: + """Check if supplied run id exists in the database and is complete""" with self._engine.connect() as con: # Ensure supplied run id exists in the database query = sql.text( - f""" + """ SELECT CASE WHEN EXISTS ( SELECT [run_id] FROM [metadata].[run] WHERE [run_id] = :run_id - {"AND [complete] = 1" if complete else ""} + AND [complete] = 1 ) THEN 1 ELSE 0 END """ ) exists = con.execute(query, {"run_id": run_id}).scalar() if exists == 0: - raise ValueError("run_id does not exist in the database") + raise ValueError( + f"Either the [run_id]={run_id} does not exist in the database or " + f"it is not marked as [complete]=1" + ) def _validate_config(self) -> None: """Validate the contents of the configuration dictionary @@ -181,7 +172,7 @@ def _validate_config(self) -> None: # Check that if we are in debug mode... if self._config["debug"]["enabled"]: # That the provided 'run_id' is valid - self._check_run_id(self._config["debug"]["run_id"], complete=True) + self._check_run_id(self._config["debug"]["run_id"]) # That a valid module was provided if self._config["debug"]["module"] not in _MODULES: @@ -190,37 +181,34 @@ def _validate_config(self) -> None: f"Instead, \"{self._config['debug']['module']}\" was provided." ) - # That the 'year' value conforms with those already in [metadata].[run] + # That the 'year' value conforms with the [start_year] and [end_year] + # already in [metadata].[run] with self._engine.connect() as con: - existing_start_year = con.execute( + check_year = con.execute( sql.text( - "SELECT [start_year] FROM [metadata].[run] WHERE run_id = :run_id" + """ + SELECT + CASE + WHEN :year BETWEEN [start_year] AND [end_year] THEN 1 + ELSE 0 + END + FROM [metadata].[run] + WHERE [run_id] = :run_id + """ ), - {"run_id": self._config["debug"]["run_id"]}, + { + "run_id": self._config["debug"]["run_id"], + "year": self._config["debug"]["year"], + }, ).scalar() - if self._config["debug"]["year"] < existing_start_year: + if check_year == 0: raise ValueError( f"The provided debug 'year' of {self._config['debug']['year']} " - f"is less than the [metadata].[run] 'start_year' of " - f"{existing_start_year} for 'run_id' {self._config["debug"]["run_id"]}" + f"is not within the range of [metadata].[run] 'start_year' and " + f"'end_year' for 'run_id' {self._config['debug']['run_id']}" ) else: self._start_year = self._config["debug"]["year"] - - with self._engine.connect() as con: - existing_end_year = con.execute( - sql.text( - "SELECT [end_year] FROM [metadata].[run] WHERE run_id = :run_id" - ), - {"run_id": self._config["debug"]["run_id"]}, - ).scalar() - if self._config["debug"]["year"] > existing_end_year: - raise ValueError( - f"The provided debug 'year' of {self._config['debug']['year']} " - f"is greater than the [metadata].[run] 'end_year' of " - f"{existing_end_year} for 'run_id' {self._config["debug"]["run_id"]}" - ) - else: self._end_year = self._config["debug"]["year"] def _parse_run_id(self) -> int: @@ -307,7 +295,7 @@ def _parse_mgra_version(self) -> str: # Get mgra version from database if debug mode is enabled elif self._config["debug"]["enabled"]: # Ensure run id exists in the database - self._check_run_id(run_id=self.run_id, complete=True) + self._check_run_id(run_id=self.run_id) with self._engine.connect() as con: query = sql.text( From 0e180a0cccb490f4f27435f6d5282ec48ee59f28 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 20:14:17 +0000 Subject: [PATCH 8/9] #206: More details for `debug` mode --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index be36b9e..d9cba8e 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,8 @@ comments = "Example comment" # The `debug` section contains configuration for running a single module for a single # year based on the input data of an existing complete Estimates run. Output data is not # written to database, but is instead saved to a local folder debug_output\, which is -# ignored by .gitignore +# ignored by .gitignore. No data is saved locally for the "startup" and "staging" +# modules [debug] # Whether to use the 'debug' section. Mutually exclusive with 'run' mode From 365274844fe14b5dc23d409e246988b5f3bb4933 Mon Sep 17 00:00:00 2001 From: Eric Liu Date: Wed, 18 Mar 2026 20:22:09 +0000 Subject: [PATCH 9/9] #206: Actually aligned debug csv and table names --- python/pop_type.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pop_type.py b/python/pop_type.py index a9913da..3c608e3 100644 --- a/python/pop_type.py +++ b/python/pop_type.py @@ -174,7 +174,7 @@ def _insert_gq( gq_inputs["city_controls"].to_csv( utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_city_gq.csv", index=False ) - gq_outputs["gq"].to_csv( + gq_outputs["gq"].drop(columns="city").to_csv( utils.DEBUG_OUTPUT_FOLDER / "outputs_gq.csv", index=False ) @@ -403,7 +403,7 @@ def _insert_hhp( hhp_inputs["city_controls"].to_csv( utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_city_pop.csv", index=False ) - hhp_inputs["tract_controls"].to_csv( + hhp_inputs["tract_controls"].assign(metric="Household Size").to_csv( utils.DEBUG_OUTPUT_FOLDER / "inputs_controls_tract.csv", index=False ) hhp_outputs["hhp"].to_csv(