Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- bump: patch
+- bump: minor
   changes:
-    fixed:
-    - Add TANF takeup (22%) assignment to CPS data pipeline so takes_up_tanf_if_eligible is persisted in the dataset.
+    enhanced:
+    - Add capital income predictors (interest, dividend, rental income) to SIPP liquid asset imputation model for improved accuracy.
36 changes: 35 additions & 1 deletion policyengine_us_data/calibration/source_impute.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@

SIPP_ASSETS_PREDICTORS = [
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"age",
"is_female",
"is_married",
Expand Down Expand Up @@ -458,6 +461,10 @@ def _impute_sipp(
"TVAL_BANK",
"TVAL_STMF",
"TVAL_BOND",
"TINC_BANK",
"TINC_STMF",
"TINC_BOND",
"TINC_RENT",
]
asset_df = pd.read_csv(
STORAGE_FOLDER / "pu2023.csv",
Expand All @@ -473,6 +480,11 @@ def _impute_sipp(
asset_df["is_female"] = asset_df.ESEX == 2
asset_df["is_married"] = asset_df.EMS == 1
asset_df["employment_income"] = asset_df.TPTOTINC * 12
asset_df["interest_income"] = (
asset_df["TINC_BANK"].fillna(0) + asset_df["TINC_BOND"].fillna(0)
) * 12
asset_df["dividend_income"] = asset_df["TINC_STMF"].fillna(0) * 12
asset_df["rental_income"] = asset_df["TINC_RENT"].fillna(0) * 12
asset_df["household_weight"] = asset_df.WPFINWGT
asset_df["is_under_18"] = asset_df.TAGE < 18
asset_df["count_under_18"] = (
Expand All @@ -484,6 +496,9 @@ def _impute_sipp(

asset_train_cols = [
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"bank_account_assets",
"stock_assets",
"bond_assets",
Expand All @@ -510,7 +525,14 @@ def _impute_sipp(
data,
time_period,
dataset_path,
["employment_income", "age", "is_male"],
[
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"age",
"is_male",
],
)
if "is_male" in cps_asset_df.columns:
cps_asset_df["is_female"] = (
Expand All @@ -529,6 +551,18 @@ def _impute_sipp(
if "count_under_18" in cps_tip_df.columns
else 0.0
)
for cap_var in [
"interest_income",
"dividend_income",
"rental_income",
]:
if cap_var not in cps_asset_df.columns:
if cap_var in data:
cps_asset_df[cap_var] = data[cap_var][time_period].astype(
np.float32
)
else:
cps_asset_df[cap_var] = 0.0

asset_vars = [
"bank_account_assets",
Expand Down
3 changes: 3 additions & 0 deletions policyengine_us_data/datasets/cps/cps.py
Original file line number Diff line number Diff line change
Expand Up @@ -1779,6 +1779,9 @@ def add_tips(self, cps: h5py.File):
"person_id",
"household_id",
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"age",
"household_weight",
"is_female",
Expand Down
19 changes: 19 additions & 0 deletions policyengine_us_data/datasets/sipp/sipp.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,11 @@ def get_tip_model() -> QRF:
"TVAL_BANK", # Checking, savings, money market
"TVAL_STMF", # Stocks and mutual funds
"TVAL_BOND", # Bonds and government securities
# Income from assets (monthly, person-level)
"TINC_BANK", # Interest from bank accounts
"TINC_STMF", # Dividends from stocks/mutual funds
"TINC_BOND", # Interest from bonds
"TINC_RENT", # Rental income
# SSI receipt (for validation)
"RSSI_YRYN", # Received SSI in at least one month
]
Expand Down Expand Up @@ -200,6 +205,14 @@ def train_asset_model():
df["household_weight"] = df.WPFINWGT
df["household_id"] = df.SSUID

# Capital income predictors (annualized from monthly SIPP)
# Maps to CPS: interest_income, dividend_income, rental_income
df["interest_income"] = (
df["TINC_BANK"].fillna(0) + df["TINC_BOND"].fillna(0)
) * 12
df["dividend_income"] = df["TINC_STMF"].fillna(0) * 12
df["rental_income"] = df["TINC_RENT"].fillna(0) * 12

# Calculate household-level counts
df["is_under_18"] = df.TAGE < 18
df["count_under_18"] = (
Expand All @@ -210,6 +223,9 @@ def train_asset_model():
[
"household_id",
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"bank_account_assets",
"stock_assets",
"bond_assets",
Expand Down Expand Up @@ -239,6 +255,9 @@ def train_asset_model():
X_train=sipp,
predictors=[
"employment_income",
"interest_income",
"dividend_income",
"rental_income",
"age",
"is_female",
"is_married",
Expand Down