From 73880bbf843487070496177526b538481de1e612 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:01:36 +0000 Subject: [PATCH 1/8] feat: impute below-threshold student loan borrowers The FRS only captures borrowers making PAYE repayments (above threshold). SLC data shows ~55% of Plan 2 holders earn below threshold. This adds probabilistic imputation of below-threshold borrowers for Plan 2 and Plan 5 based on SLC "liable to repay" counts. Changes: - Add slc/plan_*_borrowers_liable targets alongside existing above_threshold - Probabilistically assign plans to tertiary-educated people without repayments, constrained by age band and cohort - Add compute_student_loan_plan_liable() for calibration Closes #281 --- .../datasets/imputations/student_loans.py | 162 +++++++++++++++--- .../targets/build_loss_matrix.py | 5 +- .../targets/compute/__init__.py | 2 + policyengine_uk_data/targets/compute/other.py | 30 +++- policyengine_uk_data/targets/sources/slc.py | 135 ++++++++++----- .../tests/test_student_loan_targets.py | 61 +++++-- 6 files changed, 308 insertions(+), 87 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index 38391899..f86655c9 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -2,30 +2,86 @@ Student loan plan imputation. 
This module imputes the student_loan_plan variable based on: -- Whether the person has reported student loan repayments +- Whether the person has reported student loan repayments (above threshold) - Their estimated university attendance year (inferred from age) +- Probabilistic assignment for below-threshold borrowers The imputation assigns plan types according to when the loan system changed: -- NONE: No reported repayments +- NONE: No loan - PLAN_1: Started university before September 2012 - PLAN_2: Started September 2012 - August 2023 - PLAN_5: Started September 2023 onwards -This enables policyengine-uk's student_loan_repayment variable to calculate -repayments using official threshold parameters. +The FRS only records active repayers (via PAYE). SLC data shows many borrowers +earn below repayment thresholds. This imputation fills that gap by +probabilistically assigning plans to tertiary-educated people without +reported repayments, based on SLC "liable to repay" minus "above threshold" +counts. 
""" import numpy as np from policyengine_uk.data import UKSingleYearDataset from policyengine_uk import Microsimulation +# England regions for filtering (SLC data covers England only) +_ENGLAND_REGIONS = { + "NORTH_EAST", + "NORTH_WEST", + "YORKSHIRE", + "EAST_MIDLANDS", + "WEST_MIDLANDS", + "EAST_OF_ENGLAND", + "LONDON", + "SOUTH_EAST", + "SOUTH_WEST", +} + +# SLC liable-to-repay counts (Higher Education total, England) +# Source: https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d +_PLAN_2_LIABLE = { + 2025: 8_940_000, + 2026: 9_710_000, + 2027: 10_360_000, + 2028: 10_615_000, + 2029: 10_600_000, + 2030: 10_525_000, +} + +_PLAN_5_LIABLE = { + 2025: 10_000, + 2026: 230_000, + 2027: 630_000, + 2028: 1_380_000, + 2029: 2_360_000, + 2030: 3_400_000, +} + +# SLC above-threshold counts (borrowers making repayments) +_PLAN_2_ABOVE_THRESHOLD = { + 2025: 3_985_000, + 2026: 4_460_000, + 2027: 4_825_000, + 2028: 5_045_000, + 2029: 5_160_000, + 2030: 5_205_000, +} + +_PLAN_5_ABOVE_THRESHOLD = { + 2026: 35_000, + 2027: 145_000, + 2028: 390_000, + 2029: 770_000, + 2030: 1_235_000, +} + def impute_student_loan_plan( dataset: UKSingleYearDataset, year: int = 2025, + seed: int = 42, ) -> UKSingleYearDataset: """ - Impute student loan plan type based on age and reported repayments. + Impute student loan plan type based on age, repayments, and education. The plan type determines which repayment threshold applies: - PLAN_1: £26,065 (2025), pre-Sept 2012 England/Wales @@ -33,46 +89,98 @@ def impute_student_loan_plan( - PLAN_4: Scottish loans (not imputed here - requires explicit flag) - PLAN_5: £25,000 (2025), Sept 2023 onwards + This function: + 1. Assigns plans to people with reported repayments (above threshold) + 2. Probabilistically assigns plans to tertiary-educated people without + repayments (below threshold) to match SLC liable-to-repay totals + Args: dataset: PolicyEngine UK dataset with student_loan_repayments. 
year: The simulation year, used to estimate university attendance. + seed: Random seed for reproducibility. Returns: Dataset with imputed student_loan_plan values. """ dataset = dataset.copy() sim = Microsimulation(dataset=dataset) + rng = np.random.default_rng(seed) - # Get required variables age = sim.calculate("age").values - student_loan_repayments = sim.calculate("student_loan_repayments").values - - # Determine if person has a student loan based on reported repayments - has_student_loan = student_loan_repayments > 0 - - # Estimate when they started university (assume age 18) - # For simulation year Y and age A, university start year = Y - A + 18 - estimated_uni_start_year = year - age + 18 + repayments = sim.calculate("student_loan_repayments").values + education = sim.calculate("highest_education").values + region = sim.calculate("region", map_to="person").values + weights = sim.calculate("person_weight").values + + is_england = np.isin(region, list(_ENGLAND_REGIONS)) + is_tertiary = education == "TERTIARY" + has_repayments = repayments > 0 + + # Estimate university start year (assume started at 18) + uni_start_year = year - age + 18 + + # Age bands for plausible loan holders + # Plan 1: 32-55 (started before 2012, still repaying) + # Plan 2: 21-33 (started 2012-2022) + # Plan 5: 21-24 (started 2023+) + plan_1_age_mask = (age >= 32) & (age <= 55) + plan_2_age_mask = (age >= 21) & (age <= 33) + plan_5_age_mask = (age >= 21) & (age <= 24) + + # Cohort masks based on university start year + plan_1_cohort = uni_start_year < 2012 + plan_2_cohort = (uni_start_year >= 2012) & (uni_start_year < 2023) + plan_5_cohort = uni_start_year >= 2023 - # Assign plan types based on when loan system changed - # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5" plan = np.full(len(age), "NONE", dtype=object) - # Plan 1: Started before September 2012 - plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012) - plan[plan_1_mask] = "PLAN_1" + 
# Step 1: Assign plans to people with reported repayments + plan[has_repayments & plan_1_cohort] = "PLAN_1" + plan[has_repayments & plan_2_cohort] = "PLAN_2" + plan[has_repayments & plan_5_cohort] = "PLAN_5" - # Plan 2: Started September 2012 - August 2023 - plan_2_mask = has_student_loan & ( - (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023) + # Step 2: Probabilistically assign below-threshold borrowers + # Only for tertiary-educated people in England without repayments + no_repayments = ~has_repayments + + # Calculate target below-threshold counts + plan_2_below = _PLAN_2_LIABLE.get(year, 0) - _PLAN_2_ABOVE_THRESHOLD.get( + year, 0 + ) + plan_5_below = _PLAN_5_LIABLE.get(year, 0) - _PLAN_5_ABOVE_THRESHOLD.get( + year, 0 ) - plan[plan_2_mask] = "PLAN_2" - # Plan 5: Started September 2023 onwards - plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023) - plan[plan_5_mask] = "PLAN_5" + # Plan 2 below-threshold assignment + plan_2_eligible = ( + no_repayments + & is_tertiary + & is_england + & plan_2_age_mask + & plan_2_cohort + ) + if plan_2_below > 0 and plan_2_eligible.sum() > 0: + eligible_weight = (weights * plan_2_eligible).sum() + if eligible_weight > 0: + prob = min(1.0, plan_2_below / eligible_weight) + draws = rng.random(len(age)) + plan[plan_2_eligible & (draws < prob)] = "PLAN_2" + + # Plan 5 below-threshold assignment + plan_5_eligible = ( + no_repayments + & is_tertiary + & is_england + & plan_5_age_mask + & plan_5_cohort + ) + if plan_5_below > 0 and plan_5_eligible.sum() > 0: + eligible_weight = (weights * plan_5_eligible).sum() + if eligible_weight > 0: + prob = min(1.0, plan_5_below / eligible_weight) + draws = rng.random(len(age)) + plan[plan_5_eligible & (draws < prob)] = "PLAN_5" - # Store as the plan type dataset.person["student_loan_plan"] = plan return dataset diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py index 8af391a3..92e3f5b6 100644 --- 
a/policyengine_uk_data/targets/build_loss_matrix.py +++ b/policyengine_uk_data/targets/build_loss_matrix.py @@ -38,6 +38,7 @@ compute_scotland_uc_child, compute_scottish_child_payment, compute_student_loan_plan, + compute_student_loan_plan_liable, compute_ss_contributions, compute_ss_headcount, compute_ss_it_relief, @@ -306,8 +307,10 @@ def _compute_column( return compute_scottish_child_payment(target, ctx) # Student loan plan borrower counts (SLC) - if name.startswith("slc/plan_"): + if name.startswith("slc/plan_") and "above_threshold" in name: return compute_student_loan_plan(target, ctx) + if name.startswith("slc/plan_") and "liable" in name: + return compute_student_loan_plan_liable(target, ctx) # PIP claimants if name in ( diff --git a/policyengine_uk_data/targets/compute/__init__.py b/policyengine_uk_data/targets/compute/__init__.py index 8a329c30..47e7c5e3 100644 --- a/policyengine_uk_data/targets/compute/__init__.py +++ b/policyengine_uk_data/targets/compute/__init__.py @@ -38,6 +38,7 @@ compute_savings_interest, compute_scottish_child_payment, compute_student_loan_plan, + compute_student_loan_plan_liable, compute_vehicles, ) @@ -57,6 +58,7 @@ "compute_scotland_uc_child", "compute_scottish_child_payment", "compute_student_loan_plan", + "compute_student_loan_plan_liable", "compute_ss_contributions", "compute_ss_headcount", "compute_ss_it_relief", diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py index c2037c8e..0631793f 100644 --- a/policyengine_uk_data/targets/compute/other.py +++ b/policyengine_uk_data/targets/compute/other.py @@ -52,8 +52,8 @@ def compute_scottish_child_payment(target, ctx) -> np.ndarray: def compute_student_loan_plan(target, ctx) -> np.ndarray: """Count England borrowers on a given plan with repayments > 0. - SLC targets cover borrowers liable to repay AND earning above threshold - in England only — matching exactly what the FRS captures via PAYE. 
+ SLC "above_threshold" targets cover borrowers liable to repay AND earning + above threshold in England only — matching what the FRS captures via PAYE. """ plan_name = target.name # e.g. "slc/plan_2_borrowers_above_threshold" if "plan_2" in plan_name: @@ -65,6 +65,32 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray: plan = ctx.sim.calculate("student_loan_plan").values region = ctx.sim.calculate("region", map_to="person").values + repayments = ctx.sim.calculate("student_loan_repayments").values + + is_england = np.isin(region, list(_ENGLAND_REGIONS)) + # Only count those with repayments > 0 (above threshold) + on_plan = (plan == plan_value) & is_england & (repayments > 0) + + return ctx.household_from_person(on_plan.astype(float)) + + +def compute_student_loan_plan_liable(target, ctx) -> np.ndarray: + """Count ALL England borrowers on a given plan (including below-threshold). + + SLC "liable" targets cover all borrowers liable to repay, regardless of + whether they earn above the repayment threshold. + """ + plan_name = target.name # e.g. "slc/plan_2_borrowers_liable" + if "plan_2" in plan_name: + plan_value = "PLAN_2" + elif "plan_5" in plan_name: + plan_value = "PLAN_5" + else: + return None + + plan = ctx.sim.calculate("student_loan_plan").values + region = ctx.sim.calculate("region", map_to="person").values + is_england = np.isin(region, list(_ENGLAND_REGIONS)) on_plan = (plan == plan_value) & is_england diff --git a/policyengine_uk_data/targets/sources/slc.py b/policyengine_uk_data/targets/sources/slc.py index 557f5236..af484751 100644 --- a/policyengine_uk_data/targets/sources/slc.py +++ b/policyengine_uk_data/targets/sources/slc.py @@ -1,8 +1,11 @@ """Student Loans Company (SLC) calibration targets. -Borrower counts for England only: Plan 2 and Plan 5, restricted to -borrowers liable to repay and earning above the repayment threshold. -This matches the FRS coverage (PAYE deductions only). +Borrower counts for England only: Plan 2 and Plan 5. 
+ +Two types of targets are provided: +1. "above threshold" - borrowers liable to repay AND earning above threshold + (matches FRS coverage via PAYE deductions) +2. "liable" - total borrowers liable to repay (includes below-threshold) Source: Explore Education Statistics — Student loan forecasts for England, Table 6a: Forecast number of student borrowers liable to repay and number @@ -33,13 +36,16 @@ def _fetch_slc_data() -> dict: """Fetch and parse SLC Table 6a data from Explore Education Statistics. Returns: - Dict with keys 'plan_2' and 'plan_5', each containing a dict - mapping calendar year (int) to borrower count above threshold (int). + Dict with nested structure: + { + 'plan_2': {'above_threshold': {...}, 'liable': {...}}, + 'plan_5': {'above_threshold': {...}, 'liable': {...}} + } + Each inner dict maps calendar year (int) to borrower count (int). """ response = requests.get(_PERMALINK_URL, timeout=30) response.raise_for_status() - # Extract JSON data from __NEXT_DATA__ script tag match = re.search( r'', response.text, @@ -50,63 +56,74 @@ def _fetch_slc_data() -> dict: next_data = json.loads(match.group(1)) table_json = next_data["props"]["pageProps"]["data"]["table"]["json"] - # Parse header row to get years - columns go newest to oldest + # Parse header row to get years (columns go newest to oldest) # Structure: Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years) header_row = table_json["thead"][1] - # Get Plan 2 years (first 6 columns) plan_2_years = [] for i in range(6): year_text = header_row[i]["text"] # e.g., "2029-30" start_year = int(year_text.split("-")[0]) - calendar_year = start_year + 1 # 2029-30 → 2030 - plan_2_years.append(calendar_year) + plan_2_years.append(start_year + 1) # 2029-30 → 2030 - # Get Plan 5 years (next 6 columns) plan_5_years = [] for i in range(6, 12): year_text = header_row[i]["text"] start_year = int(year_text.split("-")[0]) - calendar_year = start_year + 1 - plan_5_years.append(calendar_year) + 
plan_5_years.append(start_year + 1) - # Find the "Higher education total" / "earning above threshold" row - # This is the row following "Higher education total" with "liable to repay" tbody = table_json["tbody"] - # Row 11 contains: header + 6 Plan 2 values + 6 Plan 5 values + 5 Plan 3 - target_row = None - for row in tbody: + # Find "Higher education total" rows + # Row 10: [0]="Higher education total", [1]="Number of borrowers liable...", + # [2-7]=Plan 2 data, [8-13]=Plan 5 data + # Row 11: [0]="Number of borrowers...earning above...", + # [1-6]=Plan 2 data, [7-12]=Plan 5 data + liable_row = None + above_threshold_row = None + + for i, row in enumerate(tbody): header_text = row[0].get("text", "") - if "earning above repayment threshold" in header_text: - # Check if previous context was "Higher education total" - # Actually, row 11 is after HE total row 10, and starts with - # the "earning above" header (no group header due to rowSpan) - target_row = row + if header_text == "Higher education total": + # This row contains liable-to-repay data + liable_row = row + # Next row should be above-threshold data + if i + 1 < len(tbody): + next_row = tbody[i + 1] + next_header = next_row[0].get("text", "") + if "earning above" in next_header: + above_threshold_row = next_row break - if target_row is None: + if above_threshold_row is None: raise ValueError("Could not find 'earning above threshold' row") - - # Parse Plan 2 data (cells 1-6, mapping to plan_2_years) - plan_2_data = {} - for i, year in enumerate(plan_2_years): - cell_idx = 1 + i # Skip header cell - value_text = target_row[cell_idx].get("text", "") - if value_text and value_text not in ("no data", "0"): - value = int(value_text.replace(",", "")) - plan_2_data[year] = value - - # Parse Plan 5 data (cells 7-12, mapping to plan_5_years) - plan_5_data = {} - for i, year in enumerate(plan_5_years): - cell_idx = 7 + i # Skip header + Plan 2 cells - value_text = target_row[cell_idx].get("text", "") - if value_text and 
value_text not in ("no data", "0"): - value = int(value_text.replace(",", "")) - plan_5_data[year] = value - - return {"plan_2": plan_2_data, "plan_5": plan_5_data} + if liable_row is None: + raise ValueError("Could not find 'Higher education total' row") + + def parse_values(row, start_idx, years): + """Parse numeric values from row starting at start_idx.""" + data = {} + for i, year in enumerate(years): + cell_idx = start_idx + i + if cell_idx >= len(row): + continue + value_text = row[cell_idx].get("text", "") + if value_text and value_text not in ("no data", "0"): + data[year] = int(value_text.replace(",", "")) + return data + + # Liable row: data starts at index 2 (after header and subheader) + p2_liable = parse_values(liable_row, 2, plan_2_years) + p5_liable = parse_values(liable_row, 8, plan_5_years) + + # Above threshold row: data starts at index 1 (after header only) + p2_above = parse_values(above_threshold_row, 1, plan_2_years) + p5_above = parse_values(above_threshold_row, 7, plan_5_years) + + return { + "plan_2": {"above_threshold": p2_above, "liable": p2_liable}, + "plan_5": {"above_threshold": p5_above, "liable": p5_liable}, + } def get_targets() -> list[Target]: @@ -115,6 +132,7 @@ def get_targets() -> list[Target]: targets = [] + # Above-threshold targets (borrowers with PAYE deductions) targets.append( Target( name="slc/plan_2_borrowers_above_threshold", @@ -122,11 +140,10 @@ def get_targets() -> list[Target]: source="slc", unit=Unit.COUNT, is_count=True, - values=slc_data["plan_2"], + values=slc_data["plan_2"]["above_threshold"], reference_url=_PERMALINK_URL, ) ) - targets.append( Target( name="slc/plan_5_borrowers_above_threshold", @@ -134,7 +151,31 @@ def get_targets() -> list[Target]: source="slc", unit=Unit.COUNT, is_count=True, - values=slc_data["plan_5"], + values=slc_data["plan_5"]["above_threshold"], + reference_url=_PERMALINK_URL, + ) + ) + + # Liable-to-repay targets (all borrowers including below-threshold) + targets.append( + Target( 
+ name="slc/plan_2_borrowers_liable", + variable="student_loan_plan", + source="slc", + unit=Unit.COUNT, + is_count=True, + values=slc_data["plan_2"]["liable"], + reference_url=_PERMALINK_URL, + ) + ) + targets.append( + Target( + name="slc/plan_5_borrowers_liable", + variable="student_loan_plan", + source="slc", + unit=Unit.COUNT, + is_count=True, + values=slc_data["plan_5"]["liable"], reference_url=_PERMALINK_URL, ) ) diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py index 1209c42f..6a861f88 100644 --- a/policyengine_uk_data/tests/test_student_loan_targets.py +++ b/policyengine_uk_data/tests/test_student_loan_targets.py @@ -10,25 +10,66 @@ def test_slc_targets_registered(): targets = {t.name: t for t in get_all_targets()} assert "slc/plan_2_borrowers_above_threshold" in targets assert "slc/plan_5_borrowers_above_threshold" in targets + assert "slc/plan_2_borrowers_liable" in targets + assert "slc/plan_5_borrowers_liable" in targets -def test_slc_plan2_values(): - """Plan 2 target values match SLC Table 6a.""" +def test_slc_plan2_above_threshold_values(): + """Plan 2 above-threshold values match SLC Table 6a HE total.""" from policyengine_uk_data.targets.registry import get_all_targets targets = {t.name: t for t in get_all_targets()} p2 = targets["slc/plan_2_borrowers_above_threshold"] - assert p2.values[2025] == 3_670_000 - assert p2.values[2026] == 4_130_000 - assert p2.values[2029] == 4_820_000 + # Values from Row 11 (HE total, above threshold) + assert p2.values[2025] == 3_985_000 + assert p2.values[2026] == 4_460_000 + assert p2.values[2030] == 5_205_000 -def test_slc_plan5_values(): - """Plan 5 target values match SLC Table 6a.""" +def test_slc_plan5_above_threshold_values(): + """Plan 5 above-threshold values match SLC Table 6a HE total.""" from policyengine_uk_data.targets.registry import get_all_targets targets = {t.name: t for t in get_all_targets()} p5 = 
targets["slc/plan_5_borrowers_above_threshold"] - assert 2025 not in p5.values # no Plan 5 borrowers yet in 2024-25 - assert p5.values[2026] == 25_000 - assert p5.values[2029] == 700_000 + # Values from Row 11 (HE total, above threshold) + assert 2025 not in p5.values # 0 in 2024-25 + assert p5.values[2026] == 35_000 + assert p5.values[2030] == 1_235_000 + + +def test_slc_plan2_liable_values(): + """Plan 2 liable-to-repay values match SLC Table 6a HE total.""" + from policyengine_uk_data.targets.registry import get_all_targets + + targets = {t.name: t for t in get_all_targets()} + p2 = targets["slc/plan_2_borrowers_liable"] + # Values from Row 10 (HE total, liable to repay) + assert p2.values[2025] == 8_940_000 + assert p2.values[2026] == 9_710_000 + assert p2.values[2030] == 10_525_000 + + +def test_slc_plan5_liable_values(): + """Plan 5 liable-to-repay values match SLC Table 6a HE total.""" + from policyengine_uk_data.targets.registry import get_all_targets + + targets = {t.name: t for t in get_all_targets()} + p5 = targets["slc/plan_5_borrowers_liable"] + # Values from Row 10 (HE total, liable to repay) + assert p5.values[2025] == 10_000 + assert p5.values[2026] == 230_000 + assert p5.values[2030] == 3_400_000 + + +def test_liable_exceeds_above_threshold(): + """Liable-to-repay counts exceed above-threshold counts.""" + from policyengine_uk_data.targets.registry import get_all_targets + + targets = {t.name: t for t in get_all_targets()} + p2_liable = targets["slc/plan_2_borrowers_liable"] + p2_above = targets["slc/plan_2_borrowers_above_threshold"] + + for year in p2_above.values: + if year in p2_liable.values: + assert p2_liable.values[year] > p2_above.values[year] From f9317ffed9394b23159ce1d4c61fc438961784d3 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:10:03 +0000 Subject: [PATCH 2/8] Add changelog entry --- changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/changelog.yaml b/changelog.yaml index 
58f6820c..2328c7ac 100644 --- a/changelog.yaml +++ b/changelog.yaml @@ -576,3 +576,11 @@ - Upload datasets to public HuggingFace repo (policyengine/policyengine-uk-data) in addition to private repo, so policyengine-uk gets the latest data. date: 2026-02-23 13:26:29 +- bump: minor + changes: + added: + - SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers + including those below repayment threshold. + - Probabilistic imputation of below-threshold student loan borrowers based on + tertiary education, age, and SLC forecast data. + date: 2026-02-23 16:00:00 From 29d25fc12dba26209881017852a1cdf82197db3b Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:11:20 +0000 Subject: [PATCH 3/8] Add changelog_entry.yaml --- changelog_entry.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..f8928d56 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,7 @@ +- bump: minor + changes: + added: + - SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers + including those below repayment threshold. + - Probabilistic imputation of below-threshold student loan borrowers based on + tertiary education, age, and SLC forecast data. 
From 2ad6f3da145d59928f8fed3dd82b61924458d4d7 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:27:18 +0000 Subject: [PATCH 4/8] Fix Plan 5 age band to include young graduates --- .../datasets/imputations/student_loans.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index f86655c9..ecfa0415 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -119,13 +119,14 @@ def impute_student_loan_plan( # Estimate university start year (assume started at 18) uni_start_year = year - age + 18 - # Age bands for plausible loan holders - # Plan 1: 32-55 (started before 2012, still repaying) - # Plan 2: 21-33 (started 2012-2022) - # Plan 5: 21-24 (started 2023+) - plan_1_age_mask = (age >= 32) & (age <= 55) - plan_2_age_mask = (age >= 21) & (age <= 33) - plan_5_age_mask = (age >= 21) & (age <= 24) + # Age bands for plausible loan holders (graduates typically 21+) + # Plan 1: 32+ (started before 2012, graduated 21+ by 2015) + # Plan 2: 21+ and cohort 2012-2022 + # Plan 5: 21+ and cohort 2023+ (but in early years, recent grads are 18-22) + plan_1_age_mask = age >= 32 + plan_2_age_mask = age >= 21 + # Plan 5: use cohort constraint only since graduates are very young in early years + plan_5_age_mask = age >= 18 # Anyone 18+ who started 2023+ could have a loan # Cohort masks based on university start year plan_1_cohort = uni_start_year < 2012 From 218ff5e99f14071b4534f0b9227ad9b844ec9f0a Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:34:33 +0000 Subject: [PATCH 5/8] Remove tertiary education filter - SLC shows ~94% of cohort has loans --- .../datasets/imputations/student_loans.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git 
a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index ecfa0415..c2c1f625 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -13,10 +13,10 @@ - PLAN_5: Started September 2023 onwards The FRS only records active repayers (via PAYE). SLC data shows many borrowers -earn below repayment thresholds. This imputation fills that gap by -probabilistically assigning plans to tertiary-educated people without -reported repayments, based on SLC "liable to repay" minus "above threshold" -counts. +earn below repayment thresholds (~55% of Plan 2 holders). This imputation +fills that gap by probabilistically assigning plans to people in the relevant +age cohort without reported repayments, based on SLC "liable to repay" minus +"above threshold" counts. """ import numpy as np @@ -108,12 +108,10 @@ def impute_student_loan_plan( age = sim.calculate("age").values repayments = sim.calculate("student_loan_repayments").values - education = sim.calculate("highest_education").values region = sim.calculate("region", map_to="person").values weights = sim.calculate("person_weight").values is_england = np.isin(region, list(_ENGLAND_REGIONS)) - is_tertiary = education == "TERTIARY" has_repayments = repayments > 0 # Estimate university start year (assume started at 18) @@ -153,9 +151,9 @@ def impute_student_loan_plan( ) # Plan 2 below-threshold assignment + # No tertiary filter - SLC data shows ~94% of cohort has loans plan_2_eligible = ( no_repayments - & is_tertiary & is_england & plan_2_age_mask & plan_2_cohort @@ -170,7 +168,6 @@ def impute_student_loan_plan( # Plan 5 below-threshold assignment plan_5_eligible = ( no_repayments - & is_tertiary & is_england & plan_5_age_mask & plan_5_cohort From 8f9562989829d09b73564c744585c6681f1b9f54 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:46:16 +0000 Subject: [PATCH 
6/8] Fix EDUCQUAL mapping - codes were inverted (1-7 are degrees, not 17-21) --- policyengine_uk_data/datasets/frs.py | 126 +++++++++++++++++++-------- 1 file changed, 92 insertions(+), 34 deletions(-) diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py index 37ce70dc..50a091cc 100644 --- a/policyengine_uk_data/datasets/frs.py +++ b/policyengine_uk_data/datasets/frs.py @@ -208,43 +208,101 @@ def determine_education_level(fted_val, typeed2_val, age_val): ) # Add highest education from EDUCQUAL (highest qualification achieved) - # Codes from FRS ADT_324X classification; unmapped codes default to UPPER_SECONDARY + # Based on FRS 2022-23 documentation (SPSS value labels) EDUCQUAL_MAP = { - 1: "NOT_COMPLETED_PRIMARY", - 2: "LOWER_SECONDARY", # GCSE D-G / CSE 2-5 - 3: "LOWER_SECONDARY", # GCSE A-C / O-level A-C - 4: "UPPER_SECONDARY", # AS-level - 5: "UPPER_SECONDARY", # A-level (1 subject) - 6: "UPPER_SECONDARY", # A-level (2 subjects) - 7: "UPPER_SECONDARY", # A-level (3+ subjects) - 8: "LOWER_SECONDARY", # Scottish Standard/Ordinary Grade - 9: "UPPER_SECONDARY", # Scottish Higher Grade - 10: "UPPER_SECONDARY", # Scottish 6th Year Studies + # Degree level and above (TERTIARY) + 1: "TERTIARY", # Doctorate or MPhil + 2: "TERTIARY", # Masters, PGCE or other postgrad + 3: "TERTIARY", # Degree inc foundation degree + 4: "TERTIARY", # Teaching qualification (excl PGCE) + 5: "TERTIARY", # Foreign qualification at degree level + 6: "TERTIARY", # Other work-related qual at degree level + 7: "TERTIARY", # Other professional qual at degree level + # Higher education below degree (POST_SECONDARY) + 8: "POST_SECONDARY", # Other HE qualification below degree + 9: "POST_SECONDARY", # Nursing or other medical + 10: "POST_SECONDARY", # Diploma in higher education 11: "POST_SECONDARY", # HNC/HND - 12: "POST_SECONDARY", # City & Guilds advanced / BTEC National - 13: "UPPER_SECONDARY", # City & Guilds craft / BTEC General - 14: "POST_SECONDARY", # 
ONC/OND / BTEC National (lower) - 15: "UPPER_SECONDARY", # City & Guilds foundation - 16: "POST_SECONDARY", # RSA advanced - 17: "TERTIARY", # First/foundation degree - 18: "TERTIARY", # Second degree - 19: "TERTIARY", # Higher degree (Masters/PhD) - 20: "TERTIARY", # PGCE / teaching qualification - 21: "TERTIARY", # Nursing/paramedical qualification - 66: "UPPER_SECONDARY", # NVQ/SVQ Level 1 - 67: "UPPER_SECONDARY", # NVQ/SVQ Level 2 - 68: "UPPER_SECONDARY", # NVQ/SVQ Level 3 - 69: "POST_SECONDARY", # NVQ/SVQ Level 4 - 70: "TERTIARY", # NVQ/SVQ Level 5 + 12: "POST_SECONDARY", # BTEC higher level + 13: "POST_SECONDARY", # SCOTVEC higher level + 14: "POST_SECONDARY", # NVQ/SVQ Level 4 + 15: "POST_SECONDARY", # NVQ/SVQ Level 5 + 16: "POST_SECONDARY", # RSA higher diploma / OCR Level 4 + # A-level equivalent (UPPER_SECONDARY) + 17: "UPPER_SECONDARY", # A-Level or equivalent + 18: "UPPER_SECONDARY", # Welsh Baccalaureate Advanced + 19: "UPPER_SECONDARY", # Scottish Baccalaureate + 20: "UPPER_SECONDARY", # International Baccalaureate + 21: "UPPER_SECONDARY", # AS-level or equivalent + 22: "UPPER_SECONDARY", # Certificate of 6th Year Studies + 23: "UPPER_SECONDARY", # Access to Higher Education + 24: "UPPER_SECONDARY", # Scottish Higher/Intermediate + 25: "UPPER_SECONDARY", # Skills for work Higher + 26: "POST_SECONDARY", # ONC/OND + 27: "POST_SECONDARY", # BTEC National level + 28: "POST_SECONDARY", # SCOTVEC National level + 29: "UPPER_SECONDARY", # New Diploma Advanced + 30: "UPPER_SECONDARY", # New Diploma Progression + 31: "UPPER_SECONDARY", # NVQ/SVQ Level 3 + 32: "UPPER_SECONDARY", # GNVQ Advanced + 33: "UPPER_SECONDARY", # RSA advanced diploma / OCR Level 3 + 34: "UPPER_SECONDARY", # City and Guilds advanced craft + 35: "UPPER_SECONDARY", # Welsh Baccalaureate Intermediate + # GCSE/O-level equivalent (LOWER_SECONDARY) + 36: "LOWER_SECONDARY", # O-Level (5+) + 37: "LOWER_SECONDARY", # Scottish Standard Grade (5+) + 38: "LOWER_SECONDARY", # GCSE (5+) + 39: 
"LOWER_SECONDARY", # CSE (5+) + 40: "LOWER_SECONDARY", # Scottish National level 5 + 41: "LOWER_SECONDARY", # Skills for work National 5 + 42: "LOWER_SECONDARY", # BTEC first diploma + 43: "LOWER_SECONDARY", # SCOTVEC first diploma + 44: "LOWER_SECONDARY", # New Diploma Higher (level 2) + 45: "LOWER_SECONDARY", # NVQ/SVQ Level 2 + 46: "LOWER_SECONDARY", # GNVQ Intermediate + 47: "LOWER_SECONDARY", # RSA diploma / OCR Level 2 + 48: "LOWER_SECONDARY", # City and Guilds craft + 49: "LOWER_SECONDARY", # Other high school leavers qual + # Below GCSE / basic qualifications + 50: "LOWER_SECONDARY", # BTEC (unspecified) + 51: "LOWER_SECONDARY", # BTEC first cert + 52: "LOWER_SECONDARY", # SCOTVEC (unspecified) + 53: "LOWER_SECONDARY", # SCOTVEC first cert + 54: "LOWER_SECONDARY", # SCOTVEC modules + 55: "LOWER_SECONDARY", # New Diploma (unspecified) + 56: "LOWER_SECONDARY", # New Diploma Foundation + 57: "LOWER_SECONDARY", # Welsh Baccalaureate (unspecified) + 58: "LOWER_SECONDARY", # Welsh Baccalaureate Foundation + 59: "LOWER_SECONDARY", # NVQ/SVQ (unspecified) + 60: "LOWER_SECONDARY", # NVQ/SVQ Level 1 + 61: "LOWER_SECONDARY", # GNVQ (unspecified) + 62: "LOWER_SECONDARY", # GNVQ Part One Intermediate + 63: "LOWER_SECONDARY", # GNVQ Full Foundation + 64: "LOWER_SECONDARY", # GNVQ Part One Foundation + 65: "LOWER_SECONDARY", # O-Level (unspecified) + 66: "LOWER_SECONDARY", # O-Level (fewer than 5) + 67: "LOWER_SECONDARY", # Scottish Standard Grade (unspecified) + 68: "LOWER_SECONDARY", # Scottish Standard Grade (fewer than 5) + 69: "LOWER_SECONDARY", # GCSE (unspecified) + 70: "LOWER_SECONDARY", # GCSE (fewer than 5) + 71: "LOWER_SECONDARY", # Scottish National 1-4 + 72: "LOWER_SECONDARY", # Scottish National (unspecified) + 73: "LOWER_SECONDARY", # Skills for work National 3-4 + 74: "LOWER_SECONDARY", # Skills for work (unspecified) + 75: "LOWER_SECONDARY", # CSE (unspecified) + 76: "LOWER_SECONDARY", # CSE (fewer than 5) + 77: "LOWER_SECONDARY", # RSA/OCR (unspecified) 
+ 78: "LOWER_SECONDARY", # RSA other / OCR Level 1 + 79: "LOWER_SECONDARY", # City and Guilds (unspecified) + 80: "LOWER_SECONDARY", # City and Guilds foundation + 81: "LOWER_SECONDARY", # YT Certificate + 82: "LOWER_SECONDARY", # Key Skills / Core Skills + 83: "NOT_COMPLETED_PRIMARY", # Basic Skills (literacy/numeracy) + 84: "NOT_COMPLETED_PRIMARY", # Entry Level Qualifications + 85: "NOT_COMPLETED_PRIMARY", # Award/Certificate at entry level + 86: "LOWER_SECONDARY", # Other professional/vocational/foreign } - # Codes 22-65 and 71-85 are further vocational/professional qualifications; - # treat as POST_SECONDARY. Codes 86-87 are catch-alls; treat as UPPER_SECONDARY. - for code in range(22, 66): - EDUCQUAL_MAP[code] = "POST_SECONDARY" - for code in range(71, 86): - EDUCQUAL_MAP[code] = "POST_SECONDARY" - EDUCQUAL_MAP[86] = "UPPER_SECONDARY" - EDUCQUAL_MAP[87] = "UPPER_SECONDARY" + # Code 87 is left unmapped - it means no qualification data; default to UPPER_SECONDARY educqual = pd.to_numeric(person.educqual, errors="coerce") pe_person["highest_education"] = educqual.map(EDUCQUAL_MAP).fillna( From 48a8f12b50e613a71f2b21af3eaf228d1f2edb0c Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:47:01 +0000 Subject: [PATCH 7/8] Re-enable tertiary filter now that EDUCQUAL mapping is fixed --- .../datasets/imputations/student_loans.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index c2c1f625..72c90ab3 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -14,9 +14,9 @@ The FRS only records active repayers (via PAYE). SLC data shows many borrowers earn below repayment thresholds (~55% of Plan 2 holders). 
This imputation -fills that gap by probabilistically assigning plans to people in the relevant -age cohort without reported repayments, based on SLC "liable to repay" minus -"above threshold" counts. +fills that gap by probabilistically assigning plans to tertiary-educated +people in the relevant age cohort without reported repayments, based on SLC +"liable to repay" minus "above threshold" counts. """ import numpy as np @@ -110,8 +110,10 @@ def impute_student_loan_plan( repayments = sim.calculate("student_loan_repayments").values region = sim.calculate("region", map_to="person").values weights = sim.calculate("person_weight").values + education = sim.calculate("highest_education").values is_england = np.isin(region, list(_ENGLAND_REGIONS)) + is_tertiary = education == "TERTIARY" has_repayments = repayments > 0 # Estimate university start year (assume started at 18) @@ -151,9 +153,9 @@ def impute_student_loan_plan( ) # Plan 2 below-threshold assignment - # No tertiary filter - SLC data shows ~94% of cohort has loans plan_2_eligible = ( no_repayments + & is_tertiary & is_england & plan_2_age_mask & plan_2_cohort @@ -168,6 +170,7 @@ def impute_student_loan_plan( # Plan 5 below-threshold assignment plan_5_eligible = ( no_repayments + & is_tertiary & is_england & plan_5_age_mask & plan_5_cohort From 133a8d14c82f9651f9894cf9880c149a83d8cba0 Mon Sep 17 00:00:00 2001 From: Nikhil Woodruff Date: Mon, 23 Feb 2026 15:58:27 +0000 Subject: [PATCH 8/8] Relax Plan 2 age constraints for better coverage The strict cohort constraint (ages 21-31) missed many Plan 2 borrowers who started university late or did postgrad studies. This change: 1. Bounds the Plan 2 age mask at 21-45 (previously 21+ with no upper limit) 2. Uses age mask (not cohort) for Plan 2 below-threshold assignment 3. Assigns repayers to Plan 2 if age 21-45 and not Plan 1 cohort The below-threshold imputation now covers the full 4.95M target. 
The remaining gap (FRS shows 1.4M repayers vs SLC's 4M) is a data collection issue that calibration targets will help address. --- .../datasets/imputations/student_loans.py | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py index 72c90ab3..084ccf29 100644 --- a/policyengine_uk_data/datasets/imputations/student_loans.py +++ b/policyengine_uk_data/datasets/imputations/student_loans.py @@ -119,14 +119,13 @@ def impute_student_loan_plan( # Estimate university start year (assume started at 18) uni_start_year = year - age + 18 - # Age bands for plausible loan holders (graduates typically 21+) + # Age bands for plausible loan holders # Plan 1: 32+ (started before 2012, graduated 21+ by 2015) - # Plan 2: 21+ and cohort 2012-2022 - # Plan 5: 21+ and cohort 2023+ (but in early years, recent grads are 18-22) + # Plan 2: 21-45 (allows for late starters, postgrads, career changers) + # Plan 5: 18-25 (recent starters only - cohort 2023+) plan_1_age_mask = age >= 32 - plan_2_age_mask = age >= 21 - # Plan 5: use cohort constraint only since graduates are very young in early years - plan_5_age_mask = age >= 18 # Anyone 18+ who started 2023+ could have a loan + plan_2_age_mask = (age >= 21) & (age <= 45) + plan_5_age_mask = (age >= 18) & (age <= 25) # Cohort masks based on university start year plan_1_cohort = uni_start_year < 2012 @@ -136,8 +135,11 @@ def impute_student_loan_plan( plan = np.full(len(age), "NONE", dtype=object) # Step 1: Assign plans to people with reported repayments + # Plan 1: use cohort (started before 2012) + # Plan 2: use age mask (21-45) since many late starters and postgrads exist + # Plan 5: use cohort (started 2023+) plan[has_repayments & plan_1_cohort] = "PLAN_1" - plan[has_repayments & plan_2_cohort] = "PLAN_2" + plan[has_repayments & plan_2_age_mask & ~plan_1_cohort] = "PLAN_2" plan[has_repayments & 
plan_5_cohort] = "PLAN_5" # Step 2: Probabilistically assign below-threshold borrowers @@ -153,12 +155,9 @@ def impute_student_loan_plan( ) # Plan 2 below-threshold assignment + # Use age mask only (not cohort) since many borrowers started late or did postgrad plan_2_eligible = ( - no_repayments - & is_tertiary - & is_england - & plan_2_age_mask - & plan_2_cohort + no_repayments & is_tertiary & is_england & plan_2_age_mask ) if plan_2_below > 0 and plan_2_eligible.sum() > 0: eligible_weight = (weights * plan_2_eligible).sum()