From 73880bbf843487070496177526b538481de1e612 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:01:36 +0000
Subject: [PATCH 1/8] feat: impute below-threshold student loan borrowers

The FRS only captures borrowers making PAYE repayments (above threshold).
SLC data shows ~55% of Plan 2 holders earn below threshold. This adds
probabilistic imputation of below-threshold borrowers for Plan 2 and
Plan 5 based on SLC "liable to repay" counts.

Changes:
- Add slc/plan_*_borrowers_liable targets alongside the existing
  slc/plan_*_borrowers_above_threshold targets
- Probabilistically assign plans to tertiary-educated people without
  repayments, constrained by age band and cohort
- Add compute_student_loan_plan_liable() for calibration

Closes #281
---
 .../datasets/imputations/student_loans.py     | 162 +++++++++++++++---
 .../targets/build_loss_matrix.py              |   5 +-
 .../targets/compute/__init__.py               |   2 +
 policyengine_uk_data/targets/compute/other.py |  30 +++-
 policyengine_uk_data/targets/sources/slc.py   | 135 ++++++++++-----
 .../tests/test_student_loan_targets.py        |  61 +++++--
 6 files changed, 308 insertions(+), 87 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index 38391899..f86655c9 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -2,30 +2,86 @@
 Student loan plan imputation.
 
 This module imputes the student_loan_plan variable based on:
-- Whether the person has reported student loan repayments
+- Whether the person has reported student loan repayments (above threshold)
 - Their estimated university attendance year (inferred from age)
+- Probabilistic assignment for below-threshold borrowers
 
 The imputation assigns plan types according to when the loan system changed:
-- NONE: No reported repayments
+- NONE: No loan
 - PLAN_1: Started university before September 2012
 - PLAN_2: Started September 2012 - August 2023
 - PLAN_5: Started September 2023 onwards
 
-This enables policyengine-uk's student_loan_repayment variable to calculate
-repayments using official threshold parameters.
+The FRS only records active repayers (via PAYE). SLC data shows many borrowers
+earn below repayment thresholds. This imputation fills that gap by
+probabilistically assigning plans to tertiary-educated people without
+reported repayments, based on SLC "liable to repay" minus "above threshold"
+counts.
 """
 
 import numpy as np
 from policyengine_uk.data import UKSingleYearDataset
 from policyengine_uk import Microsimulation
 
+# England regions for filtering (SLC data covers England only)
+_ENGLAND_REGIONS = {
+    "NORTH_EAST",
+    "NORTH_WEST",
+    "YORKSHIRE",
+    "EAST_MIDLANDS",
+    "WEST_MIDLANDS",
+    "EAST_OF_ENGLAND",
+    "LONDON",
+    "SOUTH_EAST",
+    "SOUTH_WEST",
+}
+
+# SLC liable-to-repay counts (Higher Education total, England)
+# Source: https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
+_PLAN_2_LIABLE = {
+    2025: 8_940_000,
+    2026: 9_710_000,
+    2027: 10_360_000,
+    2028: 10_615_000,
+    2029: 10_600_000,
+    2030: 10_525_000,
+}
+
+_PLAN_5_LIABLE = {
+    2025: 10_000,
+    2026: 230_000,
+    2027: 630_000,
+    2028: 1_380_000,
+    2029: 2_360_000,
+    2030: 3_400_000,
+}
+
+# SLC above-threshold counts (borrowers making repayments)
+_PLAN_2_ABOVE_THRESHOLD = {
+    2025: 3_985_000,
+    2026: 4_460_000,
+    2027: 4_825_000,
+    2028: 5_045_000,
+    2029: 5_160_000,
+    2030: 5_205_000,
+}
+
+_PLAN_5_ABOVE_THRESHOLD = {
+    2026: 35_000,
+    2027: 145_000,
+    2028: 390_000,
+    2029: 770_000,
+    2030: 1_235_000,
+}
+
 
 def impute_student_loan_plan(
     dataset: UKSingleYearDataset,
     year: int = 2025,
+    seed: int = 42,
 ) -> UKSingleYearDataset:
     """
-    Impute student loan plan type based on age and reported repayments.
+    Impute student loan plan type based on age, repayments, and education.
 
     The plan type determines which repayment threshold applies:
     - PLAN_1: £26,065 (2025), pre-Sept 2012 England/Wales
@@ -33,46 +89,98 @@ def impute_student_loan_plan(
     - PLAN_4: Scottish loans (not imputed here - requires explicit flag)
     - PLAN_5: £25,000 (2025), Sept 2023 onwards
 
+    This function:
+    1. Assigns plans to people with reported repayments (above threshold)
+    2. Probabilistically assigns plans to tertiary-educated people without
+       repayments (below threshold) to match SLC liable-to-repay totals
+
     Args:
         dataset: PolicyEngine UK dataset with student_loan_repayments.
         year: The simulation year, used to estimate university attendance.
+        seed: Random seed for reproducibility.
 
     Returns:
         Dataset with imputed student_loan_plan values.
     """
     dataset = dataset.copy()
     sim = Microsimulation(dataset=dataset)
+    rng = np.random.default_rng(seed)
 
-    # Get required variables
     age = sim.calculate("age").values
-    student_loan_repayments = sim.calculate("student_loan_repayments").values
-
-    # Determine if person has a student loan based on reported repayments
-    has_student_loan = student_loan_repayments > 0
-
-    # Estimate when they started university (assume age 18)
-    # For simulation year Y and age A, university start year = Y - A + 18
-    estimated_uni_start_year = year - age + 18
+    repayments = sim.calculate("student_loan_repayments").values
+    education = sim.calculate("highest_education").values
+    region = sim.calculate("region", map_to="person").values
+    weights = sim.calculate("person_weight").values
+
+    is_england = np.isin(region, list(_ENGLAND_REGIONS))
+    is_tertiary = education == "TERTIARY"
+    has_repayments = repayments > 0
+
+    # Estimate university start year (assume started at 18)
+    uni_start_year = year - age + 18
+
+    # Age bands for plausible loan holders
+    # Plan 1: 32-55 (started before 2012, still repaying)
+    # Plan 2: 21-33 (started 2012-2022)
+    # Plan 5: 21-24 (started 2023+)
+    plan_1_age_mask = (age >= 32) & (age <= 55)
+    plan_2_age_mask = (age >= 21) & (age <= 33)
+    plan_5_age_mask = (age >= 21) & (age <= 24)
+
+    # Cohort masks based on university start year
+    plan_1_cohort = uni_start_year < 2012
+    plan_2_cohort = (uni_start_year >= 2012) & (uni_start_year < 2023)
+    plan_5_cohort = uni_start_year >= 2023
 
-    # Assign plan types based on when loan system changed
-    # StudentLoanPlan is a string enum: "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
     plan = np.full(len(age), "NONE", dtype=object)
 
-    # Plan 1: Started before September 2012
-    plan_1_mask = has_student_loan & (estimated_uni_start_year < 2012)
-    plan[plan_1_mask] = "PLAN_1"
+    # Step 1: Assign plans to people with reported repayments
+    plan[has_repayments & plan_1_cohort] = "PLAN_1"
+    plan[has_repayments & plan_2_cohort] = "PLAN_2"
+    plan[has_repayments & plan_5_cohort] = "PLAN_5"
 
-    # Plan 2: Started September 2012 - August 2023
-    plan_2_mask = has_student_loan & (
-        (estimated_uni_start_year >= 2012) & (estimated_uni_start_year < 2023)
+    # Step 2: Probabilistically assign below-threshold borrowers
+    # Only for tertiary-educated people in England without repayments
+    no_repayments = ~has_repayments
+
+    # Calculate target below-threshold counts
+    plan_2_below = _PLAN_2_LIABLE.get(year, 0) - _PLAN_2_ABOVE_THRESHOLD.get(
+        year, 0
+    )
+    plan_5_below = _PLAN_5_LIABLE.get(year, 0) - _PLAN_5_ABOVE_THRESHOLD.get(
+        year, 0
     )
-    plan[plan_2_mask] = "PLAN_2"
 
-    # Plan 5: Started September 2023 onwards
-    plan_5_mask = has_student_loan & (estimated_uni_start_year >= 2023)
-    plan[plan_5_mask] = "PLAN_5"
+    # Plan 2 below-threshold assignment
+    plan_2_eligible = (
+        no_repayments
+        & is_tertiary
+        & is_england
+        & plan_2_age_mask
+        & plan_2_cohort
+    )
+    if plan_2_below > 0 and plan_2_eligible.sum() > 0:
+        eligible_weight = (weights * plan_2_eligible).sum()
+        if eligible_weight > 0:
+            prob = min(1.0, plan_2_below / eligible_weight)
+            draws = rng.random(len(age))
+            plan[plan_2_eligible & (draws < prob)] = "PLAN_2"
+
+    # Plan 5 below-threshold assignment
+    plan_5_eligible = (
+        no_repayments
+        & is_tertiary
+        & is_england
+        & plan_5_age_mask
+        & plan_5_cohort
+    )
+    if plan_5_below > 0 and plan_5_eligible.sum() > 0:
+        eligible_weight = (weights * plan_5_eligible).sum()
+        if eligible_weight > 0:
+            prob = min(1.0, plan_5_below / eligible_weight)
+            draws = rng.random(len(age))
+            plan[plan_5_eligible & (draws < prob)] = "PLAN_5"
 
-    # Store as the plan type
     dataset.person["student_loan_plan"] = plan
 
     return dataset
diff --git a/policyengine_uk_data/targets/build_loss_matrix.py b/policyengine_uk_data/targets/build_loss_matrix.py
index 8af391a3..92e3f5b6 100644
--- a/policyengine_uk_data/targets/build_loss_matrix.py
+++ b/policyengine_uk_data/targets/build_loss_matrix.py
@@ -38,6 +38,7 @@
     compute_scotland_uc_child,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_ss_contributions,
     compute_ss_headcount,
     compute_ss_it_relief,
@@ -306,8 +307,10 @@ def _compute_column(
         return compute_scottish_child_payment(target, ctx)
 
     # Student loan plan borrower counts (SLC)
-    if name.startswith("slc/plan_"):
+    if name.startswith("slc/plan_") and "above_threshold" in name:
         return compute_student_loan_plan(target, ctx)
+    if name.startswith("slc/plan_") and "liable" in name:
+        return compute_student_loan_plan_liable(target, ctx)
 
     # PIP claimants
     if name in (
diff --git a/policyengine_uk_data/targets/compute/__init__.py b/policyengine_uk_data/targets/compute/__init__.py
index 8a329c30..47e7c5e3 100644
--- a/policyengine_uk_data/targets/compute/__init__.py
+++ b/policyengine_uk_data/targets/compute/__init__.py
@@ -38,6 +38,7 @@
     compute_savings_interest,
     compute_scottish_child_payment,
     compute_student_loan_plan,
+    compute_student_loan_plan_liable,
     compute_vehicles,
 )
 
@@ -57,6 +58,7 @@
     "compute_scotland_uc_child",
     "compute_scottish_child_payment",
     "compute_student_loan_plan",
+    "compute_student_loan_plan_liable",
     "compute_ss_contributions",
     "compute_ss_headcount",
     "compute_ss_it_relief",
diff --git a/policyengine_uk_data/targets/compute/other.py b/policyengine_uk_data/targets/compute/other.py
index c2037c8e..0631793f 100644
--- a/policyengine_uk_data/targets/compute/other.py
+++ b/policyengine_uk_data/targets/compute/other.py
@@ -52,8 +52,8 @@ def compute_scottish_child_payment(target, ctx) -> np.ndarray:
 def compute_student_loan_plan(target, ctx) -> np.ndarray:
     """Count England borrowers on a given plan with repayments > 0.
 
-    SLC targets cover borrowers liable to repay AND earning above threshold
-    in England only — matching exactly what the FRS captures via PAYE.
+    SLC "above_threshold" targets cover borrowers liable to repay AND earning
+    above threshold in England only — matching what the FRS captures via PAYE.
     """
     plan_name = target.name  # e.g. "slc/plan_2_borrowers_above_threshold"
     if "plan_2" in plan_name:
@@ -65,6 +65,32 @@ def compute_student_loan_plan(target, ctx) -> np.ndarray:
 
     plan = ctx.sim.calculate("student_loan_plan").values
     region = ctx.sim.calculate("region", map_to="person").values
+    repayments = ctx.sim.calculate("student_loan_repayments").values
+
+    is_england = np.isin(region, list(_ENGLAND_REGIONS))
+    # Only count those with repayments > 0 (above threshold)
+    on_plan = (plan == plan_value) & is_england & (repayments > 0)
+
+    return ctx.household_from_person(on_plan.astype(float))
+
+
+def compute_student_loan_plan_liable(target, ctx) -> np.ndarray:
+    """Count ALL England borrowers on a given plan (including below-threshold).
+
+    SLC "liable" targets cover all borrowers liable to repay, regardless of
+    whether they earn above the repayment threshold.
+    """
+    plan_name = target.name  # e.g. "slc/plan_2_borrowers_liable"
+    if "plan_2" in plan_name:
+        plan_value = "PLAN_2"
+    elif "plan_5" in plan_name:
+        plan_value = "PLAN_5"
+    else:
+        return None
+
+    plan = ctx.sim.calculate("student_loan_plan").values
+    region = ctx.sim.calculate("region", map_to="person").values
+
     is_england = np.isin(region, list(_ENGLAND_REGIONS))
     on_plan = (plan == plan_value) & is_england
 
diff --git a/policyengine_uk_data/targets/sources/slc.py b/policyengine_uk_data/targets/sources/slc.py
index 557f5236..af484751 100644
--- a/policyengine_uk_data/targets/sources/slc.py
+++ b/policyengine_uk_data/targets/sources/slc.py
@@ -1,8 +1,11 @@
 """Student Loans Company (SLC) calibration targets.
 
-Borrower counts for England only: Plan 2 and Plan 5, restricted to
-borrowers liable to repay and earning above the repayment threshold.
-This matches the FRS coverage (PAYE deductions only).
+Borrower counts for England only: Plan 2 and Plan 5.
+
+Two types of targets are provided:
+1. "above threshold" - borrowers liable to repay AND earning above threshold
+   (matches FRS coverage via PAYE deductions)
+2. "liable" - total borrowers liable to repay (includes below-threshold)
 
 Source: Explore Education Statistics — Student loan forecasts for England,
 Table 6a: Forecast number of student borrowers liable to repay and number
@@ -33,13 +36,16 @@ def _fetch_slc_data() -> dict:
     """Fetch and parse SLC Table 6a data from Explore Education Statistics.
 
     Returns:
-        Dict with keys 'plan_2' and 'plan_5', each containing a dict
-        mapping calendar year (int) to borrower count above threshold (int).
+        Dict with nested structure:
+        {
+            'plan_2': {'above_threshold': {...}, 'liable': {...}},
+            'plan_5': {'above_threshold': {...}, 'liable': {...}}
+        }
+        Each inner dict maps calendar year (int) to borrower count (int).
     """
     response = requests.get(_PERMALINK_URL, timeout=30)
     response.raise_for_status()
 
-    # Extract JSON data from __NEXT_DATA__ script tag
     match = re.search(
         r'<script id="__NEXT_DATA__" type="application/json">(.+?)</script>',
         response.text,
@@ -50,63 +56,74 @@ def _fetch_slc_data() -> dict:
     next_data = json.loads(match.group(1))
     table_json = next_data["props"]["pageProps"]["data"]["table"]["json"]
 
-    # Parse header row to get years - columns go newest to oldest
+    # Parse header row to get years (columns go newest to oldest)
     # Structure: Plan 2 (6 years), Plan 5 (6 years), Plan 3 (5 years)
     header_row = table_json["thead"][1]
 
-    # Get Plan 2 years (first 6 columns)
     plan_2_years = []
     for i in range(6):
         year_text = header_row[i]["text"]  # e.g., "2029-30"
         start_year = int(year_text.split("-")[0])
-        calendar_year = start_year + 1  # 2029-30 → 2030
-        plan_2_years.append(calendar_year)
+        plan_2_years.append(start_year + 1)  # 2029-30 → 2030
 
-    # Get Plan 5 years (next 6 columns)
     plan_5_years = []
     for i in range(6, 12):
         year_text = header_row[i]["text"]
         start_year = int(year_text.split("-")[0])
-        calendar_year = start_year + 1
-        plan_5_years.append(calendar_year)
+        plan_5_years.append(start_year + 1)
 
-    # Find the "Higher education total" / "earning above threshold" row
-    # This is the row following "Higher education total" with "liable to repay"
     tbody = table_json["tbody"]
 
-    # Row 11 contains: header + 6 Plan 2 values + 6 Plan 5 values + 5 Plan 3
-    target_row = None
-    for row in tbody:
+    # Find "Higher education total" rows
+    # Row 10: [0]="Higher education total", [1]="Number of borrowers liable...",
+    #         [2-7]=Plan 2 data, [8-13]=Plan 5 data
+    # Row 11: [0]="Number of borrowers...earning above...",
+    #         [1-6]=Plan 2 data, [7-12]=Plan 5 data
+    liable_row = None
+    above_threshold_row = None
+
+    for i, row in enumerate(tbody):
         header_text = row[0].get("text", "")
-        if "earning above repayment threshold" in header_text:
-            # Check if previous context was "Higher education total"
-            # Actually, row 11 is after HE total row 10, and starts with
-            # the "earning above" header (no group header due to rowSpan)
-            target_row = row
+        if header_text == "Higher education total":
+            # This row contains liable-to-repay data
+            liable_row = row
+            # Next row should be above-threshold data
+            if i + 1 < len(tbody):
+                next_row = tbody[i + 1]
+                next_header = next_row[0].get("text", "")
+                if "earning above" in next_header:
+                    above_threshold_row = next_row
             break
 
-    if target_row is None:
+    if above_threshold_row is None:
         raise ValueError("Could not find 'earning above threshold' row")
-
-    # Parse Plan 2 data (cells 1-6, mapping to plan_2_years)
-    plan_2_data = {}
-    for i, year in enumerate(plan_2_years):
-        cell_idx = 1 + i  # Skip header cell
-        value_text = target_row[cell_idx].get("text", "")
-        if value_text and value_text not in ("no data", "0"):
-            value = int(value_text.replace(",", ""))
-            plan_2_data[year] = value
-
-    # Parse Plan 5 data (cells 7-12, mapping to plan_5_years)
-    plan_5_data = {}
-    for i, year in enumerate(plan_5_years):
-        cell_idx = 7 + i  # Skip header + Plan 2 cells
-        value_text = target_row[cell_idx].get("text", "")
-        if value_text and value_text not in ("no data", "0"):
-            value = int(value_text.replace(",", ""))
-            plan_5_data[year] = value
-
-    return {"plan_2": plan_2_data, "plan_5": plan_5_data}
+    if liable_row is None:
+        raise ValueError("Could not find 'Higher education total' row")
+
+    def parse_values(row, start_idx, years):
+        """Parse numeric values from row starting at start_idx."""
+        data = {}
+        for i, year in enumerate(years):
+            cell_idx = start_idx + i
+            if cell_idx >= len(row):
+                continue
+            value_text = row[cell_idx].get("text", "")
+            if value_text and value_text not in ("no data", "0"):
+                data[year] = int(value_text.replace(",", ""))
+        return data
+
+    # Liable row: data starts at index 2 (after header and subheader)
+    p2_liable = parse_values(liable_row, 2, plan_2_years)
+    p5_liable = parse_values(liable_row, 8, plan_5_years)
+
+    # Above threshold row: data starts at index 1 (after header only)
+    p2_above = parse_values(above_threshold_row, 1, plan_2_years)
+    p5_above = parse_values(above_threshold_row, 7, plan_5_years)
+
+    return {
+        "plan_2": {"above_threshold": p2_above, "liable": p2_liable},
+        "plan_5": {"above_threshold": p5_above, "liable": p5_liable},
+    }
 
 
 def get_targets() -> list[Target]:
@@ -115,6 +132,7 @@ def get_targets() -> list[Target]:
 
     targets = []
 
+    # Above-threshold targets (borrowers with PAYE deductions)
     targets.append(
         Target(
             name="slc/plan_2_borrowers_above_threshold",
@@ -122,11 +140,10 @@ def get_targets() -> list[Target]:
             source="slc",
             unit=Unit.COUNT,
             is_count=True,
-            values=slc_data["plan_2"],
+            values=slc_data["plan_2"]["above_threshold"],
             reference_url=_PERMALINK_URL,
         )
     )
-
     targets.append(
         Target(
             name="slc/plan_5_borrowers_above_threshold",
@@ -134,7 +151,31 @@ def get_targets() -> list[Target]:
             source="slc",
             unit=Unit.COUNT,
             is_count=True,
-            values=slc_data["plan_5"],
+            values=slc_data["plan_5"]["above_threshold"],
+            reference_url=_PERMALINK_URL,
+        )
+    )
+
+    # Liable-to-repay targets (all borrowers including below-threshold)
+    targets.append(
+        Target(
+            name="slc/plan_2_borrowers_liable",
+            variable="student_loan_plan",
+            source="slc",
+            unit=Unit.COUNT,
+            is_count=True,
+            values=slc_data["plan_2"]["liable"],
+            reference_url=_PERMALINK_URL,
+        )
+    )
+    targets.append(
+        Target(
+            name="slc/plan_5_borrowers_liable",
+            variable="student_loan_plan",
+            source="slc",
+            unit=Unit.COUNT,
+            is_count=True,
+            values=slc_data["plan_5"]["liable"],
             reference_url=_PERMALINK_URL,
         )
     )
diff --git a/policyengine_uk_data/tests/test_student_loan_targets.py b/policyengine_uk_data/tests/test_student_loan_targets.py
index 1209c42f..6a861f88 100644
--- a/policyengine_uk_data/tests/test_student_loan_targets.py
+++ b/policyengine_uk_data/tests/test_student_loan_targets.py
@@ -10,25 +10,66 @@ def test_slc_targets_registered():
     targets = {t.name: t for t in get_all_targets()}
     assert "slc/plan_2_borrowers_above_threshold" in targets
     assert "slc/plan_5_borrowers_above_threshold" in targets
+    assert "slc/plan_2_borrowers_liable" in targets
+    assert "slc/plan_5_borrowers_liable" in targets
 
 
-def test_slc_plan2_values():
-    """Plan 2 target values match SLC Table 6a."""
+def test_slc_plan2_above_threshold_values():
+    """Plan 2 above-threshold values match SLC Table 6a HE total."""
     from policyengine_uk_data.targets.registry import get_all_targets
 
     targets = {t.name: t for t in get_all_targets()}
     p2 = targets["slc/plan_2_borrowers_above_threshold"]
-    assert p2.values[2025] == 3_670_000
-    assert p2.values[2026] == 4_130_000
-    assert p2.values[2029] == 4_820_000
+    # Values from Row 11 (HE total, above threshold)
+    assert p2.values[2025] == 3_985_000
+    assert p2.values[2026] == 4_460_000
+    assert p2.values[2030] == 5_205_000
 
 
-def test_slc_plan5_values():
-    """Plan 5 target values match SLC Table 6a."""
+def test_slc_plan5_above_threshold_values():
+    """Plan 5 above-threshold values match SLC Table 6a HE total."""
     from policyengine_uk_data.targets.registry import get_all_targets
 
     targets = {t.name: t for t in get_all_targets()}
     p5 = targets["slc/plan_5_borrowers_above_threshold"]
-    assert 2025 not in p5.values  # no Plan 5 borrowers yet in 2024-25
-    assert p5.values[2026] == 25_000
-    assert p5.values[2029] == 700_000
+    # Values from Row 11 (HE total, above threshold)
+    assert 2025 not in p5.values  # 0 in 2024-25
+    assert p5.values[2026] == 35_000
+    assert p5.values[2030] == 1_235_000
+
+
+def test_slc_plan2_liable_values():
+    """Plan 2 liable-to-repay values match SLC Table 6a HE total."""
+    from policyengine_uk_data.targets.registry import get_all_targets
+
+    targets = {t.name: t for t in get_all_targets()}
+    p2 = targets["slc/plan_2_borrowers_liable"]
+    # Values from Row 10 (HE total, liable to repay)
+    assert p2.values[2025] == 8_940_000
+    assert p2.values[2026] == 9_710_000
+    assert p2.values[2030] == 10_525_000
+
+
+def test_slc_plan5_liable_values():
+    """Plan 5 liable-to-repay values match SLC Table 6a HE total."""
+    from policyengine_uk_data.targets.registry import get_all_targets
+
+    targets = {t.name: t for t in get_all_targets()}
+    p5 = targets["slc/plan_5_borrowers_liable"]
+    # Values from Row 10 (HE total, liable to repay)
+    assert p5.values[2025] == 10_000
+    assert p5.values[2026] == 230_000
+    assert p5.values[2030] == 3_400_000
+
+
+def test_liable_exceeds_above_threshold():
+    """Liable-to-repay counts exceed above-threshold counts."""
+    from policyengine_uk_data.targets.registry import get_all_targets
+
+    targets = {t.name: t for t in get_all_targets()}
+    p2_liable = targets["slc/plan_2_borrowers_liable"]
+    p2_above = targets["slc/plan_2_borrowers_above_threshold"]
+
+    for year in p2_above.values:
+        if year in p2_liable.values:
+            assert p2_liable.values[year] > p2_above.values[year]

From f9317ffed9394b23159ce1d4c61fc438961784d3 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:10:03 +0000
Subject: [PATCH 2/8] Add changelog entry

---
 changelog.yaml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/changelog.yaml b/changelog.yaml
index 58f6820c..2328c7ac 100644
--- a/changelog.yaml
+++ b/changelog.yaml
@@ -576,3 +576,11 @@
     - Upload datasets to public HuggingFace repo (policyengine/policyengine-uk-data)
       in addition to private repo, so policyengine-uk gets the latest data.
   date: 2026-02-23 13:26:29
+- bump: minor
+  changes:
+    added:
+    - SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers
+      including those below repayment threshold.
+    - Probabilistic imputation of below-threshold student loan borrowers based on
+      tertiary education, age, and SLC forecast data.
+  date: 2026-02-23 16:00:00

From 29d25fc12dba26209881017852a1cdf82197db3b Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:11:20 +0000
Subject: [PATCH 3/8] Add changelog_entry.yaml

---
 changelog_entry.yaml | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/changelog_entry.yaml b/changelog_entry.yaml
index e69de29b..f8928d56 100644
--- a/changelog_entry.yaml
+++ b/changelog_entry.yaml
@@ -0,0 +1,7 @@
+- bump: minor
+  changes:
+    added:
+    - SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers
+      including those below repayment threshold.
+    - Probabilistic imputation of below-threshold student loan borrowers based on
+      tertiary education, age, and SLC forecast data.

From 2ad6f3da145d59928f8fed3dd82b61924458d4d7 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:27:18 +0000
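Subject: [PATCH 4/8] Fix Plan 5 age band to include young borrowers

Lower the Plan 5 minimum age from 21 to 18 and remove its upper limit
of 24, since people who started university in 2023 or later are still
very young in the early simulation years. Also remove the upper age
limits on the Plan 1 (55) and Plan 2 (33) bands.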

---
 .../datasets/imputations/student_loans.py         | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index f86655c9..ecfa0415 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -119,13 +119,14 @@ def impute_student_loan_plan(
     # Estimate university start year (assume started at 18)
     uni_start_year = year - age + 18
 
-    # Age bands for plausible loan holders
-    # Plan 1: 32-55 (started before 2012, still repaying)
-    # Plan 2: 21-33 (started 2012-2022)
-    # Plan 5: 21-24 (started 2023+)
-    plan_1_age_mask = (age >= 32) & (age <= 55)
-    plan_2_age_mask = (age >= 21) & (age <= 33)
-    plan_5_age_mask = (age >= 21) & (age <= 24)
+    # Age bands for plausible loan holders (graduates typically 21+)
+    # Plan 1: 32+ (started before 2012, graduated 21+ by 2015)
+    # Plan 2: 21+ and cohort 2012-2022
+    # Plan 5: 21+ and cohort 2023+ (but in early years, recent grads are 18-22)
+    plan_1_age_mask = age >= 32
+    plan_2_age_mask = age >= 21
+    # Plan 5: use cohort constraint only since graduates are very young in early years
+    plan_5_age_mask = age >= 18  # Anyone 18+ who started 2023+ could have a loan
 
     # Cohort masks based on university start year
     plan_1_cohort = uni_start_year < 2012

From 218ff5e99f14071b4534f0b9227ad9b844ec9f0a Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:34:33 +0000
Subject: [PATCH 5/8] Remove tertiary education filter - SLC shows ~94% of
 cohort has loans

---
 .../datasets/imputations/student_loans.py           | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index ecfa0415..c2c1f625 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -13,10 +13,10 @@
 - PLAN_5: Started September 2023 onwards
 
 The FRS only records active repayers (via PAYE). SLC data shows many borrowers
-earn below repayment thresholds. This imputation fills that gap by
-probabilistically assigning plans to tertiary-educated people without
-reported repayments, based on SLC "liable to repay" minus "above threshold"
-counts.
+earn below repayment thresholds (~55% of Plan 2 holders). This imputation
+fills that gap by probabilistically assigning plans to people in the relevant
+age cohort without reported repayments, based on SLC "liable to repay" minus
+"above threshold" counts.
 """
 
 import numpy as np
@@ -108,12 +108,10 @@ def impute_student_loan_plan(
 
     age = sim.calculate("age").values
     repayments = sim.calculate("student_loan_repayments").values
-    education = sim.calculate("highest_education").values
     region = sim.calculate("region", map_to="person").values
     weights = sim.calculate("person_weight").values
 
     is_england = np.isin(region, list(_ENGLAND_REGIONS))
-    is_tertiary = education == "TERTIARY"
     has_repayments = repayments > 0
 
     # Estimate university start year (assume started at 18)
@@ -153,9 +151,9 @@ def impute_student_loan_plan(
     )
 
     # Plan 2 below-threshold assignment
+    # No tertiary filter - SLC data shows ~94% of cohort has loans
     plan_2_eligible = (
         no_repayments
-        & is_tertiary
         & is_england
         & plan_2_age_mask
         & plan_2_cohort
@@ -170,7 +168,6 @@ def impute_student_loan_plan(
     # Plan 5 below-threshold assignment
     plan_5_eligible = (
         no_repayments
-        & is_tertiary
         & is_england
         & plan_5_age_mask
         & plan_5_cohort

From 8f9562989829d09b73564c744585c6681f1b9f54 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:46:16 +0000
Subject: [PATCH 6/8] Fix EDUCQUAL mapping - codes were inverted (1-7 are
 degrees, not 17-21)

---
 policyengine_uk_data/datasets/frs.py | 126 +++++++++++++++++++--------
 1 file changed, 92 insertions(+), 34 deletions(-)

diff --git a/policyengine_uk_data/datasets/frs.py b/policyengine_uk_data/datasets/frs.py
index 37ce70dc..50a091cc 100644
--- a/policyengine_uk_data/datasets/frs.py
+++ b/policyengine_uk_data/datasets/frs.py
@@ -208,43 +208,101 @@ def determine_education_level(fted_val, typeed2_val, age_val):
     )
 
     # Add highest education from EDUCQUAL (highest qualification achieved)
-    # Codes from FRS ADT_324X classification; unmapped codes default to UPPER_SECONDARY
+    # Based on FRS 2022-23 documentation (SPSS value labels)
     EDUCQUAL_MAP = {
-        1: "NOT_COMPLETED_PRIMARY",
-        2: "LOWER_SECONDARY",  # GCSE D-G / CSE 2-5
-        3: "LOWER_SECONDARY",  # GCSE A-C / O-level A-C
-        4: "UPPER_SECONDARY",  # AS-level
-        5: "UPPER_SECONDARY",  # A-level (1 subject)
-        6: "UPPER_SECONDARY",  # A-level (2 subjects)
-        7: "UPPER_SECONDARY",  # A-level (3+ subjects)
-        8: "LOWER_SECONDARY",  # Scottish Standard/Ordinary Grade
-        9: "UPPER_SECONDARY",  # Scottish Higher Grade
-        10: "UPPER_SECONDARY",  # Scottish 6th Year Studies
+        # Degree level and above (TERTIARY)
+        1: "TERTIARY",  # Doctorate or MPhil
+        2: "TERTIARY",  # Masters, PGCE or other postgrad
+        3: "TERTIARY",  # Degree inc foundation degree
+        4: "TERTIARY",  # Teaching qualification (excl PGCE)
+        5: "TERTIARY",  # Foreign qualification at degree level
+        6: "TERTIARY",  # Other work-related qual at degree level
+        7: "TERTIARY",  # Other professional qual at degree level
+        # Higher education below degree (POST_SECONDARY)
+        8: "POST_SECONDARY",  # Other HE qualification below degree
+        9: "POST_SECONDARY",  # Nursing or other medical
+        10: "POST_SECONDARY",  # Diploma in higher education
         11: "POST_SECONDARY",  # HNC/HND
-        12: "POST_SECONDARY",  # City & Guilds advanced / BTEC National
-        13: "UPPER_SECONDARY",  # City & Guilds craft / BTEC General
-        14: "POST_SECONDARY",  # ONC/OND / BTEC National (lower)
-        15: "UPPER_SECONDARY",  # City & Guilds foundation
-        16: "POST_SECONDARY",  # RSA advanced
-        17: "TERTIARY",  # First/foundation degree
-        18: "TERTIARY",  # Second degree
-        19: "TERTIARY",  # Higher degree (Masters/PhD)
-        20: "TERTIARY",  # PGCE / teaching qualification
-        21: "TERTIARY",  # Nursing/paramedical qualification
-        66: "UPPER_SECONDARY",  # NVQ/SVQ Level 1
-        67: "UPPER_SECONDARY",  # NVQ/SVQ Level 2
-        68: "UPPER_SECONDARY",  # NVQ/SVQ Level 3
-        69: "POST_SECONDARY",  # NVQ/SVQ Level 4
-        70: "TERTIARY",  # NVQ/SVQ Level 5
+        12: "POST_SECONDARY",  # BTEC higher level
+        13: "POST_SECONDARY",  # SCOTVEC higher level
+        14: "POST_SECONDARY",  # NVQ/SVQ Level 4
+        15: "POST_SECONDARY",  # NVQ/SVQ Level 5
+        16: "POST_SECONDARY",  # RSA higher diploma / OCR Level 4
+        # A-level equivalent (UPPER_SECONDARY)
+        17: "UPPER_SECONDARY",  # A-Level or equivalent
+        18: "UPPER_SECONDARY",  # Welsh Baccalaureate Advanced
+        19: "UPPER_SECONDARY",  # Scottish Baccalaureate
+        20: "UPPER_SECONDARY",  # International Baccalaureate
+        21: "UPPER_SECONDARY",  # AS-level or equivalent
+        22: "UPPER_SECONDARY",  # Certificate of 6th Year Studies
+        23: "UPPER_SECONDARY",  # Access to Higher Education
+        24: "UPPER_SECONDARY",  # Scottish Higher/Intermediate
+        25: "UPPER_SECONDARY",  # Skills for work Higher
+        26: "POST_SECONDARY",  # ONC/OND
+        27: "POST_SECONDARY",  # BTEC National level
+        28: "POST_SECONDARY",  # SCOTVEC National level
+        29: "UPPER_SECONDARY",  # New Diploma Advanced
+        30: "UPPER_SECONDARY",  # New Diploma Progression
+        31: "UPPER_SECONDARY",  # NVQ/SVQ Level 3
+        32: "UPPER_SECONDARY",  # GNVQ Advanced
+        33: "UPPER_SECONDARY",  # RSA advanced diploma / OCR Level 3
+        34: "UPPER_SECONDARY",  # City and Guilds advanced craft
+        35: "UPPER_SECONDARY",  # Welsh Baccalaureate Intermediate
+        # GCSE/O-level equivalent (LOWER_SECONDARY)
+        36: "LOWER_SECONDARY",  # O-Level (5+)
+        37: "LOWER_SECONDARY",  # Scottish Standard Grade (5+)
+        38: "LOWER_SECONDARY",  # GCSE (5+)
+        39: "LOWER_SECONDARY",  # CSE (5+)
+        40: "LOWER_SECONDARY",  # Scottish National level 5
+        41: "LOWER_SECONDARY",  # Skills for work National 5
+        42: "LOWER_SECONDARY",  # BTEC first diploma
+        43: "LOWER_SECONDARY",  # SCOTVEC first diploma
+        44: "LOWER_SECONDARY",  # New Diploma Higher (level 2)
+        45: "LOWER_SECONDARY",  # NVQ/SVQ Level 2
+        46: "LOWER_SECONDARY",  # GNVQ Intermediate
+        47: "LOWER_SECONDARY",  # RSA diploma / OCR Level 2
+        48: "LOWER_SECONDARY",  # City and Guilds craft
+        49: "LOWER_SECONDARY",  # Other high school leavers qual
+        # Below GCSE / basic qualifications
+        50: "LOWER_SECONDARY",  # BTEC (unspecified)
+        51: "LOWER_SECONDARY",  # BTEC first cert
+        52: "LOWER_SECONDARY",  # SCOTVEC (unspecified)
+        53: "LOWER_SECONDARY",  # SCOTVEC first cert
+        54: "LOWER_SECONDARY",  # SCOTVEC modules
+        55: "LOWER_SECONDARY",  # New Diploma (unspecified)
+        56: "LOWER_SECONDARY",  # New Diploma Foundation
+        57: "LOWER_SECONDARY",  # Welsh Baccalaureate (unspecified)
+        58: "LOWER_SECONDARY",  # Welsh Baccalaureate Foundation
+        59: "LOWER_SECONDARY",  # NVQ/SVQ (unspecified)
+        60: "LOWER_SECONDARY",  # NVQ/SVQ Level 1
+        61: "LOWER_SECONDARY",  # GNVQ (unspecified)
+        62: "LOWER_SECONDARY",  # GNVQ Part One Intermediate
+        63: "LOWER_SECONDARY",  # GNVQ Full Foundation
+        64: "LOWER_SECONDARY",  # GNVQ Part One Foundation
+        65: "LOWER_SECONDARY",  # O-Level (unspecified)
+        66: "LOWER_SECONDARY",  # O-Level (fewer than 5)
+        67: "LOWER_SECONDARY",  # Scottish Standard Grade (unspecified)
+        68: "LOWER_SECONDARY",  # Scottish Standard Grade (fewer than 5)
+        69: "LOWER_SECONDARY",  # GCSE (unspecified)
+        70: "LOWER_SECONDARY",  # GCSE (fewer than 5)
+        71: "LOWER_SECONDARY",  # Scottish National 1-4
+        72: "LOWER_SECONDARY",  # Scottish National (unspecified)
+        73: "LOWER_SECONDARY",  # Skills for work National 3-4
+        74: "LOWER_SECONDARY",  # Skills for work (unspecified)
+        75: "LOWER_SECONDARY",  # CSE (unspecified)
+        76: "LOWER_SECONDARY",  # CSE (fewer than 5)
+        77: "LOWER_SECONDARY",  # RSA/OCR (unspecified)
+        78: "LOWER_SECONDARY",  # RSA other / OCR Level 1
+        79: "LOWER_SECONDARY",  # City and Guilds (unspecified)
+        80: "LOWER_SECONDARY",  # City and Guilds foundation
+        81: "LOWER_SECONDARY",  # YT Certificate
+        82: "LOWER_SECONDARY",  # Key Skills / Core Skills
+        83: "NOT_COMPLETED_PRIMARY",  # Basic Skills (literacy/numeracy)
+        84: "NOT_COMPLETED_PRIMARY",  # Entry Level Qualifications
+        85: "NOT_COMPLETED_PRIMARY",  # Award/Certificate at entry level
+        86: "LOWER_SECONDARY",  # Other professional/vocational/foreign
     }
-    # Codes 22-65 and 71-85 are further vocational/professional qualifications;
-    # treat as POST_SECONDARY. Codes 86-87 are catch-alls; treat as UPPER_SECONDARY.
-    for code in range(22, 66):
-        EDUCQUAL_MAP[code] = "POST_SECONDARY"
-    for code in range(71, 86):
-        EDUCQUAL_MAP[code] = "POST_SECONDARY"
-    EDUCQUAL_MAP[86] = "UPPER_SECONDARY"
-    EDUCQUAL_MAP[87] = "UPPER_SECONDARY"
+    # Code 87 (no qualification data) is deliberately unmapped; it defaults to UPPER_SECONDARY
 
     educqual = pd.to_numeric(person.educqual, errors="coerce")
     pe_person["highest_education"] = educqual.map(EDUCQUAL_MAP).fillna(

From 48a8f12b50e613a71f2b21af3eaf228d1f2edb0c Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:47:01 +0000
Subject: [PATCH 7/8] Re-enable tertiary filter now that EDUCQUAL mapping is
 fixed

---
 .../datasets/imputations/student_loans.py             | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index c2c1f625..72c90ab3 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -14,9 +14,9 @@
 
 The FRS only records active repayers (via PAYE). SLC data shows many borrowers
 earn below repayment thresholds (~55% of Plan 2 holders). This imputation
-fills that gap by probabilistically assigning plans to people in the relevant
-age cohort without reported repayments, based on SLC "liable to repay" minus
-"above threshold" counts.
+fills that gap by probabilistically assigning plans to tertiary-educated
+people in the relevant age cohort without reported repayments, based on SLC
+"liable to repay" minus "above threshold" counts.
 """
 
 import numpy as np
@@ -110,8 +110,10 @@ def impute_student_loan_plan(
     repayments = sim.calculate("student_loan_repayments").values
     region = sim.calculate("region", map_to="person").values
     weights = sim.calculate("person_weight").values
+    education = sim.calculate("highest_education").values
 
     is_england = np.isin(region, list(_ENGLAND_REGIONS))
+    is_tertiary = education == "TERTIARY"
     has_repayments = repayments > 0
 
     # Estimate university start year (assume started at 18)
@@ -151,9 +153,9 @@ def impute_student_loan_plan(
     )
 
     # Plan 2 below-threshold assignment
-    # No tertiary filter - SLC data shows ~94% of cohort has loans
     plan_2_eligible = (
         no_repayments
+        & is_tertiary
         & is_england
         & plan_2_age_mask
         & plan_2_cohort
@@ -168,6 +170,7 @@ def impute_student_loan_plan(
     # Plan 5 below-threshold assignment
     plan_5_eligible = (
         no_repayments
+        & is_tertiary
         & is_england
         & plan_5_age_mask
         & plan_5_cohort

From 133a8d14c82f9651f9894cf9880c149a83d8cba0 Mon Sep 17 00:00:00 2001
From: Nikhil Woodruff <nikhil.woodruff@cabinetoffice.gov.uk>
Date: Mon, 23 Feb 2026 15:58:27 +0000
Subject: [PATCH 8/8] Relax Plan 2 age constraints for better coverage

The strict cohort constraint (ages 21-31) missed many Plan 2 borrowers
who started university late or did postgrad studies. This change:

1. Changes Plan 2 age mask from 21+ to 21-45 (adds an upper bound)
2. Uses age mask (not cohort) for Plan 2 below-threshold assignment
3. Assigns repayers to Plan 2 if age 21-45 and not Plan 1 cohort

The below-threshold imputation now covers the full 4.95M target.
The remaining gap (FRS shows 1.4M repayers vs SLC's 4M) is a data
collection issue that calibration targets will help address.
---
 .../datasets/imputations/student_loans.py     | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/policyengine_uk_data/datasets/imputations/student_loans.py b/policyengine_uk_data/datasets/imputations/student_loans.py
index 72c90ab3..084ccf29 100644
--- a/policyengine_uk_data/datasets/imputations/student_loans.py
+++ b/policyengine_uk_data/datasets/imputations/student_loans.py
@@ -119,14 +119,13 @@ def impute_student_loan_plan(
     # Estimate university start year (assume started at 18)
     uni_start_year = year - age + 18
 
-    # Age bands for plausible loan holders (graduates typically 21+)
+    # Age bands for plausible loan holders
     # Plan 1: 32+ (started before 2012, graduated 21+ by 2015)
-    # Plan 2: 21+ and cohort 2012-2022
-    # Plan 5: 21+ and cohort 2023+ (but in early years, recent grads are 18-22)
+    # Plan 2: 21-45 (allows for late starters, postgrads, career changers)
+    # Plan 5: 18-25 (recent starters only - cohort 2023+)
     plan_1_age_mask = age >= 32
-    plan_2_age_mask = age >= 21
-    # Plan 5: use cohort constraint only since graduates are very young in early years
-    plan_5_age_mask = age >= 18  # Anyone 18+ who started 2023+ could have a loan
+    plan_2_age_mask = (age >= 21) & (age <= 45)
+    plan_5_age_mask = (age >= 18) & (age <= 25)
 
     # Cohort masks based on university start year
     plan_1_cohort = uni_start_year < 2012
@@ -136,8 +135,11 @@ def impute_student_loan_plan(
     plan = np.full(len(age), "NONE", dtype=object)
 
     # Step 1: Assign plans to people with reported repayments
+    # Plan 1: use cohort (started before 2012)
+    # Plan 2: use age mask (21-45) since many late starters and postgrads exist
+    # Plan 5: use cohort (started 2023+)
     plan[has_repayments & plan_1_cohort] = "PLAN_1"
-    plan[has_repayments & plan_2_cohort] = "PLAN_2"
+    plan[has_repayments & plan_2_age_mask & ~plan_1_cohort] = "PLAN_2"
     plan[has_repayments & plan_5_cohort] = "PLAN_5"
 
     # Step 2: Probabilistically assign below-threshold borrowers
@@ -153,12 +155,9 @@ def impute_student_loan_plan(
     )
 
     # Plan 2 below-threshold assignment
+    # Use age mask only (not cohort) since many borrowers started late or did postgrad
     plan_2_eligible = (
-        no_repayments
-        & is_tertiary
-        & is_england
-        & plan_2_age_mask
-        & plan_2_cohort
+        no_repayments & is_tertiary & is_england & plan_2_age_mask
     )
     if plan_2_below > 0 and plan_2_eligible.sum() > 0:
         eligible_weight = (weights * plan_2_eligible).sum()