Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -576,3 +576,11 @@
- Upload datasets to public HuggingFace repo (policyengine/policyengine-uk-data)
in addition to private repo, so policyengine-uk gets the latest data.
date: 2026-02-23 13:26:29
- bump: minor
changes:
added:
- SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers
including those below repayment threshold.
- Probabilistic imputation of below-threshold student loan borrowers based on
tertiary education, age, and SLC forecast data.
date: 2026-02-23 16:00:00
7 changes: 7 additions & 0 deletions changelog_entry.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
- bump: minor
changes:
added:
- SLC "liable to repay" targets for Plan 2 and Plan 5, covering all borrowers
including those below repayment threshold.
- Probabilistic imputation of below-threshold student loan borrowers based on
tertiary education, age, and SLC forecast data.
126 changes: 92 additions & 34 deletions policyengine_uk_data/datasets/frs.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,43 +208,101 @@ def determine_education_level(fted_val, typeed2_val, age_val):
)

# Add highest education from EDUCQUAL (highest qualification achieved)
# Codes from FRS ADT_324X classification; unmapped codes default to UPPER_SECONDARY
# Based on FRS 2022-23 documentation (SPSS value labels)
EDUCQUAL_MAP = {
1: "NOT_COMPLETED_PRIMARY",
2: "LOWER_SECONDARY", # GCSE D-G / CSE 2-5
3: "LOWER_SECONDARY", # GCSE A-C / O-level A-C
4: "UPPER_SECONDARY", # AS-level
5: "UPPER_SECONDARY", # A-level (1 subject)
6: "UPPER_SECONDARY", # A-level (2 subjects)
7: "UPPER_SECONDARY", # A-level (3+ subjects)
8: "LOWER_SECONDARY", # Scottish Standard/Ordinary Grade
9: "UPPER_SECONDARY", # Scottish Higher Grade
10: "UPPER_SECONDARY", # Scottish 6th Year Studies
# Degree level and above (TERTIARY)
1: "TERTIARY", # Doctorate or MPhil
2: "TERTIARY", # Masters, PGCE or other postgrad
3: "TERTIARY", # Degree inc foundation degree
4: "TERTIARY", # Teaching qualification (excl PGCE)
5: "TERTIARY", # Foreign qualification at degree level
6: "TERTIARY", # Other work-related qual at degree level
7: "TERTIARY", # Other professional qual at degree level
# Higher education below degree (POST_SECONDARY)
8: "POST_SECONDARY", # Other HE qualification below degree
9: "POST_SECONDARY", # Nursing or other medical
10: "POST_SECONDARY", # Diploma in higher education
11: "POST_SECONDARY", # HNC/HND
12: "POST_SECONDARY", # City & Guilds advanced / BTEC National
13: "UPPER_SECONDARY", # City & Guilds craft / BTEC General
14: "POST_SECONDARY", # ONC/OND / BTEC National (lower)
15: "UPPER_SECONDARY", # City & Guilds foundation
16: "POST_SECONDARY", # RSA advanced
17: "TERTIARY", # First/foundation degree
18: "TERTIARY", # Second degree
19: "TERTIARY", # Higher degree (Masters/PhD)
20: "TERTIARY", # PGCE / teaching qualification
21: "TERTIARY", # Nursing/paramedical qualification
66: "UPPER_SECONDARY", # NVQ/SVQ Level 1
67: "UPPER_SECONDARY", # NVQ/SVQ Level 2
68: "UPPER_SECONDARY", # NVQ/SVQ Level 3
69: "POST_SECONDARY", # NVQ/SVQ Level 4
70: "TERTIARY", # NVQ/SVQ Level 5
12: "POST_SECONDARY", # BTEC higher level
13: "POST_SECONDARY", # SCOTVEC higher level
14: "POST_SECONDARY", # NVQ/SVQ Level 4
15: "POST_SECONDARY", # NVQ/SVQ Level 5
16: "POST_SECONDARY", # RSA higher diploma / OCR Level 4
# A-level equivalent (UPPER_SECONDARY)
17: "UPPER_SECONDARY", # A-Level or equivalent
18: "UPPER_SECONDARY", # Welsh Baccalaureate Advanced
19: "UPPER_SECONDARY", # Scottish Baccalaureate
20: "UPPER_SECONDARY", # International Baccalaureate
21: "UPPER_SECONDARY", # AS-level or equivalent
22: "UPPER_SECONDARY", # Certificate of 6th Year Studies
23: "UPPER_SECONDARY", # Access to Higher Education
24: "UPPER_SECONDARY", # Scottish Higher/Intermediate
25: "UPPER_SECONDARY", # Skills for work Higher
26: "POST_SECONDARY", # ONC/OND
27: "POST_SECONDARY", # BTEC National level
28: "POST_SECONDARY", # SCOTVEC National level
29: "UPPER_SECONDARY", # New Diploma Advanced
30: "UPPER_SECONDARY", # New Diploma Progression
31: "UPPER_SECONDARY", # NVQ/SVQ Level 3
32: "UPPER_SECONDARY", # GNVQ Advanced
33: "UPPER_SECONDARY", # RSA advanced diploma / OCR Level 3
34: "UPPER_SECONDARY", # City and Guilds advanced craft
35: "UPPER_SECONDARY", # Welsh Baccalaureate Intermediate
# GCSE/O-level equivalent (LOWER_SECONDARY)
36: "LOWER_SECONDARY", # O-Level (5+)
37: "LOWER_SECONDARY", # Scottish Standard Grade (5+)
38: "LOWER_SECONDARY", # GCSE (5+)
39: "LOWER_SECONDARY", # CSE (5+)
40: "LOWER_SECONDARY", # Scottish National level 5
41: "LOWER_SECONDARY", # Skills for work National 5
42: "LOWER_SECONDARY", # BTEC first diploma
43: "LOWER_SECONDARY", # SCOTVEC first diploma
44: "LOWER_SECONDARY", # New Diploma Higher (level 2)
45: "LOWER_SECONDARY", # NVQ/SVQ Level 2
46: "LOWER_SECONDARY", # GNVQ Intermediate
47: "LOWER_SECONDARY", # RSA diploma / OCR Level 2
48: "LOWER_SECONDARY", # City and Guilds craft
49: "LOWER_SECONDARY", # Other high school leavers qual
# Below GCSE / basic qualifications
50: "LOWER_SECONDARY", # BTEC (unspecified)
51: "LOWER_SECONDARY", # BTEC first cert
52: "LOWER_SECONDARY", # SCOTVEC (unspecified)
53: "LOWER_SECONDARY", # SCOTVEC first cert
54: "LOWER_SECONDARY", # SCOTVEC modules
55: "LOWER_SECONDARY", # New Diploma (unspecified)
56: "LOWER_SECONDARY", # New Diploma Foundation
57: "LOWER_SECONDARY", # Welsh Baccalaureate (unspecified)
58: "LOWER_SECONDARY", # Welsh Baccalaureate Foundation
59: "LOWER_SECONDARY", # NVQ/SVQ (unspecified)
60: "LOWER_SECONDARY", # NVQ/SVQ Level 1
61: "LOWER_SECONDARY", # GNVQ (unspecified)
62: "LOWER_SECONDARY", # GNVQ Part One Intermediate
63: "LOWER_SECONDARY", # GNVQ Full Foundation
64: "LOWER_SECONDARY", # GNVQ Part One Foundation
65: "LOWER_SECONDARY", # O-Level (unspecified)
66: "LOWER_SECONDARY", # O-Level (fewer than 5)
67: "LOWER_SECONDARY", # Scottish Standard Grade (unspecified)
68: "LOWER_SECONDARY", # Scottish Standard Grade (fewer than 5)
69: "LOWER_SECONDARY", # GCSE (unspecified)
70: "LOWER_SECONDARY", # GCSE (fewer than 5)
71: "LOWER_SECONDARY", # Scottish National 1-4
72: "LOWER_SECONDARY", # Scottish National (unspecified)
73: "LOWER_SECONDARY", # Skills for work National 3-4
74: "LOWER_SECONDARY", # Skills for work (unspecified)
75: "LOWER_SECONDARY", # CSE (unspecified)
76: "LOWER_SECONDARY", # CSE (fewer than 5)
77: "LOWER_SECONDARY", # RSA/OCR (unspecified)
78: "LOWER_SECONDARY", # RSA other / OCR Level 1
79: "LOWER_SECONDARY", # City and Guilds (unspecified)
80: "LOWER_SECONDARY", # City and Guilds foundation
81: "LOWER_SECONDARY", # YT Certificate
82: "LOWER_SECONDARY", # Key Skills / Core Skills
83: "NOT_COMPLETED_PRIMARY", # Basic Skills (literacy/numeracy)
84: "NOT_COMPLETED_PRIMARY", # Entry Level Qualifications
85: "NOT_COMPLETED_PRIMARY", # Award/Certificate at entry level
86: "LOWER_SECONDARY", # Other professional/vocational/foreign
}
# Codes 22-65 and 71-85 are further vocational/professional qualifications;
# treat as POST_SECONDARY. Codes 86-87 are catch-alls; treat as UPPER_SECONDARY.
for code in range(22, 66):
EDUCQUAL_MAP[code] = "POST_SECONDARY"
for code in range(71, 86):
EDUCQUAL_MAP[code] = "POST_SECONDARY"
EDUCQUAL_MAP[86] = "UPPER_SECONDARY"
EDUCQUAL_MAP[87] = "UPPER_SECONDARY"
# Code 87 is intentionally left unmapped: it denotes missing qualification data, so it falls back to the UPPER_SECONDARY default

educqual = pd.to_numeric(person.educqual, errors="coerce")
pe_person["highest_education"] = educqual.map(EDUCQUAL_MAP).fillna(
Expand Down
164 changes: 136 additions & 28 deletions policyengine_uk_data/datasets/imputations/student_loans.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,77 +2,185 @@
Student loan plan imputation.

This module imputes the student_loan_plan variable based on:
- Whether the person has reported student loan repayments
- Whether the person has reported student loan repayments (above threshold)
- Their estimated university attendance year (inferred from age)
- Probabilistic assignment for below-threshold borrowers

The imputation assigns plan types according to when the loan system changed:
- NONE: No reported repayments
- NONE: No loan
- PLAN_1: Started university before September 2012
- PLAN_2: Started September 2012 - August 2023
- PLAN_5: Started September 2023 onwards

This enables policyengine-uk's student_loan_repayment variable to calculate
repayments using official threshold parameters.
The FRS only records active repayers (via PAYE). SLC data shows many borrowers
earn below repayment thresholds (~55% of Plan 2 holders). This imputation
fills that gap by probabilistically assigning plans to tertiary-educated
people in the relevant age cohort without reported repayments, based on SLC
"liable to repay" minus "above threshold" counts.
"""

import numpy as np
from policyengine_uk.data import UKSingleYearDataset
from policyengine_uk import Microsimulation

# England regions for filtering (SLC data covers England only)
# impute_student_loan_plan tests person-level "region" values for membership
# in this set before imputing below-threshold borrowers.
_ENGLAND_REGIONS = {
"NORTH_EAST",
"NORTH_WEST",
"YORKSHIRE",
"EAST_MIDLANDS",
"WEST_MIDLANDS",
"EAST_OF_ENGLAND",
"LONDON",
"SOUTH_EAST",
"SOUTH_WEST",
}

# SLC liable-to-repay counts (Higher Education total, England)
# Source: https://explore-education-statistics.service.gov.uk/data-tables/permalink/6ff75517-7124-487c-cb4e-08de6eccf22d
# Keyed by the year passed to impute_student_loan_plan; years missing from a
# table are read as 0 there via .get(year, 0).
_PLAN_2_LIABLE = {
2025: 8_940_000,
2026: 9_710_000,
2027: 10_360_000,
2028: 10_615_000,
2029: 10_600_000,
2030: 10_525_000,
}

# Plan 5 counterpart of _PLAN_2_LIABLE (same year keys).
_PLAN_5_LIABLE = {
2025: 10_000,
2026: 230_000,
2027: 630_000,
2028: 1_380_000,
2029: 2_360_000,
2030: 3_400_000,
}

# SLC above-threshold counts (borrowers making repayments)
# Subtracted from the liable-to-repay counts to get the number of
# below-threshold borrowers to impute.
# NOTE(review): presumably from the same SLC publication as the liable
# counts above - confirm.
_PLAN_2_ABOVE_THRESHOLD = {
2025: 3_985_000,
2026: 4_460_000,
2027: 4_825_000,
2028: 5_045_000,
2029: 5_160_000,
2030: 5_205_000,
}

# NOTE: there is no 2025 entry here, unlike the other three tables; the
# .get(year, 0) lookup in impute_student_loan_plan therefore treats the 2025
# Plan 5 above-threshold count as 0.
_PLAN_5_ABOVE_THRESHOLD = {
2026: 35_000,
2027: 145_000,
2028: 390_000,
2029: 770_000,
2030: 1_235_000,
}


def impute_student_loan_plan(
    dataset: UKSingleYearDataset,
    year: int = 2025,
    seed: int = 42,
) -> UKSingleYearDataset:
    """
    Impute student loan plan type based on age, repayments, and education.

    The plan type determines which repayment threshold applies:
    - PLAN_1: £26,065 (2025), pre-Sept 2012 England/Wales
    - PLAN_2: £29,385 (2026-2029 frozen), Sept 2012 - Aug 2023
    - PLAN_4: Scottish loans (not imputed here - requires explicit flag)
    - PLAN_5: £25,000 (2025), Sept 2023 onwards

    This function:
    1. Assigns plans to people with reported repayments (above threshold)
    2. Probabilistically assigns plans to tertiary-educated people in
       England without repayments (below threshold) so that the expected
       weighted count matches SLC "liable to repay" minus "above threshold"

    The Plan 2 and Plan 5 below-threshold pools are kept disjoint: anyone
    whose estimated university start year is 2023 or later is only ever a
    Plan 5 candidate. This mirrors Step 1, where Plan 5 takes precedence
    over Plan 2, and stops the Plan 5 draw from overwriting Plan 2 draws.

    Args:
        dataset: PolicyEngine UK dataset with student_loan_repayments.
        year: The simulation year, used to estimate university attendance
            and to look up the SLC targets. Years without SLC figures get
            no below-threshold imputation.
        seed: Random seed for reproducibility.

    Returns:
        A copy of the dataset with imputed student_loan_plan values
        ("NONE", "PLAN_1", "PLAN_2" or "PLAN_5") on the person table.
    """
    dataset = dataset.copy()
    sim = Microsimulation(dataset=dataset)
    rng = np.random.default_rng(seed)

    # Get required variables (all at person level)
    age = sim.calculate("age").values
    repayments = sim.calculate("student_loan_repayments").values
    region = sim.calculate("region", map_to="person").values
    weights = sim.calculate("person_weight").values
    education = sim.calculate("highest_education").values

    is_england = np.isin(region, list(_ENGLAND_REGIONS))
    is_tertiary = education == "TERTIARY"
    has_repayments = repayments > 0

    # Estimate university start year (assume started at 18):
    # for simulation year Y and age A, start year = Y - A + 18
    uni_start_year = year - age + 18

    # Age bands for plausible loan holders
    # Plan 2: 21-45 (allows for late starters, postgrads, career changers)
    # Plan 5: 18-25 (recent starters only - cohort 2023+)
    plan_2_age_mask = (age >= 21) & (age <= 45)
    plan_5_age_mask = (age >= 18) & (age <= 25)

    # Cohort masks based on estimated university start year
    plan_1_cohort = uni_start_year < 2012
    plan_5_cohort = uni_start_year >= 2023

    # StudentLoanPlan is a string enum:
    # "NONE", "PLAN_1", "PLAN_2", "PLAN_4", "PLAN_5"
    plan = np.full(len(age), "NONE", dtype=object)

    # Step 1: Assign plans to people with reported repayments.
    # Order matters: the Plan 5 cohort assignment runs last and so takes
    # precedence over the broader Plan 2 age band.
    plan[has_repayments & plan_1_cohort] = "PLAN_1"
    plan[has_repayments & plan_2_age_mask & ~plan_1_cohort] = "PLAN_2"
    plan[has_repayments & plan_5_cohort] = "PLAN_5"

    # Step 2: Probabilistically assign below-threshold borrowers.
    # Only tertiary-educated people in England without repayments qualify.
    candidate_pool = ~has_repayments & is_tertiary & is_england

    # Plan 2 uses the age band rather than the 2012-2022 cohort (many
    # borrowers started late or did postgrad), but excludes the Plan 5
    # cohort so the two pools never overlap.
    plan_2_eligible = candidate_pool & plan_2_age_mask & ~plan_5_cohort
    plan_5_eligible = candidate_pool & plan_5_age_mask & plan_5_cohort

    # Target below-threshold counts = liable to repay - above threshold.
    # Missing years resolve to 0, which disables imputation for that plan.
    plan_2_below = _PLAN_2_LIABLE.get(year, 0) - _PLAN_2_ABOVE_THRESHOLD.get(
        year, 0
    )
    plan_5_below = _PLAN_5_LIABLE.get(year, 0) - _PLAN_5_ABOVE_THRESHOLD.get(
        year, 0
    )

    for label, eligible, target in (
        ("PLAN_2", plan_2_eligible, plan_2_below),
        ("PLAN_5", plan_5_eligible, plan_5_below),
    ):
        if target <= 0 or not eligible.any():
            continue
        eligible_weight = (weights * eligible).sum()
        if eligible_weight <= 0:
            continue
        # A uniform selection probability makes the expected weighted
        # count of selected people equal the target (capped at everyone).
        prob = min(1.0, target / eligible_weight)
        draws = rng.random(len(age))
        plan[eligible & (draws < prob)] = label

    # Store as the plan type
    dataset.person["student_loan_plan"] = plan

    return dataset
5 changes: 4 additions & 1 deletion policyengine_uk_data/targets/build_loss_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
compute_scotland_uc_child,
compute_scottish_child_payment,
compute_student_loan_plan,
compute_student_loan_plan_liable,
compute_ss_contributions,
compute_ss_headcount,
compute_ss_it_relief,
Expand Down Expand Up @@ -306,8 +307,10 @@ def _compute_column(
return compute_scottish_child_payment(target, ctx)

# Student loan plan borrower counts (SLC)
if name.startswith("slc/plan_"):
if name.startswith("slc/plan_") and "above_threshold" in name:
return compute_student_loan_plan(target, ctx)
if name.startswith("slc/plan_") and "liable" in name:
return compute_student_loan_plan_liable(target, ctx)

# PIP claimants
if name in (
Expand Down
Loading