From 673be259b6426e5fb752fac45697a2acd9c9564e Mon Sep 17 00:00:00 2001 From: Soroush Bassam Date: Mon, 16 Mar 2026 20:50:47 -0700 Subject: [PATCH 1/2] update specs for rl training checkpoint save/resume --- openapi.yaml | 114 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 111 insertions(+), 3 deletions(-) diff --git a/openapi.yaml b/openapi.yaml index 13d9f86..01390ca 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -7492,6 +7492,64 @@ paths: schema: description: Operation ID type: string + /rl/training-sessions/{session_id}/operations/training-checkpoint: + post: + summary: Save training checkpoint + description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step) to object storage. + operationId: createTrainingCheckpoint + tags: [RL] + responses: + "200": + description: Save training checkpoint operation details + content: + application/json: + schema: + $ref: '#/components/schemas/RL.TrainingCheckpointOperation' + default: + description: An unexpected error response. + content: + application/json: + schema: + $ref: '#/components/schemas/ErrorData' + parameters: + - name: session_id + in: path + required: true + schema: + description: Training session ID + type: string + /rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}: + get: + summary: Get save training checkpoint operation + description: Retrieves the current status and result of a save training checkpoint operation. + operationId: getTrainingCheckpointOperation + tags: [RL] + responses: + "200": + description: Save training checkpoint operation details + content: + application/json: + schema: + $ref: '#/components/schemas/RL.TrainingCheckpointOperation' + default: + description: An unexpected error response. 
+ content: + application/json: + schema: + $ref: '#/components/schemas/ErrorData' + parameters: + - name: session_id + in: path + required: true + schema: + description: Training session ID + type: string + - name: operation_id + in: path + required: true + schema: + description: Operation ID + type: string /rl/checkpoints/{id}/download: get: summary: Download checkpoint @@ -7976,10 +8034,10 @@ components: description: Base model to use for the training session type: string example: meta-llama/Meta-Llama-3-8B-Instruct - checkpoint_id: - description: Checkpoint ID to use for the training session + resume_from_checkpoint_id: + description: Checkpoint ID to resume from type: string - example: checkpoint-123 + example: 123e4567-e89b-12d3-a456-426614174000 lora_config: $ref: '#/components/schemas/RL.LoraConfig' RL.TrainingSessionStatus: @@ -8015,6 +8073,16 @@ components: type: object $ref: '#/components/schemas/RL.InferenceCheckpoint' description: List of saved inference checkpoints for this session + training_checkpoints: + type: array + items: + type: object + $ref: '#/components/schemas/RL.TrainingCheckpoint' + description: List of saved training checkpoints for this session + resume_from_checkpoint_id: + type: string + example: 123e4567-e89b-12d3-a456-426614174000 + description: Checkpoint ID this session was resumed from step: description: Current training step type: string @@ -8151,6 +8219,46 @@ components: format: date-time example: "2026-01-02T00:00:00Z" description: Timestamp when the model was registered + RL.TrainingCheckpoint: + type: object + description: Saved training checkpoint + properties: + id: + type: string + example: 123e4567-e89b-12d3-a456-426614174000 + description: Unique identifier for the checkpoint + step: + type: string + format: uint64 + example: "42" + description: Training step at time of save + created_at: + type: string + format: date-time + example: "2026-01-02T00:00:00Z" + description: Timestamp when the checkpoint was created + 
RL.TrainingCheckpointResult: + type: object + properties: + checkpoint_id: + type: string + example: 550e8400-e29b-41d4-a716-446655440000 + description: ID of the saved training checkpoint (pass as resume_from_checkpoint_id when starting a training session to resume from it) + RL.TrainingCheckpointOperation: + type: object + properties: + id: + type: string + example: 550e8400-e29b-41d4-a716-446655440000 + description: Operation ID + status: + $ref: '#/components/schemas/RL.TrainingOperationStatus' + example: TRAINING_OPERATION_STATUS_PENDING + description: Operation status + output: + $ref: '#/components/schemas/RL.TrainingCheckpointResult' + error: + $ref: '#/components/schemas/RL.TrainingOperationError' RL.CheckpointVariant: type: string enum: From 464236095739f7f5786512eb52b12c95fe4fcf3c Mon Sep 17 00:00:00 2001 From: Soroush Date: Tue, 17 Mar 2026 14:09:10 -0700 Subject: [PATCH 2/2] Update openapi.yaml Co-authored-by: Gleb Khaykin --- openapi.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openapi.yaml b/openapi.yaml index 01390ca..272d625 100644 --- a/openapi.yaml +++ b/openapi.yaml @@ -7495,7 +7495,7 @@ paths: /rl/training-sessions/{session_id}/operations/training-checkpoint: post: summary: Save training checkpoint - description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step) to object storage. + description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step). operationId: createTrainingCheckpoint tags: [RL] responses: