From 673be259b6426e5fb752fac45697a2acd9c9564e Mon Sep 17 00:00:00 2001
From: Soroush Bassam <sbassam@together.ai>
Date: Mon, 16 Mar 2026 20:50:47 -0700
Subject: [PATCH 1/2] update specs for rl training checkpoint save/resume

---
 openapi.yaml | 114 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 111 insertions(+), 3 deletions(-)

diff --git a/openapi.yaml b/openapi.yaml
index 13d9f86..01390ca 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -7492,6 +7492,64 @@ paths:
           schema:
             description: Operation ID
             type: string
+  /rl/training-sessions/{session_id}/operations/training-checkpoint:
+    post:
+      summary: Save training checkpoint
+      description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step) to object storage.
+      operationId: createTrainingCheckpoint
+      tags: [RL]
+      responses:
+        "200":
+          description: Save training checkpoint operation details
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
+        default:
+          description: An unexpected error response.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorData'
+      parameters:
+        - name: session_id
+          in: path
+          required: true
+          schema:
+            description: Training session ID
+            type: string
+  /rl/training-sessions/{session_id}/operations/training-checkpoint/{operation_id}:
+    get:
+      summary: Get save training checkpoint operation
+      description: Retrieves the current status and result of a save training checkpoint operation.
+      operationId: getTrainingCheckpointOperation
+      tags: [RL]
+      responses:
+        "200":
+          description: Save training checkpoint operation details
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/RL.TrainingCheckpointOperation'
+        default:
+          description: An unexpected error response.
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorData'
+      parameters:
+        - name: session_id
+          in: path
+          required: true
+          schema:
+            description: Training session ID
+            type: string
+        - name: operation_id
+          in: path
+          required: true
+          schema:
+            description: Operation ID
+            type: string
   /rl/checkpoints/{id}/download:
     get:
       summary: Download checkpoint
@@ -7976,10 +8034,10 @@ components:
           description: Base model to use for the training session
           type: string
           example: meta-llama/Meta-Llama-3-8B-Instruct
-        checkpoint_id:
-          description: Checkpoint ID to use for the training session
+        resume_from_checkpoint_id:
+          description: Checkpoint ID to resume from
           type: string
-          example: checkpoint-123
+          example: 123e4567-e89b-12d3-a456-426614174000
         lora_config:
           $ref: '#/components/schemas/RL.LoraConfig'
     RL.TrainingSessionStatus:
@@ -8015,6 +8073,16 @@ components:
             type: object
             $ref: '#/components/schemas/RL.InferenceCheckpoint'
           description: List of saved inference checkpoints for this session
+        training_checkpoints:
+          type: array
+          items:
+            type: object
+            $ref: '#/components/schemas/RL.TrainingCheckpoint'
+          description: List of saved training checkpoints for this session
+        resume_from_checkpoint_id:
+          type: string
+          example: 123e4567-e89b-12d3-a456-426614174000
+          description: Checkpoint ID this session was resumed from
         step:
           description: Current training step
           type: string
@@ -8151,6 +8219,46 @@ components:
           format: date-time
           example: "2026-01-02T00:00:00Z"
           description: Timestamp when the model was registered
+    RL.TrainingCheckpoint:
+      type: object
+      description: Saved training checkpoint
+      properties:
+        id:
+          type: string
+          example: 123e4567-e89b-12d3-a456-426614174000
+          description: Unique identifier for the checkpoint
+        step:
+          type: string
+          format: uint64
+          example: "42"  # quoted so the example matches the declared string type
+          description: Training step at time of save
+        created_at:
+          type: string
+          format: date-time
+          example: "2026-01-02T00:00:00Z"
+          description: Timestamp when the checkpoint was created
+    RL.TrainingCheckpointResult:
+      type: object
+      properties:
+        checkpoint_id:
+          type: string
+          example: 123e4567-e89b-12d3-a456-426614174000
+          description: ID of the saved training checkpoint (pass as resume_from_checkpoint_id to resume training from it)
+    RL.TrainingCheckpointOperation:
+      type: object
+      properties:
+        id:
+          type: string
+          example: 550e8400-e29b-41d4-a716-446655440000
+          description: Operation ID
+        status:
+          $ref: '#/components/schemas/RL.TrainingOperationStatus'
+          example: TRAINING_OPERATION_STATUS_PENDING
+          description: Operation status
+        output:
+          $ref: '#/components/schemas/RL.TrainingCheckpointResult'
+        error:
+          $ref: '#/components/schemas/RL.TrainingOperationError'
     RL.CheckpointVariant:
       type: string
       enum:

From 464236095739f7f5786512eb52b12c95fe4fcf3c Mon Sep 17 00:00:00 2001
From: Soroush <soroush.bassam@gmail.com>
Date: Tue, 17 Mar 2026 14:09:10 -0700
Subject: [PATCH 2/2] Drop object-storage detail from save training checkpoint description

Co-authored-by: Gleb Khaykin <khaykingleb@gmail.com>
---
 openapi.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/openapi.yaml b/openapi.yaml
index 01390ca..272d625 100644
--- a/openapi.yaml
+++ b/openapi.yaml
@@ -7495,7 +7495,7 @@ paths:
   /rl/training-sessions/{session_id}/operations/training-checkpoint:
     post:
       summary: Save training checkpoint
-      description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step) to object storage.
+      description: Submits an operation that will asynchronously save the full training state (adapter + optimizer + step).
       operationId: createTrainingCheckpoint
       tags: [RL]
       responses: