From d3174587dc00b8e0afa0413611028919341deb2b Mon Sep 17 00:00:00 2001
From: Andrew Beveridge <andrew@beveridge.uk>
Date: Thu, 26 Mar 2026 12:35:16 -0400
Subject: [PATCH] fix: synchronous endpoint + no semaphore, let Cloud Run scale
 like Modal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

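The fire-and-forget + semaphore design caused all jobs to queue on one
instance. Cloud Run could not see the background threads as "busy", so
it never scaled out to new instances.

Fix: drop the GPU semaphore and make the /separate endpoint await the
executor, so the request stays open until the job finishes. With
concurrency=1, Cloud Run sees each request as active during processing
and scales to new GPU instances for concurrent jobs — matching Modal's
.spawn(), where each job gets its own GPU instance. The endpoint now
returns the final result instead of an immediate "submitted" response.
Increase the client POST timeout from 60s to 1800s to match.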

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 audio_separator/remote/api_client.py      |  4 ++--
 audio_separator/remote/deploy_cloudrun.py | 29 ++++++++---------------
 2 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/audio_separator/remote/api_client.py b/audio_separator/remote/api_client.py
index 1ff8ec6..977b6e5 100644
--- a/audio_separator/remote/api_client.py
+++ b/audio_separator/remote/api_client.py
@@ -148,7 +148,7 @@ def separate_audio(
             data["custom_output_names"] = json.dumps(custom_output_names)
 
         try:
-            # Server returns immediately with task_id; 60s is generous for submission
+            # Server processes synchronously; 1800s matches Cloud Run request timeout.
             # When using gcs_uri (no file upload), we still need multipart/form-data
             # encoding because FastAPI requires it for endpoints with File()/Form() params.
             # Passing a dummy empty file field forces requests to use multipart encoding.
@@ -158,7 +158,7 @@ def separate_audio(
                 f"{self.api_url}/separate",
                 files=files,
                 data=data,
-                timeout=60,
+                timeout=1800,
             )
             response.raise_for_status()
             return response.json()
diff --git a/audio_separator/remote/deploy_cloudrun.py b/audio_separator/remote/deploy_cloudrun.py
index 0f5fa12..ccd2c8d 100644
--- a/audio_separator/remote/deploy_cloudrun.py
+++ b/audio_separator/remote/deploy_cloudrun.py
@@ -55,7 +55,6 @@
 models_ready = False
 
 # --- Async job infrastructure ---
-gpu_semaphore = threading.Semaphore(1)
 
 OUTPUT_BUCKET = os.environ.get("OUTPUT_BUCKET", "nomadkaraoke-audio-separator-outputs")
 GCP_PROJECT = os.environ.get("GCP_PROJECT", "nomadkaraoke")
@@ -231,11 +230,8 @@ def update_status(status: str, progress: int = 0, error: str = None, files: dict
         except Exception as e:
             logger.warning(f"[{task_id}] Failed to update Firestore status: {e}")
 
-    # Wait for GPU availability
-    update_status("queued", 0)
-    logger.info(f"[{task_id}] Waiting for GPU semaphore...")
-    gpu_semaphore.acquire()
-    logger.info(f"[{task_id}] GPU semaphore acquired, starting separation")
+    update_status("processing", 0)
+    logger.info(f"[{task_id}] Starting separation")
     try:
         os.makedirs(f"{STORAGE_DIR}/outputs/{task_id}", exist_ok=True)
         output_dir = f"{STORAGE_DIR}/outputs/{task_id}"
@@ -379,8 +375,7 @@ def update_status(status: str, progress: int = 0, error: str = None, files: dict
         return {"task_id": task_id, "status": "error", "error": str(e), "models_used": models_used}
 
     finally:
-        gpu_semaphore.release()
-        logger.info(f"[{task_id}] GPU semaphore released")
+        logger.info(f"[{task_id}] Separation finished, cleaning up local files")
         # Clean up local files (outputs are in GCS now)
         output_dir = f"{STORAGE_DIR}/outputs/{task_id}"
         if os.path.exists(output_dir):
@@ -507,9 +502,12 @@ async def separate_audio(
             "instance_id": instance_id,
         })
 
-        # Fire-and-forget: run separation in background thread
+        # Await the separation (run on the default executor) before responding.
+        # Cloud Run treats the instance as busy while the request is open, so
+        # with concurrency=1 the autoscaler routes new requests to new instances.
+        # This matches Modal's .spawn() pattern: each job gets its own GPU instance.
         loop = asyncio.get_event_loop()
-        loop.run_in_executor(
+        result = await loop.run_in_executor(
             None,
             lambda: separate_audio_sync(
                 audio_data,
@@ -551,15 +549,8 @@ async def separate_audio(
             ),
         )
 
-        # Return immediately — client polls /status/{task_id}
-        return {
-            "task_id": task_id,
-            "status": "submitted",
-            "progress": 0,
-            "original_filename": filename,
-            "models_used": [f"preset:{preset}"] if preset else (models_list or ["default"]),
-            "total_models": 1 if preset else (len(models_list) if models_list else 1),
-        }
+        # Return the completed/error result (Firestore + GCS already updated by separate_audio_sync)
+        return result
 
     except HTTPException:
         raise