fix: first request no longer blocks on model switch — uses background task + SSE

The FIRST request that triggers a model switch was blocking the HTTP response for 10-30s while waiting for the sidecar to load the model. Hermes Desktop's client timed out during this wait, causing 'nothing happens' on new session. Fix: refactored the proxy handler so ALL requests during a model switch use the same SSE streaming pattern (immediate 200, progress events, then actual response piped through after switch completes). The switch now runs as a background asyncio task via create_task(). - Added _background_switch() — runs POST /models/switch in background task with complete_switch() + drain_queue() in finally block - All switch-triggering requests go through queue_request() + StreamingResponse - SSE generator now falls through to OpenRouter/LXC if Main PC unreachable (switch failure case) instead of hanging indefinitely Sidecar fixes from previous commit: - _kill_llama_server() is now async with proper await on process termination - _switch_lock changed from threading.Lock to asyncio.Lock()
2026-06-18 00:10:32 +00:00 · 2026-06-18 00:10:32 +00:00 · b3ac21b2c0
commit b3ac21b2c0
parent 45dd793b69
1 changed files with 116 additions and 64 deletions
--- a/main.py
+++ b/main.py
@ -141,6 +141,49 @@ def complete_switch():
            _switching_event.set()
 async def _background_switch(requested_model: str):
    """Run a model switch in the background.
    The sidecar POST is awaited but the caller gets an immediate SSE stream
    so Hermes Desktop doesn't timeout waiting for the first response.
    Called via asyncio.create_task() so it runs concurrently with the
    SSE stream being sent to the client.
    """
    try:
        async with httpx.AsyncClient(timeout=120.0) as client:
            switch_resp = await client.post(
                f"{SIDECAR_URL}/models/switch",
                json={"profile_id": requested_model},
            )
            switch_result = switch_resp.json()
            if switch_result.get("status") == "ready":
                print(
                    f"SWITCH SUCCESS: profile={requested_model}",
                    flush=True,
                )
            else:
                circuit_record_failure()
                print(
                    f"SWITCH FAILED: profile={requested_model}, "
                    f"status={switch_result.get('status')}, "
                    f"message={switch_result.get('message', '(no message)')}",
                    flush=True,
                )
    except Exception as e:
        circuit_record_failure()
        print(
            f"SWITCH EXCEPTION: profile={requested_model}, "
            f"error={type(e).__name__}: {e}",
            flush=True,
        )
    finally:
        # Signal all queued requests so they can proceed (and fall
        # through to the fallback chain if the switch failed).
        complete_switch()
        drain_queue()
 # ─── App ─────────────────────────────────────────────────────────────────────
@asynccontextmanager
 async def lifespan(app: FastAPI):
@ -466,29 +509,35 @@ async def proxy(
        if requested_model and sidecar_status.get("active_profile") == requested_model:
            target_url = f"{MAIN_PC_BASE}/{path}"
        elif requested_model and is_chat_request:
-            # Trigger switch for a specific model request
+            # All requests during a model switch get an immediate SSE streaming
-            # Check if a switch is already in progress
+            # response so clients (Hermes Desktop) don't timeout while waiting
            # for the model to load (10-30s).  The switch runs in a background
            # task; the SSE stream yields progress events, then pipes through
            # the actual response once the backend model is ready.
            current_switch = await wait_for_switch()
            if current_switch is None:
                # No switch in progress — start one in the background
                await start_switch()
                asyncio.create_task(_background_switch(requested_model))
-            if current_switch is not None and not current_switch.is_set():
+            # Queue this request — signals when switch completes
                # Another request started the switch — queue this one
            try:
                wait_evt = await queue_request()
            except HTTPException as he:
                raise
-                # SSE progress while waiting
+            # Build request headers once
            req_headers = dict(request.headers)
            req_headers.pop("host", None)
            async def stream_with_sse():
                sse_gen = sse_progress_stream(wait_evt)
                try:
                    await wait_evt.wait()
                    async for sse_chunk in sse_gen:
                        yield sse_chunk
-                        complete_switch()
+                    # Send actual request to Main PC
                        drain_queue()
                    async with httpx.AsyncClient(timeout=60.0) as c:
                            req_headers = dict(request.headers)
                            req_headers.pop("host", None)
                        async with c.stream(
                            request.method,
                            f"{MAIN_PC_BASE}/{path}",
@ -497,8 +546,47 @@ async def proxy(
                        ) as resp:
                            async for chunk in resp.aiter_bytes():
                                yield chunk
                except Exception:
                    # Main PC unreachable (switch failed or server died) —
                    # try fallback chain
                    yield _sse_format(
                        "error",
                        {"message": "Backend unreachable, trying fallback..."},
                    )
                    # Try OpenRouter
                    if OPENROUTER_API_KEY:
                        try:
                            fb_headers = dict(req_headers)
                            fb_headers["Authorization"] = f"Bearer {OPENROUTER_API_KEY}"
                            async with httpx.AsyncClient(timeout=60.0) as c:
                                async with c.stream(
                                    request.method,
                                    f"{OPENROUTER_BASE}/{path}",
                                    content=body,
                                    headers=fb_headers,
                                ) as resp:
                                    async for chunk in resp.aiter_bytes():
                                        yield chunk
                                    return
                        except Exception:
                            pass
                    # Fallback to LXC SLM
                    try:
                        async with httpx.AsyncClient(timeout=60.0) as c:
                            async with c.stream(
                                request.method,
                                f"{FALLBACK_SLM_URL}/{path}",
                                content=body,
                                headers=req_headers,
                            ) as resp:
                                async for chunk in resp.aiter_bytes():
                                    yield chunk
                    except Exception:
                        yield _sse_format(
                            "error",
                            {"message": "All backends unavailable"},
                        )
                finally:
                        # Clean up sse_gen
                    try:
                        await sse_gen.aclose()
                    except Exception:
@ -509,42 +597,6 @@ async def proxy(
                media_type="text/event-stream",
            )
            # First request triggers the switch
            await start_switch()  # Create event for tracking
            try:
                async with httpx.AsyncClient(timeout=120.0) as client:
                    switch_resp = await client.post(
                        f"{SIDECAR_URL}/models/switch",
                        json={"profile_id": requested_model},
                    )
                    switch_result = switch_resp.json()
                    if switch_result.get("status") == "ready":
                        complete_switch()
                        drain_queue()
                        target_url = f"{MAIN_PC_BASE}/{path}"
                    else:
                        error = "switch_failed"
                        circuit_record_failure()
                        print(
                            f"SWITCH FAILED: profile={requested_model}, "
                            f"sidecar_status={switch_result.get('status')}, "
                            f"message={switch_result.get('message', '(no message)')}",
                            flush=True,
                        )
            except Exception as e:
                error = f"switch_error: {str(e)}"
                circuit_record_failure()
                print(
                    f"SWITCH EXCEPTION: profile={requested_model}, "
                    f"error={type(e).__name__}: {e}",
                    flush=True,
                )
            # Always signal queued requests on switch completion/failure
            # so they don't hang forever waiting for a switch that never finishes.
            complete_switch()
            drain_queue()
        else:
            # No model in request body (probe/GET/non-chat request) —
            # route to the currently active backend when available,