From b3ac21b2c040de91bb02a210a92daebfda07d7c0 Mon Sep 17 00:00:00 2001
From: root <root@hermes.chiabur.xyz>
Date: Thu, 18 Jun 2026 00:10:32 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20first=20request=20no=20longer=20blocks?=
 =?UTF-8?q?=20on=20model=20switch=20=E2=80=94=20uses=20background=20task?=
 =?UTF-8?q?=20+=20SSE?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FIRST request that triggers a model switch was blocking the HTTP response
for 10-30s while waiting for the sidecar to load the model. Hermes Desktop's
client timed out during this wait, causing 'nothing happens' on new session.

Fix: refactored the proxy handler so ALL requests during a model switch use
the same SSE streaming pattern (immediate 200, progress events, then actual
response piped through after switch completes). The switch now runs as a
background asyncio task via create_task().

- Added _background_switch() — runs POST /models/switch in background task
  with complete_switch() + drain_queue() in finally block
- All switch-triggering requests go through queue_request() + StreamingResponse
- SSE generator now falls through to OpenRouter/LXC if Main PC unreachable
  (switch failure case) instead of hanging indefinitely

Sidecar fixes from previous commit:
- _kill_llama_server() is now async with proper await on process termination
- _switch_lock changed from threading.Lock to asyncio.Lock()
---
 main.py | 180 ++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 116 insertions(+), 64 deletions(-)

diff --git a/main.py b/main.py
index 2f2101e..099c32e 100644
--- a/main.py
+++ b/main.py
@@ -141,6 +141,49 @@ def complete_switch():
             _switching_event.set()
 
 
+async def _background_switch(requested_model: str):
+    """Run a model switch in the background.
+
+    The sidecar POST is awaited but the caller gets an immediate SSE stream
+    so Hermes Desktop doesn't timeout waiting for the first response.
+
+    Called via asyncio.create_task() so it runs concurrently with the
+    SSE stream being sent to the client.
+    """
+    try:
+        async with httpx.AsyncClient(timeout=120.0) as client:
+            switch_resp = await client.post(
+                f"{SIDECAR_URL}/models/switch",
+                json={"profile_id": requested_model},
+            )
+            switch_result = switch_resp.json()
+            if switch_result.get("status") == "ready":
+                print(
+                    f"SWITCH SUCCESS: profile={requested_model}",
+                    flush=True,
+                )
+            else:
+                circuit_record_failure()
+                print(
+                    f"SWITCH FAILED: profile={requested_model}, "
+                    f"status={switch_result.get('status')}, "
+                    f"message={switch_result.get('message', '(no message)')}",
+                    flush=True,
+                )
+    except Exception as e:
+        circuit_record_failure()
+        print(
+            f"SWITCH EXCEPTION: profile={requested_model}, "
+            f"error={type(e).__name__}: {e}",
+            flush=True,
+        )
+    finally:
+        # Signal all queued requests so they can proceed (and fall
+        # through to the fallback chain if the switch failed).
+        complete_switch()
+        drain_queue()
+
+
 # ─── App ─────────────────────────────────────────────────────────────────────
 @asynccontextmanager
 async def lifespan(app: FastAPI):
@@ -466,84 +509,93 @@ async def proxy(
         if requested_model and sidecar_status.get("active_profile") == requested_model:
             target_url = f"{MAIN_PC_BASE}/{path}"
         elif requested_model and is_chat_request:
-            # Trigger switch for a specific model request
-            # Check if a switch is already in progress
+            # All requests during a model switch get an immediate SSE streaming
+            # response so clients (Hermes Desktop) don't timeout while waiting
+            # for the model to load (10-30s).  The switch runs in a background
+            # task; the SSE stream yields progress events, then pipes through
+            # the actual response once the backend model is ready.
             current_switch = await wait_for_switch()
+            if current_switch is None:
+                # No switch in progress — start one in the background
+                await start_switch()
+                asyncio.create_task(_background_switch(requested_model))
 
-            if current_switch is not None and not current_switch.is_set():
-                # Another request started the switch — queue this one
+            # Queue this request — signals when switch completes
+            try:
+                wait_evt = await queue_request()
+            except HTTPException as he:
+                raise
+
+            # Build request headers once
+            req_headers = dict(request.headers)
+            req_headers.pop("host", None)
+
+            async def stream_with_sse():
+                sse_gen = sse_progress_stream(wait_evt)
                 try:
-                    wait_evt = await queue_request()
-                except HTTPException as he:
-                    raise
-
-                # SSE progress while waiting
-                async def stream_with_sse():
-                    sse_gen = sse_progress_stream(wait_evt)
+                    await wait_evt.wait()
+                    async for sse_chunk in sse_gen:
+                        yield sse_chunk
+                    # Send actual request to Main PC
+                    async with httpx.AsyncClient(timeout=60.0) as c:
+                        async with c.stream(
+                            request.method,
+                            f"{MAIN_PC_BASE}/{path}",
+                            content=body,
+                            headers=req_headers,
+                        ) as resp:
+                            async for chunk in resp.aiter_bytes():
+                                yield chunk
+                except Exception:
+                    # Main PC unreachable (switch failed or server died) —
+                    # try fallback chain
+                    yield _sse_format(
+                        "error",
+                        {"message": "Backend unreachable, trying fallback..."},
+                    )
+                    # Try OpenRouter
+                    if OPENROUTER_API_KEY:
+                        try:
+                            fb_headers = dict(req_headers)
+                            fb_headers["Authorization"] = f"Bearer {OPENROUTER_API_KEY}"
+                            async with httpx.AsyncClient(timeout=60.0) as c:
+                                async with c.stream(
+                                    request.method,
+                                    f"{OPENROUTER_BASE}/{path}",
+                                    content=body,
+                                    headers=fb_headers,
+                                ) as resp:
+                                    async for chunk in resp.aiter_bytes():
+                                        yield chunk
+                                    return
+                        except Exception:
+                            pass
+                    # Fallback to LXC SLM
                     try:
-                        await wait_evt.wait()
-                        async for sse_chunk in sse_gen:
-                            yield sse_chunk
-                        complete_switch()
-                        drain_queue()
                         async with httpx.AsyncClient(timeout=60.0) as c:
-                            req_headers = dict(request.headers)
-                            req_headers.pop("host", None)
                             async with c.stream(
                                 request.method,
-                                f"{MAIN_PC_BASE}/{path}",
+                                f"{FALLBACK_SLM_URL}/{path}",
                                 content=body,
                                 headers=req_headers,
                             ) as resp:
                                 async for chunk in resp.aiter_bytes():
                                     yield chunk
-                    finally:
-                        # Clean up sse_gen
-                        try:
-                            await sse_gen.aclose()
-                        except Exception:
-                            pass
-
-                return StreamingResponse(
-                    stream_with_sse(),
-                    media_type="text/event-stream",
-                )
-
-            # First request triggers the switch
-            await start_switch()  # Create event for tracking
-            try:
-                async with httpx.AsyncClient(timeout=120.0) as client:
-                    switch_resp = await client.post(
-                        f"{SIDECAR_URL}/models/switch",
-                        json={"profile_id": requested_model},
-                    )
-                    switch_result = switch_resp.json()
-                    if switch_result.get("status") == "ready":
-                        complete_switch()
-                        drain_queue()
-                        target_url = f"{MAIN_PC_BASE}/{path}"
-                    else:
-                        error = "switch_failed"
-                        circuit_record_failure()
-                        print(
-                            f"SWITCH FAILED: profile={requested_model}, "
-                            f"sidecar_status={switch_result.get('status')}, "
-                            f"message={switch_result.get('message', '(no message)')}",
-                            flush=True,
+                    except Exception:
+                        yield _sse_format(
+                            "error",
+                            {"message": "All backends unavailable"},
                         )
-            except Exception as e:
-                error = f"switch_error: {str(e)}"
-                circuit_record_failure()
-                print(
-                    f"SWITCH EXCEPTION: profile={requested_model}, "
-                    f"error={type(e).__name__}: {e}",
-                    flush=True,
-                )
+                finally:
+                    try:
+                        await sse_gen.aclose()
+                    except Exception:
+                        pass
 
-            # Always signal queued requests on switch completion/failure
-            # so they don't hang forever waiting for a switch that never finishes.
-            complete_switch()
-            drain_queue()
+            return StreamingResponse(
+                stream_with_sse(),
+                media_type="text/event-stream",
+            )
 
         else:
             # No model in request body (probe/GET/non-chat request) —