From 45dd793b6960a944161c20a4931b0058fdc5c896 Mon Sep 17 00:00:00 2001
From: root <root@hermes.chiabur.xyz>
Date: Wed, 17 Jun 2026 23:49:57 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20sidecar=20process=20kill=20was=20not=20a?=
 =?UTF-8?q?waiting=20wait()=20=E2=80=94=20old=20server=20held=20GPU=20VRAM?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- _kill_llama_server() was sync calling an unawaited coroutine. process.wait() created
  a discarded coroutine object — the old llama-server was never waited on to release
  GPU memory before starting a new one, causing OOM on rapid model switches.
  Fixed with async await + 10s SIGTERM timeout + SIGKILL fallback.

- Changed _switch_lock from threading.Lock to asyncio.Lock() to prevent event loop
  deadlock during long switch operations.

- Router proxy: only trigger model switches for POST /v1/chat/completions and
  /v1/completions. Non-chat endpoints (GET probes, /api/show) no longer trigger
  unwanted model reloads.

- _ollama_show_lookup: return active profile context size when model_name is empty.
  Previously returned 404, causing Hermes Desktop to default to 256k context.

- Always drain_queue() + complete_switch() after switch failure so queued requests
  don't hang forever waiting on a never-set switching event.
---
 main.py        | 64 +++++++++++++++++++++++++++++++++++++++++++-------
 sidecar/app.py | 53 ++++++++++++++++++++++++-----------------
 2 files changed, 87 insertions(+), 30 deletions(-)

diff --git a/main.py b/main.py
index d79e21f..2f2101e 100644
--- a/main.py
+++ b/main.py
@@ -251,14 +251,41 @@ async def ollama_show_post(request: Request):
 
 
 async def _ollama_show_lookup(model_name: str):
-    """Shared logic for Ollama /api/show model info lookup."""
+    """Shared logic for Ollama /api/show model info lookup.
+
+    When model_name is empty string (Hermes Desktop probe with no model field),
+    returns the currently-active profile's info so the desktop can determine
+    the correct context size. Previously returned 404, causing Hermes Desktop
+    to default to 256k context.
+    """
     async with httpx.AsyncClient(timeout=5.0) as client:
         try:
             resp = await client.get(f"{SIDECAR_URL}/models/available")
             profiles = resp.json()
+            status_resp = await client.get(f"{SIDECAR_URL}/models/status")
+            status = status_resp.json()
         except Exception:
             return JSONResponse(status_code=404, content={"error": "model not found"})
 
+    # If no model specified, return the currently-active profile's info
+    active_id = status.get("active_profile")
+    if not model_name and active_id:
+        for p in profiles:
+            if p.get("id") == active_id:
+                flags = p.get("flags", {})
+                ctx_size = str(flags.get("ctx-size", flags.get("n_ctx", "4096")))
+                return {
+                    "modelfile": "",
+                    "parameters": f"num_ctx {ctx_size}",
+                    "template": "",
+                    "details": {
+                        "format": "gguf",
+                        "family": p.get("name", "llm"),
+                        "parameter_size": ctx_size,
+                    },
+                    "model_info": {"id": p.get("id", "")},
+                }
+
     for p in profiles:
         if p.get("id") == model_name:
             # Extract actual context size from the profile's flags
@@ -428,9 +455,17 @@ async def proxy(
         body_data = json.loads(body) if body else {}
         requested_model = body_data.get("model")
 
+        # Only trigger model switches for actual chat/completion POST requests.
+        # GET probes, /api/show lookups, and other non-chat endpoints should
+        # never trigger a switch — they just read current state.
+        is_chat_request = (
+            request.method == "POST"
+            and path in ("v1/chat/completions", "v1/completions")
+        )
+
         if requested_model and sidecar_status.get("active_profile") == requested_model:
             target_url = f"{MAIN_PC_BASE}/{path}"
-        elif requested_model:
+        elif requested_model and is_chat_request:
             # Trigger switch for a specific model request
             # Check if a switch is already in progress
             current_switch = await wait_for_switch()
@@ -497,14 +532,25 @@ async def proxy(
                             flush=True,
                         )
             except Exception as e:
-                circuit_record_failure()
                 error = f"switch_error: {str(e)}"
-            else:
-                # No model in request body (probe/GET/non-chat request) —
-                # route to the currently active backend when available,
-                # or fall through to the fallback chain.
-                if sidecar_status.get("active_profile") and sidecar_status.get("llama_server_running"):
-                    target_url = f"{MAIN_PC_BASE}/{path}"
+                circuit_record_failure()
+                print(
+                    f"SWITCH EXCEPTION: profile={requested_model}, "
+                    f"error={type(e).__name__}: {e}",
+                    flush=True,
+                )
+
+            # Always signal queued requests on switch completion/failure
+            # so they don't hang forever waiting for a switch that never finishes.
+            complete_switch()
+            drain_queue()
+
+        else:
+            # No model in request body (probe/GET/non-chat request) —
+            # route to the currently active backend when available,
+            # or fall through to the fallback chain.
+            if sidecar_status.get("active_profile") and sidecar_status.get("llama_server_running"):
+                target_url = f"{MAIN_PC_BASE}/{path}"
 
     # ── Fallback chain ────────────────────────────────────────────────────
     if target_url is None:
diff --git a/sidecar/app.py b/sidecar/app.py
index 734ba9b..c9a8be8 100644
--- a/sidecar/app.py
+++ b/sidecar/app.py
@@ -5,7 +5,6 @@ Runs on the Main PC, manages llama-server subprocess, serves manifest/profile da
 import os
 import asyncio
 import signal as signal_module
-import threading
 from contextlib import asynccontextmanager
 from typing import Optional
 
@@ -26,7 +25,7 @@ LLAMA_STDERR_LOG = os.path.join(
 # Global state
 _llama_server_process: Optional[asyncio.subprocess.Process] = None
 _active_profile: Optional[str] = None
-_switch_lock = threading.Lock()  # Use threading.Lock for compatibility with TestClient
+_switch_lock = asyncio.Lock()  # Use asyncio.Lock to avoid blocking the event loop
 
 
 @asynccontextmanager
@@ -37,7 +36,7 @@ async def lifespan(app: FastAPI):
     # Cleanup: kill llama-server if running
     global _llama_server_process
     if _llama_server_process:
-        _kill_llama_server()
+        await _kill_llama_server()
 
 
 app = FastAPI(lifespan=lifespan)
@@ -55,22 +54,34 @@ def _close_stderr_log():
                 pass
 
 
-def _kill_llama_server():
-    """Kill the llama-server subprocess and close its stderr log handle."""
-    global _llama_server_process
-    if _llama_server_process and _llama_server_process.returncode is None:
-        try:
-            _llama_server_process.send_signal(signal_module.SIGTERM)
-            try:
-                _llama_server_process.wait(timeout=5)
-            except asyncio.TimeoutError:
-                _llama_server_process.kill()
-        except Exception:
-            pass
-        _llama_server_process = None
+async def _kill_llama_server():
+    """Kill the llama-server subprocess and wait for it to fully terminate.
 
-    # Close stderr log handle if still open
-    _close_stderr_log()
+    This MUST be async because process.wait() is a coroutine. The synchronous
+    version was calling .wait() without await, creating an unawaited coroutine
+    object — the old process was never actually waited on, so it could still
+    hold GPU VRAM when the new server started.
+    """
+    global _llama_server_process
+    if _llama_server_process is None or _llama_server_process.returncode is not None:
+        _close_stderr_log()
+        return
+
+    try:
+        _llama_server_process.send_signal(signal_module.SIGTERM)
+        try:
+            await asyncio.wait_for(_llama_server_process.wait(), timeout=10)
+        except asyncio.TimeoutError:
+            _llama_server_process.kill()
+            try:
+                await asyncio.wait_for(_llama_server_process.wait(), timeout=5)
+            except asyncio.TimeoutError:
+                pass
+    except Exception:
+        pass
+    finally:
+        _llama_server_process = None
+        _close_stderr_log()
 
 
 def _flag_value(value) -> str:
@@ -105,7 +116,7 @@ async def _start_llama_server(profile: dict):
     global _llama_server_process
 
     # Kill any existing process
-    _kill_llama_server()
+    await _kill_llama_server()
 
     # Build command from profile flags
     cmd = ["/home/bigt/AI/llama.cpp/build/bin/llama-server"]
@@ -200,7 +211,7 @@ async def switch_model(payload: SwitchRequest):
     """Stop current llama-server, start new one with the given profile, wait for readiness."""
     global _active_profile
 
-    with _switch_lock:
+    async with _switch_lock:
         # Validate profile_id
         profiles = load_manifest(MANIFEST_PATH)
         if profiles is None:
@@ -229,7 +240,7 @@ async def switch_model(payload: SwitchRequest):
             }
 
         # Start the new model
-        _kill_llama_server()
+        await _kill_llama_server()
         _active_profile = None
         await _start_llama_server(profile)