From 37fee5341ed5cc4265f2766663673753e8b90413 Mon Sep 17 00:00:00 2001
From: root <root@hermes.chiabur.xyz>
Date: Tue, 16 Jun 2026 00:06:45 +0000
Subject: [PATCH] fix: capture llama-server stderr, fix YAML boolean flag
 conversion, reduce polling timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four fixes for the model-not-loading bug:

1. **YAML boolean → CLI flag bug**: YAML parses unquoted on/off/yes/no as
   Python bools, and str(True) == 'True' is INVALID for llama.cpp's
   --flash-attn flag (expects 'on'/'off'/'auto'). Added a _flag_value()
   converter that maps bools to 'on'/'off' strings.

2. **llama-server stderr was DEVNULL**: All error messages (bad model path,
   OOM, invalid flag) were invisible. Now captured to /tmp/llama-server-stderr.log
   and dumped to the sidecar log on failure.

3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to
   60 retries × 0.5s = 30s. On timeout, the captured stderr and the exit
   code (if the process has exited) are now dumped to the sidecar log.

4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at
   128K context (~24GB on 24GB RTX 3090 — borderline OOM). Changed to q4_0
   (~18GB, comfortable).
---
 deploy/manifest.yaml |  4 +--
 sidecar/app.py       | 66 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/deploy/manifest.yaml b/deploy/manifest.yaml
index 8255afe..ad888c3 100644
--- a/deploy/manifest.yaml
+++ b/deploy/manifest.yaml
@@ -184,8 +184,8 @@
   flags:
     n_ctx: 131072
     n_gpu_layers: 999
-    cache-type-k: q8_0
-    cache-type-v: q8_0
+    cache-type-k: q4_0
+    cache-type-v: q4_0
     flash-attn: on
     temp: 1.0
     top_p: 0.95
diff --git a/sidecar/app.py b/sidecar/app.py
index cf9aacd..19e15b3 100644
--- a/sidecar/app.py
+++ b/sidecar/app.py
@@ -19,6 +19,7 @@ from sidecar.manifest import load_manifest
 MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
 SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
 LLAMA_SERVER_PORT = 8081
+LLAMA_STDERR_LOG = "/tmp/llama-server-stderr.log"
 
 # Global state
 _llama_server_process: Optional[asyncio.subprocess.Process] = None
@@ -40,8 +41,20 @@ async def lifespan(app: FastAPI):
 app = FastAPI(lifespan=lifespan)
 
 
+def _close_stderr_log():
+    """Close the stderr log file handle if it's still attached to the process."""
+    global _llama_server_process
+    if _llama_server_process is not None:
+        fh = getattr(_llama_server_process, "_stderr_fh", None)
+        if fh is not None and not fh.closed:
+            try:
+                fh.close()
+            except Exception:
+                pass
+
+
 def _kill_llama_server():
-    """Kill the llama-server subprocess."""
+    """Kill the llama-server subprocess and close its stderr log handle."""
     global _llama_server_process
     if _llama_server_process and _llama_server_process.returncode is None:
         try:
@@ -54,6 +67,20 @@ def _kill_llama_server():
             pass
         _llama_server_process = None
 
+    # Close stderr log handle if still open
+    _close_stderr_log()
+
+
+def _flag_value(value) -> str:
+    """Convert a manifest flag value to a llama-server CLI argument string.
+
+    YAML booleans (True/False/on/off/yes/no) are parsed as Python bools by
+    safe_load.  llama-server expects 'on'/'off' for boolean flags, not 'True'/'False'.
+    """
+    if isinstance(value, bool):
+        return "on" if value else "off"
+    return str(value)
+
 
 async def _start_llama_server(profile: dict):
     """Start llama-server with the given profile's configuration."""
@@ -67,22 +94,31 @@ async def _start_llama_server(profile: dict):
     cmd += ["--model", profile["model_path"]]
     cmd += ["--port", str(LLAMA_SERVER_PORT)]
     for key, value in profile.get("flags", {}).items():
-        cmd += ["--" + key, str(value)]
+        cmd += ["--" + key, _flag_value(value)]
 
     print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
+
+    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
+    stderr_fh = open(LLAMA_STDERR_LOG, "w")
     _llama_server_process = await asyncio.create_subprocess_exec(
         *cmd,
         stdout=asyncio.subprocess.DEVNULL,
-        stderr=asyncio.subprocess.DEVNULL,
+        stderr=stderr_fh,
     )
+    # Keep a reference so we can close the handle later
+    _llama_server_process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
     return _llama_server_process
 
 
-async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
-    """Poll llama-server readiness via /v1/models endpoint."""
+async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
+    """Poll llama-server readiness via /v1/models endpoint.
+
+    Returns True on success.  On failure, dumps the captured stderr (if any)
+    so the user can see why llama-server crashed.
+    """
     import httpx
 
-    for _ in range(max_retries):
+    for attempt in range(max_retries):
         try:
             async with httpx.AsyncClient(timeout=2.0) as client:
                 resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")
@@ -91,6 +127,24 @@ async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5
         except Exception:
             pass
         await asyncio.sleep(interval)
+
+    # ── Dump stderr for diagnosis ──────────────────────────────────────
+    print("llama-server did NOT become ready — dumping stderr:", flush=True)
+    try:
+        with open(LLAMA_STDERR_LOG) as f:
+            for line in f:
+                print(f"  {line.rstrip()}", flush=True)
+    except FileNotFoundError:
+        print("  (stderr log not found — process may not have started)", flush=True)
+
+    # Also log exit code if the process died
+    global _llama_server_process
+    if _llama_server_process and _llama_server_process.returncode is not None:
+        print(
+            f"llama-server exited with code {_llama_server_process.returncode}",
+            flush=True,
+        )
+
     return False