fix: capture llama-server stderr, fix YAML boolean flag conversion, reduce polling timeout

Four fixes for the model-not-loading bug: 1. **YAML boolean → CLI flag bug**: YAML parses unquoted on/off/yes/no as Python bools. str(True)='True', which is INVALID for llama.cpp's --flash-attn flag (expects 'on'/'off'/'auto'). Added a _flag_value() converter that maps bools to 'on'/'off' strings. 2. **llama-server stderr was DEVNULL**: All error messages (bad model path, OOM, invalid flag) were invisible. They are now captured to /tmp/llama-server-stderr.log and dumped to the sidecar log on failure. 3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to 60 retries × 0.5s = 30s. Still dumps stderr + exit code on failure. 4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at 128K context (~24GB on 24GB RTX 3090 — borderline OOM). Changed to q4_0 (~18GB, comfortable).
2026-06-16 00:06:45 +00:00 · 2026-06-16 00:06:45 +00:00 · 37fee5341e
commit 37fee5341e
parent 903f06c634
2 changed files with 62 additions and 8 deletions
--- a/deploy/manifest.yaml
+++ b/deploy/manifest.yaml
@ -184,8 +184,8 @@
  flags:
    n_ctx: 131072
    n_gpu_layers: 999
-    cache-type-k: q8_0
+    cache-type-k: q4_0
-    cache-type-v: q8_0
+    cache-type-v: q4_0
    flash-attn: on
    temp: 1.0
    top_p: 0.95
--- a/sidecar/app.py
+++ b/sidecar/app.py
@ -19,6 +19,7 @@ from sidecar.manifest import load_manifest
 MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
 SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
 LLAMA_SERVER_PORT = 8081
 LLAMA_STDERR_LOG = "/tmp/llama-server-stderr.log"
 # Global state
 _llama_server_process: Optional[asyncio.subprocess.Process] = None
@ -40,8 +41,20 @@ async def lifespan(app: FastAPI):
 app = FastAPI(lifespan=lifespan)
 def _close_stderr_log():
    """Close the stderr log handle stored on the llama-server process, if any.

    Looks for a ``_stderr_fh`` attribute on the module-level
    ``_llama_server_process`` and closes it when it is present and still
    open.  Does nothing when there is no process, no handle, or the handle
    is already closed.  Errors raised by ``close()`` are ignored.
    """
    proc = _llama_server_process
    if proc is None:
        return
    log_handle = getattr(proc, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return
    try:
        log_handle.close()
    except Exception:
        # Best-effort cleanup: a failing close() is deliberately swallowed.
        pass
 def _kill_llama_server():
-    """Kill the llama-server subprocess."""
+    """Kill the llama-server subprocess and close its stderr log handle."""
    global _llama_server_process
    if _llama_server_process and _llama_server_process.returncode is None:
        try:
@ -54,6 +67,20 @@ def _kill_llama_server():
            pass
        _llama_server_process = None
    # Close stderr log handle if still open
    _close_stderr_log()
 def _flag_value(value) -> str:
    """Convert a manifest flag value to a llama-server CLI argument string.
    YAML booleans (True/False/on/off/yes/no) are parsed as Python bools by
    safe_load.  llama-server expects 'on'/'off' for boolean flags, not 'True'/'False'.
    """
    if isinstance(value, bool):
        return "on" if value else "off"
    return str(value)
 async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration."""
@ -67,22 +94,31 @@ async def _start_llama_server(profile: dict):
    cmd += ["--model", profile["model_path"]]
    cmd += ["--port", str(LLAMA_SERVER_PORT)]
    for key, value in profile.get("flags", {}).items():
-        cmd += ["--" + key, str(value)]
+        cmd += ["--" + key, _flag_value(value)]
    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    _llama_server_process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.DEVNULL,
-        stderr=asyncio.subprocess.DEVNULL,
+        stderr=stderr_fh,
    )
    # Keep a reference so we can close the handle later
    _llama_server_process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    return _llama_server_process
-async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
+async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
-    """Poll llama-server readiness via /v1/models endpoint."""
+    """Poll llama-server readiness via /v1/models endpoint.
    Returns True on success.  On failure, dumps the captured stderr (if any)
    so the user can see why llama-server crashed.
    """
    import httpx
-    for _ in range(max_retries):
+    for attempt in range(max_retries):
        try:
            async with httpx.AsyncClient(timeout=2.0) as client:
                resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")
@ -91,6 +127,24 @@ async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5
        except Exception:
            pass
        await asyncio.sleep(interval)
    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        with open(LLAMA_STDERR_LOG) as f:
            for line in f:
                print(f"  {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print("  (stderr log not found — process may not have started)", flush=True)
    # Also log exit code if the process died
    global _llama_server_process
    if _llama_server_process and _llama_server_process.returncode is not None:
        print(
            f"llama-server exited with code {_llama_server_process.returncode}",
            flush=True,
        )
    return False