From 37fee5341ed5cc4265f2766663673753e8b90413 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 16 Jun 2026 00:06:45 +0000 Subject: [PATCH] fix: capture llama-server stderr, fix YAML boolean flag conversion, reduce polling timeout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four fixes for the model-not-loading bug: 1. **YAML boolean → CLI flag bug**: YAML parses 'on'/'off'/'yes'/'no' as Python bools. str(True)='True' which is INVALID for llama.cpp's --flash-attn flag (expects 'on'/'off'/'auto'). Added _flag_value() converter that maps bools to 'on'/'off' strings. 2. **llama-server stderr was DEVNULL**: All error messages (bad model path, OOM, invalid flag) were invisible. Now captured to /tmp/llama-server-stderr.log and dumped to the sidecar log on failure. 3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to 60 retries × 0.5s = 30s. Still dumps stderr + exit code on failure. 4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at 128K context (~24GB on 24GB RTX 3090 — borderline OOM). Changed to q4_0 (~18GB, comfortable). 
--- deploy/manifest.yaml | 4 +-- sidecar/app.py | 66 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 62 insertions(+), 8 deletions(-) diff --git a/deploy/manifest.yaml b/deploy/manifest.yaml index 8255afe..ad888c3 100644 --- a/deploy/manifest.yaml +++ b/deploy/manifest.yaml @@ -184,8 +184,8 @@ flags: n_ctx: 131072 n_gpu_layers: 999 - cache-type-k: q8_0 - cache-type-v: q8_0 + cache-type-k: q4_0 + cache-type-v: q4_0 flash-attn: on temp: 1.0 top_p: 0.95 diff --git a/sidecar/app.py b/sidecar/app.py index cf9aacd..19e15b3 100644 --- a/sidecar/app.py +++ b/sidecar/app.py @@ -19,6 +19,7 @@ from sidecar.manifest import load_manifest MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml") SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080")) LLAMA_SERVER_PORT = 8081 +LLAMA_STDERR_LOG = "/tmp/llama-server-stderr.log" # Global state _llama_server_process: Optional[asyncio.subprocess.Process] = None @@ -40,8 +41,20 @@ async def lifespan(app: FastAPI): app = FastAPI(lifespan=lifespan) +def _close_stderr_log(): + """Close the stderr log file handle if it's still attached to the process.""" + global _llama_server_process + if _llama_server_process is not None: + fh = getattr(_llama_server_process, "_stderr_fh", None) + if fh is not None and not fh.closed: + try: + fh.close() + except Exception: + pass + + def _kill_llama_server(): - """Kill the llama-server subprocess.""" + """Kill the llama-server subprocess and close its stderr log handle.""" global _llama_server_process if _llama_server_process and _llama_server_process.returncode is None: try: @@ -54,6 +67,20 @@ def _kill_llama_server(): pass _llama_server_process = None + # Close stderr log handle if still open + _close_stderr_log() + + +def _flag_value(value) -> str: + """Convert a manifest flag value to a llama-server CLI argument string. + + YAML booleans (True/False/on/off/yes/no) are parsed as Python bools by + safe_load. 
llama-server expects 'on'/'off' for boolean flags, not 'True'/'False'. + """ + if isinstance(value, bool): + return "on" if value else "off" + return str(value) + async def _start_llama_server(profile: dict): """Start llama-server with the given profile's configuration.""" @@ -67,22 +94,31 @@ async def _start_llama_server(profile: dict): cmd += ["--model", profile["model_path"]] cmd += ["--port", str(LLAMA_SERVER_PORT)] for key, value in profile.get("flags", {}).items(): - cmd += ["--" + key, str(value)] + cmd += ["--" + key, _flag_value(value)] print(f"Starting llama-server: {' '.join(cmd)}", flush=True) + + # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag) + stderr_fh = open(LLAMA_STDERR_LOG, "w") _llama_server_process = await asyncio.create_subprocess_exec( *cmd, stdout=asyncio.subprocess.DEVNULL, - stderr=asyncio.subprocess.DEVNULL, + stderr=stderr_fh, ) + # Keep a reference so we can close the handle later + _llama_server_process._stderr_fh = stderr_fh # type: ignore[attr-defined] return _llama_server_process -async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5): - """Poll llama-server readiness via /v1/models endpoint.""" +async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5): + """Poll llama-server readiness via /v1/models endpoint. + + Returns True on success. On failure, dumps the captured stderr (if any) + so the user can see why llama-server crashed. 
+ """ import httpx - for _ in range(max_retries): + for attempt in range(max_retries): try: async with httpx.AsyncClient(timeout=2.0) as client: resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models") @@ -91,6 +127,24 @@ async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5 except Exception: pass await asyncio.sleep(interval) + + # ── Dump stderr for diagnosis ────────────────────────────────────── + print("llama-server did NOT become ready — dumping stderr:", flush=True) + try: + with open(LLAMA_STDERR_LOG) as f: + for line in f: + print(f" {line.rstrip()}", flush=True) + except FileNotFoundError: + print(" (stderr log not found — process may not have started)", flush=True) + + # Also log exit code if the process died + global _llama_server_process + if _llama_server_process and _llama_server_process.returncode is not None: + print( + f"llama-server exited with code {_llama_server_process.returncode}", + flush=True, + ) + return False