fix: capture llama-server stderr, fix YAML boolean flag conversion, reduce polling timeout

Four fixes for the model-not-loading bug:

1. **YAML boolean → CLI flag bug**: YAML parses 'on'/'off'/'yes'/'no' as Python
   bools. str(True)='True' which is INVALID for llama.cpp's --flash-attn flag
   (expects 'on'/'off'/'auto'). Added _flag_value() converter that maps bools
   to 'on'/'off' strings.

2. **llama-server stderr was DEVNULL**: All error messages (bad model path,
   OOM, invalid flag) were invisible. Now captured to /tmp/llama-server-stderr.log
   and dumped to the sidecar log on failure.

3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to
   60 retries × 0.5s = 30s. Still dumps stderr + exit code on failure.

4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at
   128K context (~24GB on 24GB RTX 3090 — borderline OOM). Changed to q4_0
   (~18GB, comfortable).
This commit is contained in:
root 2026-06-16 00:06:45 +00:00
parent 903f06c634
commit 37fee5341e
2 changed files with 62 additions and 8 deletions

View File

@ -184,8 +184,8 @@
flags: flags:
n_ctx: 131072 n_ctx: 131072
n_gpu_layers: 999 n_gpu_layers: 999
cache-type-k: q8_0 cache-type-k: q4_0
cache-type-v: q8_0 cache-type-v: q4_0
flash-attn: on flash-attn: on
temp: 1.0 temp: 1.0
top_p: 0.95 top_p: 0.95

View File

@ -19,6 +19,7 @@ from sidecar.manifest import load_manifest
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml") MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080")) SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
LLAMA_SERVER_PORT = 8081 LLAMA_SERVER_PORT = 8081
LLAMA_STDERR_LOG = "/tmp/llama-server-stderr.log"
# Global state # Global state
_llama_server_process: Optional[asyncio.subprocess.Process] = None _llama_server_process: Optional[asyncio.subprocess.Process] = None
@ -40,8 +41,20 @@ async def lifespan(app: FastAPI):
app = FastAPI(lifespan=lifespan) app = FastAPI(lifespan=lifespan)
def _close_stderr_log():
    """Close the stderr log file handle attached to the llama-server process.

    Looks up the ``_stderr_fh`` attribute on the module-level
    ``_llama_server_process`` and closes it when it is present and still
    open. Does nothing when there is no process, no attached handle, or the
    handle is already closed.

    Closing is best-effort: a failure is reported on stdout instead of being
    raised, so cleanup never crashes the caller.
    """
    # Read-only access to the module global, so no ``global`` statement is needed.
    proc = _llama_server_process
    if proc is None:
        return

    fh = getattr(proc, "_stderr_fh", None)
    if fh is None or fh.closed:
        return

    try:
        fh.close()
    except Exception as exc:
        # Keep the original best-effort contract, but leave a trace instead
        # of silently discarding the error (e.g. a failed final flush).
        print(f"Failed to close llama-server stderr log: {exc}", flush=True)
def _kill_llama_server(): def _kill_llama_server():
"""Kill the llama-server subprocess.""" """Kill the llama-server subprocess and close its stderr log handle."""
global _llama_server_process global _llama_server_process
if _llama_server_process and _llama_server_process.returncode is None: if _llama_server_process and _llama_server_process.returncode is None:
try: try:
@ -54,6 +67,20 @@ def _kill_llama_server():
pass pass
_llama_server_process = None _llama_server_process = None
# Close stderr log handle if still open
_close_stderr_log()
def _flag_value(value) -> str:
"""Convert a manifest flag value to a llama-server CLI argument string.
YAML booleans (True/False/on/off/yes/no) are parsed as Python bools by
safe_load. llama-server expects 'on'/'off' for boolean flags, not 'True'/'False'.
"""
if isinstance(value, bool):
return "on" if value else "off"
return str(value)
async def _start_llama_server(profile: dict): async def _start_llama_server(profile: dict):
"""Start llama-server with the given profile's configuration.""" """Start llama-server with the given profile's configuration."""
@ -67,22 +94,31 @@ async def _start_llama_server(profile: dict):
cmd += ["--model", profile["model_path"]] cmd += ["--model", profile["model_path"]]
cmd += ["--port", str(LLAMA_SERVER_PORT)] cmd += ["--port", str(LLAMA_SERVER_PORT)]
for key, value in profile.get("flags", {}).items(): for key, value in profile.get("flags", {}).items():
cmd += ["--" + key, str(value)] cmd += ["--" + key, _flag_value(value)]
print(f"Starting llama-server: {' '.join(cmd)}", flush=True) print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
# Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
stderr_fh = open(LLAMA_STDERR_LOG, "w")
_llama_server_process = await asyncio.create_subprocess_exec( _llama_server_process = await asyncio.create_subprocess_exec(
*cmd, *cmd,
stdout=asyncio.subprocess.DEVNULL, stdout=asyncio.subprocess.DEVNULL,
stderr=asyncio.subprocess.DEVNULL, stderr=stderr_fh,
) )
# Keep a reference so we can close the handle later
_llama_server_process._stderr_fh = stderr_fh # type: ignore[attr-defined]
return _llama_server_process return _llama_server_process
async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5): async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
"""Poll llama-server readiness via /v1/models endpoint.""" """Poll llama-server readiness via /v1/models endpoint.
Returns True on success. On failure, dumps the captured stderr (if any)
so the user can see why llama-server crashed.
"""
import httpx import httpx
for _ in range(max_retries): for attempt in range(max_retries):
try: try:
async with httpx.AsyncClient(timeout=2.0) as client: async with httpx.AsyncClient(timeout=2.0) as client:
resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models") resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")
@ -91,6 +127,24 @@ async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5
except Exception: except Exception:
pass pass
await asyncio.sleep(interval) await asyncio.sleep(interval)
# ── Dump stderr for diagnosis ──────────────────────────────────────
print("llama-server did NOT become ready — dumping stderr:", flush=True)
try:
with open(LLAMA_STDERR_LOG) as f:
for line in f:
print(f" {line.rstrip()}", flush=True)
except FileNotFoundError:
print(" (stderr log not found — process may not have started)", flush=True)
# Also log exit code if the process died
global _llama_server_process
if _llama_server_process and _llama_server_process.returncode is not None:
print(
f"llama-server exited with code {_llama_server_process.returncode}",
flush=True,
)
return False return False