fix: capture llama-server stderr, fix YAML boolean flag conversion, reduce polling timeout
Four fixes for the model-not-loading bug: 1. **YAML boolean → CLI flag bug**: YAML parses 'on'/'off'/'yes'/'no' as Python bools. str(True)='True' which is INVALID for llama.cpp's --flash-attn flag (expects 'on'/'off'/'auto'). Added _flag_value() converter that maps bools to 'on'/'off' strings. 2. **llama-server stderr was DEVNULL**: All error messages (bad model path, OOM, invalid flag) were invisible. Now captured to /tmp/llama-server-stderr.log and dumped to the sidecar log on failure. 3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to 60 retries × 0.5s = 30s. Still dumps stderr + exit code on failure. 4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at 128K context (~24GB on 24GB RTX 3090 — borderline OOM). Changed to q4_0 (~18GB, comfortable).
This commit is contained in:
parent
903f06c634
commit
37fee5341e
@ -184,8 +184,8 @@
|
|||||||
flags:
|
flags:
|
||||||
n_ctx: 131072
|
n_ctx: 131072
|
||||||
n_gpu_layers: 999
|
n_gpu_layers: 999
|
||||||
cache-type-k: q8_0
|
cache-type-k: q4_0
|
||||||
cache-type-v: q8_0
|
cache-type-v: q4_0
|
||||||
flash-attn: on
|
flash-attn: on
|
||||||
temp: 1.0
|
temp: 1.0
|
||||||
top_p: 0.95
|
top_p: 0.95
|
||||||
|
|||||||
@ -19,6 +19,7 @@ from sidecar.manifest import load_manifest
|
|||||||
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
|
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
|
||||||
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
|
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
|
||||||
LLAMA_SERVER_PORT = 8081
|
LLAMA_SERVER_PORT = 8081
|
||||||
|
LLAMA_STDERR_LOG = "/tmp/llama-server-stderr.log"
|
||||||
|
|
||||||
# Global state
|
# Global state
|
||||||
_llama_server_process: Optional[asyncio.subprocess.Process] = None
|
_llama_server_process: Optional[asyncio.subprocess.Process] = None
|
||||||
@ -40,8 +41,20 @@ async def lifespan(app: FastAPI):
|
|||||||
app = FastAPI(lifespan=lifespan)
|
app = FastAPI(lifespan=lifespan)
|
||||||
|
|
||||||
|
|
||||||
|
def _close_stderr_log():
    """Close the stderr log file handle attached to the llama-server process.

    The handle is looked up as the ``_stderr_fh`` attribute of the module-level
    ``_llama_server_process`` object. The call is a no-op when there is no
    tracked process, when the process carries no ``_stderr_fh`` attribute, or
    when the handle is already closed.

    Closing is best-effort: an ``OSError`` raised by ``close()`` is reported on
    stdout instead of being propagated, so cleanup paths never fail because of
    the log file.

    NOTE(review): this reads ``_llama_server_process`` at call time. If a caller
    resets that global to ``None`` before invoking this function, nothing is
    closed — confirm the ordering in ``_kill_llama_server``.
    """
    # Read-only access to the module global; no ``global`` statement needed.
    process = _llama_server_process
    if process is None:
        return

    fh = getattr(process, "_stderr_fh", None)
    if fh is None or fh.closed:
        return

    try:
        fh.close()
    except OSError as exc:
        # Best-effort cleanup: surface the problem without crashing the caller.
        print(f"Failed to close llama-server stderr log: {exc}", flush=True)
|
||||||
|
|
||||||
|
|
||||||
def _kill_llama_server():
|
def _kill_llama_server():
|
||||||
"""Kill the llama-server subprocess."""
|
"""Kill the llama-server subprocess and close its stderr log handle."""
|
||||||
global _llama_server_process
|
global _llama_server_process
|
||||||
if _llama_server_process and _llama_server_process.returncode is None:
|
if _llama_server_process and _llama_server_process.returncode is None:
|
||||||
try:
|
try:
|
||||||
@ -54,6 +67,20 @@ def _kill_llama_server():
|
|||||||
pass
|
pass
|
||||||
_llama_server_process = None
|
_llama_server_process = None
|
||||||
|
|
||||||
|
# Close stderr log handle if still open
|
||||||
|
_close_stderr_log()
|
||||||
|
|
||||||
|
|
||||||
|
def _flag_value(value) -> str:
    """Render a manifest flag value as a llama-server command-line argument.

    Python ``bool`` values are rendered as ``"on"`` (True) or ``"off"``
    (False) rather than ``"True"``/``"False"``. Every other value is passed
    through ``str()`` unchanged.
    """
    # Only genuine bools get the on/off spelling; ints such as 0/1 are not
    # instances of bool and fall through to plain str().
    if not isinstance(value, bool):
        return str(value)
    return ("off", "on")[value]
|
||||||
|
|
||||||
|
|
||||||
async def _start_llama_server(profile: dict):
|
async def _start_llama_server(profile: dict):
|
||||||
"""Start llama-server with the given profile's configuration."""
|
"""Start llama-server with the given profile's configuration."""
|
||||||
@ -67,22 +94,31 @@ async def _start_llama_server(profile: dict):
|
|||||||
cmd += ["--model", profile["model_path"]]
|
cmd += ["--model", profile["model_path"]]
|
||||||
cmd += ["--port", str(LLAMA_SERVER_PORT)]
|
cmd += ["--port", str(LLAMA_SERVER_PORT)]
|
||||||
for key, value in profile.get("flags", {}).items():
|
for key, value in profile.get("flags", {}).items():
|
||||||
cmd += ["--" + key, str(value)]
|
cmd += ["--" + key, _flag_value(value)]
|
||||||
|
|
||||||
print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
|
print(f"Starting llama-server: {' '.join(cmd)}", flush=True)
|
||||||
|
|
||||||
|
# Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
|
||||||
|
stderr_fh = open(LLAMA_STDERR_LOG, "w")
|
||||||
_llama_server_process = await asyncio.create_subprocess_exec(
|
_llama_server_process = await asyncio.create_subprocess_exec(
|
||||||
*cmd,
|
*cmd,
|
||||||
stdout=asyncio.subprocess.DEVNULL,
|
stdout=asyncio.subprocess.DEVNULL,
|
||||||
stderr=asyncio.subprocess.DEVNULL,
|
stderr=stderr_fh,
|
||||||
)
|
)
|
||||||
|
# Keep a reference so we can close the handle later
|
||||||
|
_llama_server_process._stderr_fh = stderr_fh # type: ignore[attr-defined]
|
||||||
return _llama_server_process
|
return _llama_server_process
|
||||||
|
|
||||||
|
|
||||||
async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
|
async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
|
||||||
"""Poll llama-server readiness via /v1/models endpoint."""
|
"""Poll llama-server readiness via /v1/models endpoint.
|
||||||
|
|
||||||
|
Returns True on success. On failure, dumps the captured stderr (if any)
|
||||||
|
so the user can see why llama-server crashed.
|
||||||
|
"""
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
for _ in range(max_retries):
|
for attempt in range(max_retries):
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=2.0) as client:
|
async with httpx.AsyncClient(timeout=2.0) as client:
|
||||||
resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")
|
resp = await client.get(f"http://localhost:{LLAMA_SERVER_PORT}/v1/models")
|
||||||
@ -91,6 +127,24 @@ async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5
|
|||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
await asyncio.sleep(interval)
|
await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
# ── Dump stderr for diagnosis ──────────────────────────────────────
|
||||||
|
print("llama-server did NOT become ready — dumping stderr:", flush=True)
|
||||||
|
try:
|
||||||
|
with open(LLAMA_STDERR_LOG) as f:
|
||||||
|
for line in f:
|
||||||
|
print(f" {line.rstrip()}", flush=True)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print(" (stderr log not found — process may not have started)", flush=True)
|
||||||
|
|
||||||
|
# Also log exit code if the process died
|
||||||
|
global _llama_server_process
|
||||||
|
if _llama_server_process and _llama_server_process.returncode is not None:
|
||||||
|
print(
|
||||||
|
f"llama-server exited with code {_llama_server_process.returncode}",
|
||||||
|
flush=True,
|
||||||
|
)
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user