Four fixes for the model-not-loading bug: 1. **YAML boolean → CLI flag bug**: YAML parses 'on'/'off'/'yes'/'no' as Python bools. str(True)='True', which is INVALID for llama.cpp's --flash-attn flag (expects 'on'/'off'/'auto'). Added a _flag_value() converter that maps bools to 'on'/'off' strings. 2. **llama-server stderr was DEVNULL**: All error messages (bad model path, OOM, invalid flag) were invisible. They are now captured to /tmp/llama-server-stderr.log and dumped to the sidecar log on failure. 3. **Reduce polling timeout**: 240 retries × 0.5s = 120s hang. Reduced to 60 retries × 0.5s = 30s. Still dumps stderr + exit code on failure. 4. **Manifest VRAM fix**: gemma4-26b-compact-long-128k used q8_0 KV cache at 128K context (~24GB on a 24GB RTX 3090 — borderline OOM). Changed to q4_0 (~18GB, comfortable).
228 lines
7.3 KiB
Python
228 lines
7.3 KiB
Python
"""Sidecar FastAPI service — Issue #2 foundation.

Runs on the Main PC, manages the llama-server subprocess, and serves manifest/profile data.
"""
|
|
import os
|
|
import asyncio
|
|
import signal as signal_module
|
|
import threading
|
|
from contextlib import asynccontextmanager
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
|
|
from sidecar.manifest import load_manifest
|
|
|
|
# Configuration from environment
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8080"))
# Port the managed llama-server listens on and the file its stderr is captured
# to. Both are overridable like the settings above; the defaults are the values
# that used to be hard-coded, so existing deployments behave the same.
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8081"))
LLAMA_STDERR_LOG = os.getenv("LLAMA_STDERR_LOG", "/tmp/llama-server-stderr.log")

# Global state
# Handle of the currently managed llama-server child, or None when none is tracked.
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Id of the manifest profile whose server passed the readiness poll, else None.
_active_profile: Optional[str] = None
# Serialises /models/switch requests.
_switch_lock = threading.Lock()  # Use threading.Lock for compatibility with TestClient
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage sidecar lifecycle — no default model loaded.

    Logs the configured manifest path and port on startup, hands control to
    the application, and on the way out kills the llama-server subprocess if
    one is tracked.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}", flush=True)
    try:
        yield
    finally:
        # Cleanup: kill llama-server if running. This lives in ``finally`` so
        # the child is not orphaned when the context is left through an
        # exception or cancellation rather than a clean shutdown.
        if _llama_server_process:
            _kill_llama_server()


app = FastAPI(lifespan=lifespan)
|
|
|
|
|
|
def _close_stderr_log():
    """Close the stderr log handle attached to the tracked llama-server process.

    Does nothing when no process is tracked, when the process object carries
    no ``_stderr_fh`` attribute, or when that handle is already closed. Any
    exception raised by ``close()`` is ignored: this is best-effort cleanup.
    """
    proc = _llama_server_process
    if proc is None:
        return

    log_handle = getattr(proc, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return

    try:
        log_handle.close()
    except Exception:
        # Nothing useful can be done if closing the log fails.
        pass
|
|
|
|
|
|
def _kill_llama_server():
    """Stop the tracked llama-server subprocess and close its stderr log handle.

    Sends SIGTERM to the process if it is still running. This function is
    synchronous and therefore cannot await the exit itself; when called from
    inside a running event loop it schedules a background task that waits a
    grace period and escalates to SIGKILL. Afterwards the stderr log handle is
    closed and the global process reference is cleared.
    """
    global _llama_server_process

    proc = _llama_server_process
    if proc is not None and proc.returncode is None:
        try:
            proc.send_signal(signal_module.SIGTERM)
        except ProcessLookupError:
            # The process exited between the returncode check and the signal.
            pass
        except Exception as exc:
            # Best-effort shutdown: report, but never let cleanup raise.
            print(f"Failed to signal llama-server: {exc}", flush=True)
        else:
            # Process.wait() is a coroutine and takes no timeout, so the
            # "wait 5s, then kill" step has to run as a task on the loop.
            try:
                loop = asyncio.get_running_loop()
            except RuntimeError:
                loop = None  # no loop running: SIGTERM is all we can do here
            if loop is not None:
                task = loop.create_task(_reap_llama_server(proc))
                _reaper_tasks.add(task)
                task.add_done_callback(_reaper_tasks.discard)

    # Close the stderr log handle while the global still points at the
    # process; _close_stderr_log() finds the handle through that reference.
    _close_stderr_log()
    _llama_server_process = None


# Strong references to in-flight reaper tasks so they are not garbage
# collected before they finish.
_reaper_tasks: set = set()


async def _reap_llama_server(proc, grace: float = 5.0):
    """Wait for a signalled llama-server to exit; SIGKILL it after *grace* seconds."""
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            return  # exited on its own in the meantime
        await proc.wait()
|
|
|
|
|
|
def _flag_value(value) -> str:
    """Render a manifest flag value as a llama-server CLI argument string.

    Manifest values such as on/off/yes/no arrive here as Python bools, and
    ``str(True)`` would produce ``'True'``. Bools are therefore rendered as
    ``'on'`` / ``'off'``; every other value is passed through ``str()``.
    """
    # bool cannot be subclassed, so identity checks cover exactly the bools.
    if value is True:
        return "on"
    if value is False:
        return "off"
    return str(value)
|
|
|
|
|
|
async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Args:
        profile: Manifest profile. ``profile["model_path"]`` is required;
            ``profile["flags"]`` is an optional mapping of flag name to
            value, each emitted as ``--<name> <value>``.

    Returns:
        The spawned ``asyncio.subprocess.Process``, which is also stored in
        the module-level ``_llama_server_process``.

    Raises:
        KeyError: if the profile has no ``model_path``.
        OSError: if the stderr log cannot be opened or the binary cannot be
            executed.
    """
    global _llama_server_process

    # Kill any existing process
    _kill_llama_server()

    # Build command from profile flags
    cmd = [
        "/home/bigt/AI/llama.cpp/build/bin/llama-server",
        "--model", profile["model_path"],
        "--port", str(LLAMA_SERVER_PORT),
    ]
    # "flags:" with nothing under it loads as None, hence the "or {}".
    for key, value in (profile.get("flags") or {}).items():
        cmd.append("--" + key)
        # A flag without a value is emitted bare rather than as "--flag None".
        if value is not None:
            cmd.append(_flag_value(value))

    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)

    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    try:
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=stderr_fh,
        )
    except BaseException:
        # The spawn failed or was cancelled: nothing will ever close the
        # handle for us, so do it here before propagating.
        stderr_fh.close()
        raise

    # Keep a reference so we can close the handle later
    process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    _llama_server_process = process
    return process
|
|
|
|
|
|
async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
    """Poll llama-server readiness via its /v1/models endpoint.

    Args:
        max_retries: Maximum number of polling attempts.
        interval: Seconds to sleep between attempts.

    Returns:
        True as soon as the endpoint answers with HTTP 200. False if the
        tracked process exits first or the attempts are exhausted; in that
        case the captured stderr log and, if the process has exited, its exit
        code are printed so the user can see why llama-server failed.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"

    # One client for the whole poll instead of a new one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                # The server already exited; waiting out the remaining
                # attempts could never succeed.
                break
            try:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return True
            except (httpx.HTTPError, OSError):
                # Not accepting connections yet (or timed out) — retry.
                pass
            await asyncio.sleep(interval)

    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        # errors="replace": undecodable bytes in the log must not abort the dump.
        with open(LLAMA_STDERR_LOG, errors="replace") as f:
            for line in f:
                print(f"  {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print("  (stderr log not found — process may not have started)", flush=True)

    # Also log exit code if the process died
    proc = _llama_server_process
    if proc and proc.returncode is not None:
        print(
            f"llama-server exited with code {proc.returncode}",
            flush=True,
        )

    return False
|
|
|
|
|
|
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest at ``MANIFEST_PATH``.

    Responds with HTTP 500 when ``load_manifest`` returns ``None``.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
|
|
|
|
|
|
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running.

    The server counts as running when a process is tracked and it has no
    return code yet.
    """
    proc = _llama_server_process
    is_running = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }
|
|
|
|
|
|
class SwitchRequest(BaseModel):
    # Request body for POST /models/switch.
    # profile_id is compared against the "id" of each profile in the manifest.
    profile_id: str
|
|
|
|
|
|
@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop current llama-server, start new one with the given profile, wait for readiness.

    Returns:
        ``{"status": "ready", "active_profile": <id>}`` once the requested
        profile is serving, or a JSON error response: 500 if the manifest
        cannot be loaded, 404 if the profile id is unknown, 500 if
        llama-server does not become ready.
    """
    global _active_profile

    # _switch_lock is a threading.Lock. Blocking on it inside a coroutine
    # would freeze the event loop, and with it the request that holds the
    # lock, so poll for it and yield to the loop between attempts.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next((p for p in profiles if p["id"] == payload.profile_id), None)
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Already running this profile. Only short-circuit when the process
        # is actually still alive; a server that died after becoming ready
        # must be restarted, not reported as ready.
        current = _llama_server_process
        if (
            _active_profile == payload.profile_id
            and current is not None
            and current.returncode is None
        ):
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Stop the old server and wait until it is really gone, so the new
        # one does not compete with it for the port and for GPU memory.
        previous = _llama_server_process
        _kill_llama_server()
        _active_profile = None
        await _await_llama_server_exit(previous)

        # Start the new model
        await _start_llama_server(profile)

        # Poll for readiness
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Do not leave a server that never became ready running.
        _kill_llama_server()
        _active_profile = None
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )
    finally:
        _switch_lock.release()


async def _await_llama_server_exit(proc, grace: float = 5.0):
    """Wait up to *grace* seconds for *proc* to exit, then SIGKILL it and reap it."""
    if proc is None or proc.returncode is not None:
        return
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            return  # exited on its own in the meantime
        await proc.wait()
|