intelligence-router/sidecar/app.py
root 45dd793b69 fix: sidecar process kill was not awaiting wait() — old server held GPU VRAM
- _kill_llama_server() was synchronous and called process.wait() without awaiting it.
  The call only created a coroutine object that was discarded, so the old llama-server
  was never waited on to release GPU memory before a new one was started, causing OOM
  on rapid model switches. Fixed by making the function async: it awaits wait() with a
  10s timeout after SIGTERM and falls back to SIGKILL.

- Changed _switch_lock from threading.Lock to asyncio.Lock() to prevent event loop
  deadlock during long switch operations.

- Router proxy: only trigger model switches for POST /v1/chat/completions and
  /v1/completions. Non-chat endpoints (GET probes, /api/show) no longer trigger
  unwanted model reloads.

- _ollama_show_lookup: return active profile context size when model_name is empty.
  Previously returned 404, causing Hermes Desktop to default to 256k context.

- Always drain_queue() + complete_switch() after switch failure so queued requests
  don't hang forever waiting on a never-set switching event.
2026-06-17 23:49:57 +00:00

261 lines
8.5 KiB
Python

"""Sidecar FastAPI service — Issue #2 foundation.
Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
"""
import os
import asyncio
import signal as signal_module
from contextlib import asynccontextmanager
from typing import Optional
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from pydantic import BaseModel
from sidecar.manifest import load_manifest
# Settings taken from the environment; each falls back to the default shown.
MANIFEST_PATH = os.environ.get("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.environ.get("SIDECAR_PORT", "8080"))
# Port handed to llama-server via --port and probed for readiness.
LLAMA_SERVER_PORT = 8081
# llama-server's stderr is written to a log file beside the manifest.
LLAMA_STDERR_LOG = os.path.join(
    os.path.dirname(MANIFEST_PATH),
    "llama-server-stderr.log",
)

# Mutable module-level state
_llama_server_process: Optional[asyncio.subprocess.Process] = None  # current child, if any
_active_profile: Optional[str] = None  # id of the profile that last became ready
_switch_lock = asyncio.Lock()  # asyncio lock: waiting on it never blocks the event loop
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Sidecar lifespan hook: announce startup, stop llama-server on shutdown.

    Nothing is launched before the ``yield``, so no model is loaded by default.
    """
    startup_banner = f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}"
    print(startup_banner, flush=True)
    yield
    # Shutdown path: do not leave a llama-server child behind.
    if not _llama_server_process:
        return
    await _kill_llama_server()
# ASGI application object; `lifespan` above supplies its startup/shutdown handling.
app = FastAPI(lifespan=lifespan)
def _close_stderr_log():
    """Close the stderr log handle stored on the current llama-server process.

    Does nothing when there is no process, when the process carries no
    ``_stderr_fh`` attribute, or when that handle is already closed. Errors
    raised while closing are ignored.
    """
    proc = _llama_server_process
    if proc is None:
        return
    log_handle = getattr(proc, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return
    try:
        log_handle.close()
    except Exception:
        # Best effort only: a failed close must not disturb the caller.
        pass
async def _kill_llama_server():
    """Stop the llama-server subprocess and wait for it to terminate.

    Sends SIGTERM and waits up to 10 seconds for the process to exit. If it
    is still alive, sends SIGKILL and waits up to 5 more seconds.

    This MUST be async because ``process.wait()`` is a coroutine: calling it
    without ``await`` only creates a coroutine object, so the old process
    would never actually be waited on and could still hold GPU VRAM when the
    new server starts.

    If there is no process, or it has already exited, only the stderr log
    handle is closed. Errors while signalling or waiting are reported on
    stdout and never raised to the caller.
    """
    global _llama_server_process
    proc = _llama_server_process
    if proc is None or proc.returncode is not None:
        _close_stderr_log()
        return
    try:
        proc.send_signal(signal_module.SIGTERM)
        try:
            await asyncio.wait_for(proc.wait(), timeout=10)
        except asyncio.TimeoutError:
            # Graceful shutdown took too long; escalate to SIGKILL.
            proc.kill()
            try:
                await asyncio.wait_for(proc.wait(), timeout=5)
            except asyncio.TimeoutError:
                print(
                    "llama-server still running 5s after SIGKILL; giving up waiting",
                    flush=True,
                )
    except ProcessLookupError:
        # The process vanished between the returncode check and the signal.
        pass
    except Exception as exc:
        # Stopping is best effort, but leave a trace instead of hiding the error.
        print(f"Error while stopping llama-server: {exc!r}", flush=True)
    finally:
        # Close the log handle BEFORE dropping the process reference:
        # _close_stderr_log() finds the handle through _llama_server_process,
        # so clearing the global first would leave the handle open.
        _close_stderr_log()
        _llama_server_process = None
def _flag_value(value) -> str:
    """Render a manifest flag value as a llama-server CLI argument string.

    Booleans become ``'on'`` / ``'off'`` (YAML's safe_load yields Python bools
    for True/False/on/off/yes/no, and llama-server does not accept
    ``'True'`` / ``'False'``). Every other value is passed through ``str()``.
    """
    if not isinstance(value, bool):
        return str(value)
    if value:
        return "on"
    return "off"
def _flag_key(key: str) -> str:
    """Map a manifest flag key to the llama-server CLI flag name (no ``--``).

    Underscores are turned into hyphens, because llama-server flags are
    hyphenated (--ctx-size, --n-gpu-layers) while YAML keys often use
    underscores. Known renames across llama.cpp versions are then applied
    (e.g. n-ctx -> ctx-size).
    """
    renamed_flags = {
        "n-ctx": "ctx-size",
    }
    hyphenated = "-".join(key.split("_"))
    if hyphenated in renamed_flags:
        return renamed_flags[hyphenated]
    return hyphenated
async def _start_llama_server(profile: dict):
    """Start llama-server configured from the given manifest profile.

    Any existing llama-server is stopped first. The command line is built
    from ``profile["model_path"]``, the fixed port/host, and one
    ``--<flag> <value>`` pair per entry in ``profile["flags"]``.

    Args:
        profile: Manifest profile mapping. ``model_path`` is required;
            ``flags`` is optional and may be missing or empty.

    Returns:
        The started ``asyncio.subprocess.Process``, also stored in the
        module-level ``_llama_server_process``.

    Raises:
        KeyError: If the profile has no ``model_path``.
        OSError: If the stderr log cannot be opened or the binary cannot be
            launched.
    """
    global _llama_server_process
    # Stop any existing process before launching a new one.
    await _kill_llama_server()

    cmd = [
        "/home/bigt/AI/llama.cpp/build/bin/llama-server",
        "--model", profile["model_path"],
        "--port", str(LLAMA_SERVER_PORT),
        "--host", "0.0.0.0",
    ]
    # `or {}` also covers a `flags` key that is present but None.
    for key, value in (profile.get("flags") or {}).items():
        cmd += ["--" + _flag_key(key), _flag_value(value)]
    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)

    # Capture stderr so crashes can be diagnosed (model not found, OOM, bad flag).
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    try:
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=stderr_fh,
        )
    except BaseException:
        # Launch failed or was cancelled: no process will own the handle,
        # so close it here rather than leak it.
        stderr_fh.close()
        raise
    # Keep a reference on the process so the handle can be closed later.
    process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    _llama_server_process = process
    return process
async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
    """Poll llama-server's /v1/models endpoint until it answers HTTP 200.

    Args:
        max_retries: Maximum number of probe attempts.
        interval: Seconds to sleep between attempts.

    Returns:
        True as soon as a probe returns status 200. False if the retry
        budget runs out or the tracked llama-server process has already
        exited; in that case the captured stderr log and the exit code are
        printed so the user can see why llama-server did not come up.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"
    last_error = None
    # One client is reused for all attempts instead of building one per probe.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                # The server already exited; more polling cannot succeed.
                break
            try:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return True
            except Exception as exc:
                # Connection errors are expected while the model is loading;
                # keep the last one for the failure report.
                last_error = exc
            await asyncio.sleep(interval)

    # Release our handle on the log file before reading it back.
    _close_stderr_log()
    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        # errors="replace": undecodable bytes must not abort the dump.
        with open(LLAMA_STDERR_LOG, errors="replace") as f:
            for line in f:
                print(f" {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print(" (stderr log not found — process may not have started)", flush=True)
    except OSError as exc:
        print(f" (could not read stderr log: {exc})", flush=True)
    if last_error is not None:
        print(f"last readiness probe error: {last_error!r}", flush=True)
    # Also log the exit code if the process died.
    proc = _llama_server_process
    if proc and proc.returncode is not None:
        print(
            f"llama-server exited with code {proc.returncode}",
            flush=True,
        )
    return False
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest, or fail with HTTP 500."""
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    # load_manifest returned None, which is treated as a parse failure.
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running."""
    proc = _llama_server_process
    # Running means a process is tracked and it has no exit code yet.
    server_alive = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": server_alive,
    }
# Request body accepted by POST /models/switch.
class SwitchRequest(BaseModel):
    # Compared against each manifest profile's "id" to select the profile to load.
    profile_id: str
@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Switch llama-server to the requested profile and wait for readiness.

    The whole operation runs under ``_switch_lock`` so switches are
    serialised. Steps: load the manifest, find the profile whose ``id``
    matches ``payload.profile_id``, stop the current llama-server, start a
    new one, then poll until it is ready.

    Returns:
        ``{"status": "ready", "active_profile": <id>}`` on success, including
        when the requested profile is already active and its process is
        still running. Otherwise a JSON error response: 500 if the manifest
        cannot be loaded, 404 if the profile id is unknown, 500 if
        llama-server cannot be started or does not become ready.
    """
    global _active_profile
    async with _switch_lock:
        # Validate profile_id against the manifest.
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )
        profile = next(
            (p for p in profiles if p["id"] == payload.profile_id),
            None,
        )
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Short-circuit only if the server for this profile is still alive.
        # If it crashed after loading, fall through and restart it instead of
        # reporting "ready" for a dead process.
        server_running = (
            _llama_server_process is not None
            and _llama_server_process.returncode is None
        )
        if _active_profile == payload.profile_id and server_running:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # From here on no profile is active until the new server is ready.
        _active_profile = None
        await _kill_llama_server()
        try:
            await _start_llama_server(profile)
        except (OSError, KeyError) as exc:
            # Binary missing, log file not writable, or profile lacks model_path.
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": f"Failed to start llama-server: {exc}"},
            )

        # Poll for readiness.
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Not ready: stop the half-started server so it does not stay running
        # with no active profile.
        await _kill_llama_server()
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )