Two changes to fix 'error: invalid argument: --n-ctx' during model switch: 1. sidecar/app.py: Added _flag_key() converter that normalises underscores to hyphens in flag names and handles the n_ctx→ctx-size rename. The code now converts e.g. n_gpu_layers → n-gpu-layers, top_p → top-p, top_k → top-k, min_p → min-p before passing to llama-server CLI. 2. deploy/manifest.yaml: Updated all 20 profiles to use correct llama-server flag names: n_ctx→ctx-size, n_gpu_layers→n-gpu-layers, top_p→top-p, top_k→top-k, min_p→min-p. All flags now use hyphens, matching what llama-server actually accepts.
249 lines
8.0 KiB
Python
249 lines
8.0 KiB
Python
"""Sidecar FastAPI service — Issue #2 foundation.
|
|
|
|
Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
|
|
"""
|
|
import os
|
|
import asyncio
|
|
import signal as signal_module
|
|
import threading
|
|
from contextlib import asynccontextmanager
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel
|
|
|
|
from sidecar.manifest import load_manifest
|
|
|
|
# Settings resolved once at import time; environment variables override the defaults.
MANIFEST_PATH = os.environ.get("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
SIDECAR_PORT = int(os.environ.get("SIDECAR_PORT", "8080"))
LLAMA_SERVER_PORT = 8081
# llama-server's stderr is captured in a log file located beside the manifest.
LLAMA_STDERR_LOG = os.path.join(
    os.path.split(MANIFEST_PATH)[0],
    "llama-server-stderr.log",
)

# Mutable module state shared by the helpers and request handlers below.
_llama_server_process: Optional[asyncio.subprocess.Process] = None  # current child, if any
_active_profile: Optional[str] = None  # id of the profile that last became ready
_switch_lock = threading.Lock()  # threading.Lock (not asyncio.Lock) so TestClient works
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: log on startup, stop llama-server on shutdown.

    No model is started here; a llama-server only exists after a switch request.
    """
    startup_banner = f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}"
    print(startup_banner, flush=True)
    yield
    # Shutdown path: nothing to do unless a llama-server handle is present.
    if not _llama_server_process:
        return
    _kill_llama_server()
|
|
|
|
|
|
# ASGI application object; the route decorators below register handlers on it,
# and `lifespan` supplies its startup/shutdown behaviour.
app = FastAPI(lifespan=lifespan)
|
|
|
|
|
|
def _close_stderr_log():
    """Close the stderr log handle attached to the current llama-server process.

    Does nothing when there is no process, when the process carries no
    ``_stderr_fh`` attribute, or when the handle is already closed. Errors
    raised while closing are ignored (best effort).
    """
    proc = _llama_server_process
    if proc is None:
        return

    log_handle = getattr(proc, "_stderr_fh", None)
    if log_handle is None or log_handle.closed:
        return

    try:
        log_handle.close()
    except Exception:
        # Best effort: a failed close must not break shutdown or switching.
        pass
|
|
|
|
|
|
def _kill_llama_server():
    """Signal the llama-server subprocess to stop and close its stderr log handle.

    If the tracked process is still running (``returncode is None``) it is sent
    SIGTERM and the module-level reference is cleared. In every case the stderr
    log handle attached to the process is closed.

    Signalling is best effort: any exception raised while stopping the process
    is swallowed so that shutdown and model switching can proceed.
    """
    global _llama_server_process

    was_running = (
        _llama_server_process is not None
        and _llama_server_process.returncode is None
    )
    if was_running:
        try:
            _llama_server_process.send_signal(signal_module.SIGTERM)
            try:
                # NOTE(review): asyncio.subprocess.Process.wait() is a coroutine
                # and takes no ``timeout`` argument, so for a real asyncio
                # process this call raises TypeError, which the outer handler
                # swallows; the process then only receives SIGTERM. A real
                # bounded wait needs ``await asyncio.wait_for(proc.wait(), 5)``,
                # i.e. making this function async.
                _llama_server_process.wait(timeout=5)
            except asyncio.TimeoutError:
                _llama_server_process.kill()
        except Exception:
            # Deliberate best effort (e.g. the process is already gone).
            pass

    # Close the log handle BEFORE dropping the reference: _close_stderr_log()
    # looks the handle up through _llama_server_process, so clearing the global
    # first would leave the file open.
    _close_stderr_log()

    if was_running:
        _llama_server_process = None
|
|
|
|
|
|
def _flag_value(value) -> str:
    """Render a manifest flag value as the string passed on the llama-server CLI.

    Python bools (which is what YAML true/false/on/off/yes/no become under
    safe_load) are rendered as 'on' / 'off' rather than 'True' / 'False';
    every other value is passed through ``str()``.
    """
    if not isinstance(value, bool):
        return str(value)
    if value:
        return "on"
    return "off"
|
|
|
|
|
|
def _flag_key(key: str) -> str:
    """Map a manifest flag key to the flag name used on the llama-server CLI.

    Underscores become hyphens (``n_gpu_layers`` -> ``n-gpu-layers``), after
    which known renames are applied (``n-ctx`` -> ``ctx-size``). The leading
    ``--`` is not added here.
    """
    # Hyphenated old name -> current llama-server name.
    renamed_flags = {"n-ctx": "ctx-size"}

    hyphenated = "-".join(key.split("_"))
    if hyphenated in renamed_flags:
        return renamed_flags[hyphenated]
    return hyphenated
|
|
|
|
|
|
async def _start_llama_server(profile: dict):
    """Start llama-server with the given profile's configuration.

    Any previously tracked llama-server is stopped first. The command line is
    built from ``profile["model_path"]``, the fixed LLAMA_SERVER_PORT and each
    entry of the optional ``profile["flags"]`` mapping (keys converted with
    _flag_key, values with _flag_value). The child's stderr is redirected to
    LLAMA_STDERR_LOG, which is truncated on every start.

    Returns:
        The spawned asyncio subprocess, also stored in ``_llama_server_process``.

    Raises:
        KeyError: if the profile has no ``model_path``.
        OSError: if the log file cannot be opened or the binary cannot be run.
    """
    global _llama_server_process

    # Kill any existing process
    _kill_llama_server()

    # Build command from profile flags
    cmd = ["/home/bigt/AI/llama.cpp/build/bin/llama-server"]
    cmd += ["--model", profile["model_path"]]
    cmd += ["--port", str(LLAMA_SERVER_PORT)]
    # `flags:` with no value parses to None in YAML; treat it like a missing key.
    flags = profile.get("flags") or {}
    for key, value in flags.items():
        cmd += ["--" + _flag_key(key), _flag_value(value)]

    print(f"Starting llama-server: {' '.join(cmd)}", flush=True)

    # Capture stderr so we can diagnose crashes (model not found, OOM, bad flag)
    stderr_fh = open(LLAMA_STDERR_LOG, "w")
    try:
        process = await asyncio.create_subprocess_exec(
            *cmd,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=stderr_fh,
        )
    except BaseException:
        # Spawn failed (or was cancelled): no process will ever own this
        # handle, so close it here instead of leaking it, then re-raise.
        stderr_fh.close()
        raise

    # Keep a reference so the handle can be closed later via the process object
    process._stderr_fh = stderr_fh  # type: ignore[attr-defined]
    _llama_server_process = process
    return process
|
|
|
|
|
|
async def _poll_llama_server_ready(max_retries: int = 60, interval: float = 0.5):
    """Poll llama-server's /v1/models endpoint until it answers HTTP 200.

    Args:
        max_retries: Maximum number of probe attempts.
        interval: Seconds to sleep after each unsuccessful attempt.

    Returns:
        True as soon as a probe returns 200. False when every attempt fails or
        when the tracked llama-server process exits while polling; in that case
        the captured stderr log and the exit code (if any) are printed so the
        user can see why llama-server did not come up.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"

    for _ in range(max_retries):
        # Fail fast: once the child has exited (bad flag, missing model, OOM)
        # it can never become ready, so stop polling and report right away.
        proc = _llama_server_process
        if proc is not None and proc.returncode is not None:
            break
        try:
            async with httpx.AsyncClient(timeout=2.0) as client:
                resp = await client.get(url)
                if resp.status_code == 200:
                    return True
        except Exception:
            # Connection refused / timeouts are expected while the model loads.
            pass
        await asyncio.sleep(interval)

    # Flush and close the stderr handle so all data is on disk before we read
    _close_stderr_log()

    # ── Dump stderr for diagnosis ──────────────────────────────────────
    print("llama-server did NOT become ready — dumping stderr:", flush=True)
    try:
        # errors="replace": undecodable bytes in the log must not crash the dump.
        with open(LLAMA_STDERR_LOG, errors="replace") as f:
            for line in f:
                print(f"  {line.rstrip()}", flush=True)
    except FileNotFoundError:
        print("  (stderr log not found — process may not have started)", flush=True)

    # Also log exit code if the process died
    if _llama_server_process and _llama_server_process.returncode is not None:
        print(
            f"llama-server exited with code {_llama_server_process.returncode}",
            flush=True,
        )

    return False
|
|
|
|
|
|
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest file at MANIFEST_PATH.

    Responds with HTTP 500 when load_manifest yields None.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
|
|
|
|
|
|
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running.

    "Running" means a process is tracked and has no exit code yet.
    """
    proc = _llama_server_process
    is_running = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }
|
|
|
|
|
|
# Request body accepted by POST /models/switch.
class SwitchRequest(BaseModel):
    # Compared against each manifest profile's "id" entry by switch_model.
    profile_id: str
|
|
|
|
|
|
@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop the current llama-server, start the requested profile, wait for readiness.

    Responses:
        200 ``{"status": "ready", "active_profile": ...}`` when the profile is
            (or already was) loaded and its server process is alive.
        404 when ``payload.profile_id`` is not present in the manifest.
        500 when the manifest cannot be loaded or llama-server does not become
            ready; in the latter case the failed process is stopped.

    Switches are serialised through ``_switch_lock``; a concurrent request
    waits for the one in progress and then runs.
    """
    global _active_profile

    # _switch_lock is a threading.Lock. A blocking acquire inside a coroutine
    # would stall the event loop while another request holds the lock across
    # its awaits, so the holder could never resume and release it (deadlock).
    # Poll without blocking and yield to the loop between attempts instead.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)

    try:
        # Validate profile_id
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next(
            (p for p in profiles if p.get("id") == payload.profile_id),
            None,
        )
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Already running this profile — but only trust that if the server
        # process is still alive; a crashed server must be restarted rather
        # than reported as ready.
        server_alive = (
            _llama_server_process is not None
            and _llama_server_process.returncode is None
        )
        if _active_profile == payload.profile_id and server_alive:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Start the new model
        _kill_llama_server()
        _active_profile = None
        await _start_llama_server(profile)

        # Poll for readiness
        ready = await _poll_llama_server_ready()
        if ready:
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Not ready: do not leave a half-started server behind, holding the
        # port and memory while no profile is reported as active.
        _kill_llama_server()
        _active_profile = None
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )
    finally:
        _switch_lock.release()
|