174 lines
5.3 KiB
Python
174 lines
5.3 KiB
Python
|
|
"""Sidecar FastAPI service — Issue #2 foundation.
|
||
|
|
|
||
|
|
Runs on the Main PC, manages llama-server subprocess, serves manifest/profile data.
|
||
|
|
"""
|
||
|
|
import os
|
||
|
|
import asyncio
|
||
|
|
import signal as signal_module
|
||
|
|
import threading
|
||
|
|
from contextlib import asynccontextmanager
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
from fastapi import FastAPI, HTTPException
|
||
|
|
from fastapi.responses import JSONResponse
|
||
|
|
from pydantic import BaseModel
|
||
|
|
|
||
|
|
from sidecar.manifest import load_manifest
|
||
|
|
|
||
|
|
# Configuration from environment. Every value has a default so the sidecar
# starts with no environment set up.
# Path of the manifest YAML handed to load_manifest().
MANIFEST_PATH = os.getenv("MANIFEST_PATH", "/home/bigt/AI/llm/manifest.yaml")
# Port reported in the startup log line for the sidecar itself.
SIDECAR_PORT = int(os.getenv("SIDECAR_PORT", "8081"))
# Port passed to llama-server via --port and polled for readiness.
# Overridable like the other settings; the default keeps the previous
# hard-coded value.
LLAMA_SERVER_PORT = int(os.getenv("LLAMA_SERVER_PORT", "8080"))
|
||
|
|
|
||
|
|
# Global state
# Handle of the llama-server child process: assigned by _start_llama_server()
# and reset to None by _kill_llama_server().
_llama_server_process: Optional[asyncio.subprocess.Process] = None
# Profile id that switch_model last brought to the ready state; None while no
# profile is active or a switch is in progress.
_active_profile: Optional[str] = None
# Serialises switch_model so only one profile switch runs at a time.
_switch_lock = threading.Lock()  # Use threading.Lock for compatibility with TestClient
|
||
|
|
|
||
|
|
|
||
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Manage sidecar lifecycle — no default model loaded.

    Startup only logs the configuration. Shutdown stops llama-server if one
    is still tracked in the module-level process handle.
    """
    print(f"Sidecar starting, manifest={MANIFEST_PATH}, port={SIDECAR_PORT}")
    try:
        yield
    finally:
        # Run the cleanup even when an exception or cancellation is thrown in
        # at the yield point; otherwise the child process would be orphaned.
        if _llama_server_process:
            _kill_llama_server()
|
||
|
|
|
||
|
|
|
||
|
|
# FastAPI application object; the route decorators below register on it and
# `lifespan` is passed in as its lifespan handler.
app = FastAPI(lifespan=lifespan)
|
||
|
|
|
||
|
|
|
||
|
|
# Strong references to in-flight reaper tasks. The event loop only keeps weak
# references to tasks, so an otherwise unreferenced task could be collected
# before it finishes.
_llama_reaper_tasks = set()


async def _reap_llama_server(proc, grace: float = 5.0):
    """Wait up to *grace* seconds for *proc* to exit, then SIGKILL it."""
    try:
        await asyncio.wait_for(proc.wait(), timeout=grace)
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            return  # exited between the timeout and the kill
        await proc.wait()


def _kill_llama_server():
    """Stop the tracked llama-server subprocess and forget its handle.

    Sends SIGTERM to the process if it is still running. The handle is an
    ``asyncio.subprocess.Process``, whose ``wait()`` is a coroutine without a
    ``timeout`` parameter, so the grace period cannot be awaited from this
    synchronous function. Instead, when an event loop is running, a
    background task waits up to 5 seconds and escalates to SIGKILL. Without
    a running loop only the SIGTERM is delivered.

    The module-level handle is always reset to ``None``; the function never
    raises for a process that has already gone away.
    """
    global _llama_server_process

    proc = _llama_server_process
    _llama_server_process = None
    if proc is None or proc.returncode is not None:
        return

    try:
        proc.send_signal(signal_module.SIGTERM)
    except ProcessLookupError:
        return  # already exited; nothing left to stop
    except OSError as exc:
        # Best effort: report the failure but keep shutdown paths alive.
        print(f"Failed to signal llama-server: {exc}")
        return

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        return  # no event loop to wait on; SIGTERM is all we can do here

    task = loop.create_task(_reap_llama_server(proc))
    _llama_reaper_tasks.add(task)
    task.add_done_callback(_llama_reaper_tasks.discard)
|
||
|
|
|
||
|
|
|
||
|
|
async def _start_llama_server(profile: dict):
    """Launch llama-server for *profile* and return the process handle.

    Any tracked server is stopped first. The command line is built from
    ``profile["model_path"]``, the configured llama-server port, and one
    ``--<key> <value>`` pair per entry of the optional ``profile["flags"]``
    mapping. The child's stdout and stderr are discarded.
    """
    global _llama_server_process

    # Only one llama-server is tracked at a time.
    _kill_llama_server()

    argv = [
        "llama-server",
        "--model",
        profile["model_path"],
        "--port",
        str(LLAMA_SERVER_PORT),
    ]
    for flag_name, flag_value in profile.get("flags", {}).items():
        argv.extend(("--" + flag_name, str(flag_value)))

    print(f"Starting llama-server: {' '.join(argv)}")
    devnull = asyncio.subprocess.DEVNULL
    _llama_server_process = await asyncio.create_subprocess_exec(
        *argv, stdout=devnull, stderr=devnull
    )
    return _llama_server_process
|
||
|
|
|
||
|
|
|
||
|
|
async def _poll_llama_server_ready(max_retries: int = 240, interval: float = 0.5):
    """Poll llama-server readiness via /v1/models endpoint.

    Args:
        max_retries: Maximum number of probe attempts.
        interval: Seconds to sleep between attempts.

    Returns:
        True as soon as a probe answers with HTTP 200. False when the retry
        budget is exhausted, or when the tracked llama-server process has
        already exited and so cannot become ready.
    """
    import httpx

    url = f"http://localhost:{LLAMA_SERVER_PORT}/v1/models"
    # One client for the whole polling session instead of one per attempt.
    async with httpx.AsyncClient(timeout=2.0) as client:
        for _ in range(max_retries):
            try:
                resp = await client.get(url)
            except httpx.HTTPError:
                # Connection refused or timed out: server not listening yet.
                pass
            else:
                if resp.status_code == 200:
                    return True

            # Stop early if the child died (e.g. bad model path) rather than
            # burning the whole retry budget on a dead process.
            proc = _llama_server_process
            if proc is not None and proc.returncode is not None:
                return False

            await asyncio.sleep(interval)
    return False
|
||
|
|
|
||
|
|
|
||
|
|
@app.get("/models/available")
async def get_available_models():
    """Return the profiles loaded from the manifest file.

    Raises:
        HTTPException: 500 when load_manifest() returns None.
    """
    manifest_profiles = load_manifest(MANIFEST_PATH)
    if manifest_profiles is not None:
        return manifest_profiles
    raise HTTPException(status_code=500, detail="Failed to parse manifest YAML")
|
||
|
|
|
||
|
|
|
||
|
|
@app.get("/models/status")
async def get_models_status():
    """Report the active profile id and whether llama-server is running.

    The server counts as running when a process handle is tracked and its
    return code is still None.
    """
    proc = _llama_server_process
    is_running = proc is not None and proc.returncode is None
    return {
        "active_profile": _active_profile,
        "llama_server_running": is_running,
    }
|
||
|
|
|
||
|
|
|
||
|
|
# Request body accepted by POST /models/switch.
class SwitchRequest(BaseModel):
    # Compared against the "id" entry of each manifest profile in switch_model.
    profile_id: str
|
||
|
|
|
||
|
|
|
||
|
|
@app.post("/models/switch")
async def switch_model(payload: SwitchRequest):
    """Stop current llama-server, start new one with the given profile, wait for readiness.

    Responses:
        200 ``{"status": "ready", "active_profile": ...}`` when the requested
            profile is (or already was) serving.
        404 when ``payload.profile_id`` is not present in the manifest.
        500 when the manifest cannot be loaded or llama-server never becomes
            ready.

    Switches are serialised through ``_switch_lock``.
    """
    global _active_profile

    # The lock is a threading.Lock, and a blocking acquire() here would freeze
    # the event-loop thread while another request holds it across an await —
    # a deadlock. Poll without blocking and yield to the loop in between.
    while not _switch_lock.acquire(blocking=False):
        await asyncio.sleep(0.05)
    try:
        # Validate profile_id against the manifest.
        profiles = load_manifest(MANIFEST_PATH)
        if profiles is None:
            return JSONResponse(
                status_code=500,
                content={"status": "error", "message": "Failed to load manifest"},
            )

        profile = next(
            (p for p in profiles if p["id"] == payload.profile_id), None
        )
        if profile is None:
            return JSONResponse(
                status_code=404,
                content={"status": "error", "message": f"Profile '{payload.profile_id}' not found"},
            )

        # Short-circuit only if the requested profile is active AND its
        # process is still alive; a crashed server must be restarted rather
        # than reported as ready.
        server_alive = (
            _llama_server_process is not None
            and _llama_server_process.returncode is None
        )
        if _active_profile == payload.profile_id and server_alive:
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        # Start the new model; no profile is active while the switch runs.
        _kill_llama_server()
        _active_profile = None
        await _start_llama_server(profile)

        # Poll for readiness.
        if await _poll_llama_server_ready():
            _active_profile = payload.profile_id
            return {
                "status": "ready",
                "active_profile": _active_profile,
            }

        _active_profile = None
        return JSONResponse(
            status_code=500,
            content={"status": "error", "message": "llama-server failed to become ready"},
        )
    finally:
        _switch_lock.release()
|