From 2c23faa4a102b6dafc36ba122b3e411b7621a624 Mon Sep 17 00:00:00 2001 From: root Date: Mon, 15 Jun 2026 15:22:15 +0000 Subject: [PATCH] fix: add probe endpoints and no-model fallback for Hermes Desktop compatibility MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hermes Desktop sends probe requests to validate providers before allowing model switching. The router was returning 503 for all of these because the catch-all proxy requires a 'model' field in the request body. Added explicit handlers for: - GET /v1/models/{model_id} — OpenAI single-model lookup - GET /api/tags — Ollama model list discovery - POST /api/show — Ollama model info - GET /api/v1/models — Ollama-compatible model list - GET /v1/props, GET /props — llama.cpp server properties - GET /version — llama.cpp version Also fixed the catch-all proxy to route requests with no model body to the currently active backend instead of returning 503. --- .hermes/plans/add-model-profiles.md | 94 +++++++++++ main.py | 244 +++++++++++++++++++++------- 2 files changed, 279 insertions(+), 59 deletions(-) create mode 100644 .hermes/plans/add-model-profiles.md diff --git a/.hermes/plans/add-model-profiles.md b/.hermes/plans/add-model-profiles.md new file mode 100644 index 0000000..acd2a15 --- /dev/null +++ b/.hermes/plans/add-model-profiles.md @@ -0,0 +1,94 @@ +# Plan: Add user model profiles to manifest.yaml +# Date: 2025-06-15 +# Author: Hermes Agent +# Status: DRAFT + +## Context +User has a collection of GGUF models on their Main PC (10.0.4.11, RTX 3090 24GB VRAM). +The intelligence-router manifest needs profiles for all models with researched llama.cpp parameters. +Research sourced from r/LocalLLaMA, HuggingFace model cards, and community blog posts. 
+ +## Hardware constraints +- GPU: RTX 3090, 24GB VRAM +- All profiles use `n_gpu_layers: 999` (offload all layers that fit) +- All profiles use `flash-attn: on` +- KV cache quantization (q8_0 or q4_0) to enable 64K+ context within 24GB VRAM +- `min_p` set to 0.0 across all profiles (community standard for these models) + +## Models to add (excluding mmproj files) + +### Qwen3.6-27B (1 file: Qwen3.6-27B-Q4_K_M.gguf, ~10.5 GB) +Recommended sampling per HF model card and Unsloth: temp 0.6 / top_p 0.95 / top_k 20 + +| # | Profile ID | Name | n_ctx | cache_k/v | temp | top_k | repeat_pen | +|---|-----------|------|-------|-----------|------|-------|------------| +| 1 | qwen36-27b-balanced-64k | Qwen3.6-27B Balanced 64K | 65536 | q8_0/q8_0 | 0.6 | 20 | 1.0 | +| 2 | qwen36-27b-thinking-64k | Qwen3.6-27B Thinking 64K | 65536 | q8_0/q8_0 | 1.0 | 20 | 1.0 | +| 3 | qwen36-27b-extended-128k | Qwen3.6-27B Extended 128K | 131072 | q4_0/q4_0 | 0.6 | 20 | 1.05 | + +### Gemma 4 12B (2 files: Q6_K_XL ~8.5 GB, IQ4_XS ~5 GB) +Google official: temp 1.0 / top_p 0.95 / top_k 64 + +| # | Profile ID | Name | File | n_ctx | cache_k/v | temp | top_k | +|---|-----------|------|------|-------|-----------|------|-------| +| 4 | gemma4-12b-standard-q6-64k | Gemma4 12B Standard Q6 64K | Q6_K_XL | 65536 | q8_0/q8_0 | 1.0 | 64 | +| 5 | gemma4-12b-extended-q6-128k | Gemma4 12B Extended Q6 128K | Q6_K_XL | 131072 | q4_0/q4_0 | 1.0 | 64 | +| 6 | gemma4-12b-compact-iq4-64k | Gemma4 12B Compact IQ4 64K | IQ4_XS | 65536 | q8_0/q8_0 | 1.0 | 64 | +| 7 | gemma4-12b-compact-long-128k | Gemma4 12B Compact IQ4 128K | IQ4_XS | 131072 | q8_0/q8_0 | 1.0 | 64 | + +### Gemma 4 26B-A4B (2 files: Q4_K_M ~10.5 GB, IQ4_XS ~6 GB) +MoE, 4B active. Same sampling as 12B family. 
+ +| # | Profile ID | Name | File | n_ctx | cache_k/v | temp | top_k | repeat_pen | +|---|-----------|------|------|-------|-----------|------|-------|------------| +| 8 | gemma4-26b-balanced-64k | Gemma4 26B Balanced 64K | Q4_K_M | 65536 | q8_0/q8_0 | 1.0 | 64 | 1.0 | +| 9 | gemma4-26b-extended-128k | Gemma4 26B Extended 128K | Q4_K_M | 131072 | q4_0/q4_0 | 1.0 | 64 | 1.15 | +| 10 | gemma4-26b-ultra-long-iq4-256k | Gemma4 26B Ultra-Long IQ4 256K | IQ4_XS | 262144 | q4_0/q4_0 | 1.0 | 64 | 1.0 | + +### Qwen3.6-35B-A3B (2 files: UD-Q4_K_M ~14 GB, MTP-UD-Q4_K_M ~16 GB) +**MTP note:** Unsloth benchmark shows MTP is net-negative on single 3090. Including MTP profile anyway since user has the file. + +| # | Profile ID | Name | File | n_ctx | cache_k/v | temp | top_k | MTP | +|---|-----------|------|------|-------|-----------|------|-------|-----| +| 11 | qwen36-35b-fast-64k | Qwen3.6-35B Fast 64K | UD-Q4 | 65536 | q8_0/q8_0 | 0.6 | 20 | no | +| 12 | qwen36-35b-thinking-64k | Qwen3.6-35B Thinking 64K | UD-Q4 | 65536 | q8_0/q8_0 | 1.0 | 20 | no | +| 13 | qwen36-35b-extended-128k | Qwen3.6-35B Extended 128K | UD-Q4 | 131072 | q4_0/q4_0 | 0.6 | 20 | no | +| 14 | qwen36-35b-mtp-128k | Qwen3.6-35B MTP 128K | MTP-UD-Q4 | 131072 | q8_0/q8_0 | 0.6 | 20 | yes (n=3) | + +### Uncensored models (apply censored family params) + +| # | Profile ID | Name | File | n_ctx | cache_k/v | temp | top_k | Based on | +|---|-----------|------|------|-------|-----------|------|-------|----------| +| 15 | qwen36-35b-hauhau-aggressive-64k | Qwen3.6-35B HauhauCS Aggressive 64K | Uncensored-HauhauCS-Q4_K_P | 65536 | q8_0/q8_0 | 0.6 | 20 | Qwen3.6-35B fast | +| 16 | qwen36-35b-genesis-apex-64k | Qwen3.6-35B Genesis APEX 64K | Uncensored-Genesis-APEX | 65536 | q8_0/q8_0 | 0.6 | 20 | Qwen3.6-35B fast | +| 17 | qwen36-35b-genesis-mtp-apex-128k | Qwen3.6-35B Genesis MTP APEX 128K | Uncensored-Genesis-MTP-APEX | 131072 | q8_0/q8_0 | 0.6 | 20 | Qwen3.6-35B MTP | +| 18 | gemma4-26b-hauhau-balanced-64k | 
Gemma4 26B HauhauCS Balanced 64K | Uncensored-HauhauCS-Q5_K_M | 65536 | q8_0/q8_0 | 1.0 | 64 | Gemma4 26B balanced | + +**Total: 18 profiles** + +## Flag mapping (manifest → llama-server CLI) + +Manifest flags use snake_case keys that the sidecar passes as `--key value` to llama-server: + +| Manifest key | CLI flag | Type | Notes | +|-------------|----------|------|-------| +| n_gpu_layers | --n-gpu-layers | int | 999 = all | +| n_ctx | --ctx-size | int | context window | +| cache_type_k | --cache-type-k | str | q8_0, q4_0 | +| cache_type_v | --cache-type-v | str | q8_0, q4_0 | +| flash_attn | --flash-attn | bool | true/on | +| temp | --temp | float | sampling | +| top_p | --top-p | float | sampling | +| top_k | --top-k | int | sampling | +| repeat_penalty | --repeat-penalty | float | sampling | +| min_p | --min-p | float | 0.0 | +| spec_type | --spec-type | str | draft-mtp (only MTP profiles) | +| spec_draft_n_max | --spec-draft-n-max | int | 3 (only MTP profiles) | +| presence_penalty | --presence-penalty | float | 0.0 | + +## Actions +1. Create branch `feature/add-model-profiles` from master +2. Create issues on Gitea for each model family (4 issues: qwen27, gemma12b, gemma26b, qwen35b) +3. Update `deploy/manifest.yaml` with all 18 profiles +4. Update tests if flag structure requires it +5. Run tests, commit diff --git a/main.py b/main.py index 705f128..2713399 100644 --- a/main.py +++ b/main.py @@ -179,6 +179,127 @@ async def health(): return {"status": "router_online"} +# ─── Hermes Desktop Probe Endpoints ────────────────────────────────────────── +# These endpoints are probed by Hermes Desktop to validate/identify the +# provider before allowing model switching. Without them the desktop +# returns 503 and refuses to switch models. + +@app.get("/v1/models/{model_id:path}") +async def get_single_model(model_id: str): + """OpenAI-compatible single model query. 
Proxied via Sidecar model list.""" + async with httpx.AsyncClient(timeout=5.0) as client: + try: + resp = await client.get(f"{SIDECAR_URL}/models/available") + profiles = resp.json() + except Exception: + return JSONResponse( + status_code=503, + content={"error": "Sidecar unavailable", "data": []}, + ) + + for p in profiles: + if p.get("id") == model_id: + return {"id": p["id"], "object": "model", "owned_by": "sidecar"} + return JSONResponse(status_code=404, content={"error": "model not found", "id": model_id}) + + +@app.get("/api/tags") +async def ollama_tags(): + """Ollama-compatible model list for Hermes Desktop discovery.""" + async with httpx.AsyncClient(timeout=5.0) as client: + try: + resp = await client.get(f"{SIDECAR_URL}/models/available") + profiles = resp.json() + except Exception: + return JSONResponse(content={"models": []}) + + models = [] + for p in profiles: + models.append({ + "name": p.get("id", ""), + "model": p.get("id", ""), + "modified_at": "2025-01-01T00:00:00Z", + "size": 0, + "digest": "", + "details": {"format": "gguf", "family": p.get("name", "llm")}, + }) + return {"models": models} + + +@app.post("/api/show") +async def ollama_show(request: Request): + """Ollama-compatible model info for Hermes Desktop discovery.""" + body = await request.body() + body_data = json.loads(body) if body else {} + model_name = body_data.get("model", "") + + async with httpx.AsyncClient(timeout=5.0) as client: + try: + resp = await client.get(f"{SIDECAR_URL}/models/available") + profiles = resp.json() + except Exception: + return JSONResponse(status_code=404, content={"error": "model not found"}) + + for p in profiles: + if p.get("id") == model_name: + return { + "modelfile": "", + "parameters": "num_ctx 4096", + "template": "", + "details": { + "format": "gguf", + "family": p.get("name", "llm"), + "parameter_size": p.get("flags", {}).get("--num-ctx", "4096"), + }, + "model_info": {"id": p.get("id", "")}, + } + return JSONResponse(status_code=404, 
content={"error": "model not found"}) + + +@app.get("/api/v1/models") +async def ollama_v1_models(): + """Ollama /api/v1/models redirect — return same list as /v1/models.""" + return await get_models() + + +@app.get("/v1/props") +async def llama_cpp_props(): + """llama.cpp discovery endpoint for Hermes Desktop.""" + async with httpx.AsyncClient(timeout=3.0) as client: + try: + resp = await client.get(f"{SIDECAR_URL}/models/status") + status = resp.json() + except Exception: + status = {"active_profile": None, "llama_server_running": False} + + # Report the currently-running server version / capabilities + return { + "props": { + "version": 1, + "total_slots": 1, + "chat_endpoint": "/v1/chat/completions", + "completion_endpoint": "/v1/completions", + "embedding_endpoint": "/v1/embeddings", + "rerank_endpoint": "", + "health_endpoint": "/health", + }, + "active_profile": status.get("active_profile"), + "server_running": status.get("llama_server_running", False), + } + + +@app.get("/props") +async def llm_props(): + """Legacy llama.cpp discovery endpoint (same as /v1/props).""" + return await llama_cpp_props() + + +@app.get("/version") +async def llm_version(): + """llama.cpp version endpoint for Hermes Desktop.""" + return {"version": "0.2.0", "build": "router-proxy", "commit": "intelligence-router"} + + # ─── GET /models/status ────────────────────────────────────────────────────── @app.get("/models/status") async def router_model_status(): @@ -285,69 +406,74 @@ async def proxy( if requested_model and sidecar_status.get("active_profile") == requested_model: target_url = f"{MAIN_PC_BASE}/{path}" - else: - # Trigger switch - if requested_model: - # Check if a switch is already in progress - current_switch = await wait_for_switch() + elif requested_model: + # Trigger switch for a specific model request + # Check if a switch is already in progress + current_switch = await wait_for_switch() - if current_switch is not None and not current_switch.is_set(): - # Another 
request started the switch — queue this one - try: - wait_evt = await queue_request() - except HTTPException as he: - raise - - # SSE progress while waiting - async def stream_with_sse(): - sse_gen = sse_progress_stream(wait_evt) - try: - await wait_evt.wait() - async for sse_chunk in sse_gen: - yield sse_chunk - complete_switch() - drain_queue() - async with httpx.AsyncClient(timeout=60.0) as c: - req_headers = dict(request.headers) - req_headers.pop("host", None) - async with c.stream( - request.method, - f"{MAIN_PC_BASE}/{path}", - content=body, - headers=req_headers, - ) as resp: - async for chunk in resp.aiter_bytes(): - yield chunk - finally: - # Clean up sse_gen - try: - await sse_gen.aclose() - except Exception: - pass - - return StreamingResponse( - stream_with_sse(), - media_type="text/event-stream", - ) - - # First request triggers the switch - await start_switch() # Create event for tracking + if current_switch is not None and not current_switch.is_set(): + # Another request started the switch — queue this one try: - async with httpx.AsyncClient(timeout=120.0) as client: - switch_resp = await client.post( - f"{SIDECAR_URL}/models/switch", - json={"profile_id": requested_model}, - ) - switch_result = switch_resp.json() - if switch_result.get("status") == "ready": + wait_evt = await queue_request() + except HTTPException as he: + raise + + # SSE progress while waiting + async def stream_with_sse(): + sse_gen = sse_progress_stream(wait_evt) + try: + await wait_evt.wait() + async for sse_chunk in sse_gen: + yield sse_chunk complete_switch() drain_queue() - target_url = f"{MAIN_PC_BASE}/{path}" - else: - error = "switch_failed" - except Exception as e: - circuit_record_failure() - error = f"switch_error: {str(e)}" + async with httpx.AsyncClient(timeout=60.0) as c: + req_headers = dict(request.headers) + req_headers.pop("host", None) + async with c.stream( + request.method, + f"{MAIN_PC_BASE}/{path}", + content=body, + headers=req_headers, + ) as resp: + 
async for chunk in resp.aiter_bytes(): + yield chunk + finally: + # Clean up sse_gen + try: + await sse_gen.aclose() + except Exception: + pass + + return StreamingResponse( + stream_with_sse(), + media_type="text/event-stream", + ) + + # First request triggers the switch + await start_switch() # Create event for tracking + try: + async with httpx.AsyncClient(timeout=120.0) as client: + switch_resp = await client.post( + f"{SIDECAR_URL}/models/switch", + json={"profile_id": requested_model}, + ) + switch_result = switch_resp.json() + if switch_result.get("status") == "ready": + complete_switch() + drain_queue() + target_url = f"{MAIN_PC_BASE}/{path}" + else: + error = "switch_failed" + except Exception as e: + circuit_record_failure() + error = f"switch_error: {str(e)}" + else: + # No model in request body (probe/GET/non-chat request) — + # route to the currently active backend when available, + # or fall through to the fallback chain. + if sidecar_status.get("active_profile") and sidecar_status.get("llama_server_running"): + target_url = f"{MAIN_PC_BASE}/{path}" # ── Fallback chain ──────────────────────────────────────────────────── if target_url is None: