diff --git a/deploy/manifest.yaml b/deploy/manifest.yaml
index 7fdbcb2..fa6117a 100644
--- a/deploy/manifest.yaml
+++ b/deploy/manifest.yaml
@@ -6,6 +6,10 @@
 # 1. Edit this file with available GGUFs and desired parameters
 # 2. The sidecar automatically picks up changes
 # 3. Use the Hermes model picker to switch models
+#
+# Hardware: RTX 3090 (24GB VRAM)
+# Profiles added for this GPU enable flash-attn and set n_gpu_layers: 999 (offload all layers)
+# KV cache quantization (q8_0/q4_0) enables 64K+ context within 24GB VRAM
 
 - id: qwen-3-8b
   name: "Qwen 3 8B"
@@ -27,3 +31,269 @@
   flags:
     n_ctx: 8192
     n_gpu_layers: 35
+
+# --- Qwen3.6-27B (Q4_K_M, ~10.5 GB) ---
+# Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
+- id: qwen36-27b-balanced-64k
+  name: "Qwen3.6-27B Balanced 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"  # quoted: bare on/off load as booleans under YAML 1.1 parsers
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-27b-thinking-64k
+  name: "Qwen3.6-27B Thinking 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-27b-extended-128k
+  name: "Qwen3.6-27B Extended 128K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: "on"
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.05
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Gemma 4 12B (Q6_K_XL ~8.5 GB, IQ4_XS ~5 GB) ---
+# Sampling: temp 1.0, top_p 0.95, top_k 64 (Google official)
+- id: gemma4-12b-standard-q6-64k
+  name: "Gemma4 12B Standard Q6 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-12b-extended-q6-128k
+  name: "Gemma4 12B Extended Q6 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-12b-compact-iq4-64k
+  name: "Gemma4 12B Compact IQ4 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"  # NOTE(review): file name says 26B-A4B but the profile is labelled 12B - confirm path
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-12b-compact-long-128k
+  name: "Gemma4 12B Compact IQ4 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"  # NOTE(review): file name says 26B-A4B but the profile is labelled 12B - confirm path
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q8_0  # NOTE(review): the other 128K profiles use q4_0 - confirm q8_0 K/V fits in 24GB at this context
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Gemma 4 26B-A4B (Q4_K_M ~10.5 GB, IQ4_XS ~6 GB) ---
+# MoE, 4B active params. Same sampling as 12B family.
+- id: gemma4-26b-balanced-64k
+  name: "Gemma4 26B Balanced 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"  # quoted: bare on/off load as booleans under YAML 1.1 parsers
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-26b-extended-128k
+  name: "Gemma4 26B Extended 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.15  # NOTE(review): every other Gemma profile uses 1.0 - confirm 1.15 is intentional
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-26b-ultra-long-iq4-128k
+  name: "Gemma4 26B Ultra-Long IQ4 128K"
+  model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-IQ4_XS.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Qwen3.6-35B-A3B (UD-Q4_K_M ~14 GB) ---
+# MoE, 3.6B active params. Non-MTP (faster on single GPU per Unsloth benchmark).
+# Sampling: temp 0.6/1.0, top_p 0.95, top_k 20
+- id: qwen36-35b-fast-64k
+  name: "Qwen3.6-35B Fast 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"  # quoted: bare on/off load as booleans under YAML 1.1 parsers
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-35b-thinking-64k
+  name: "Qwen3.6-35B Thinking 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-35b-extended-128k
+  name: "Qwen3.6-35B Extended 128K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
+  flags:
+    n_ctx: 131072
+    n_gpu_layers: 999
+    cache-type-k: q4_0
+    cache-type-v: q4_0
+    flash-attn: "on"
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+# --- Uncensored variants (reuse the sampling params of their base model family) ---
+- id: qwen36-35b-hauhau-aggressive-64k
+  name: "Qwen3.6-35B HauhauCS Aggressive 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: qwen36-35b-genesis-apex-64k
+  name: "Qwen3.6-35B Genesis APEX 64K"
+  model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-APEX.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 0.6
+    top_p: 0.95
+    top_k: 20
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
+
+- id: gemma4-26b-hauhau-balanced-64k
+  name: "Gemma4 26B HauhauCS Balanced 64K"
+  model_path: "/home/bigt/AI/llm/gemma4/Gemma4-26B-A4B-Uncensored-HauhauCS-Balanced-Q5_K_M.gguf"
+  flags:
+    n_ctx: 65536
+    n_gpu_layers: 999
+    cache-type-k: q8_0
+    cache-type-v: q8_0
+    flash-attn: "on"
+    temp: 1.0
+    top_p: 0.95
+    top_k: 64
+    repeat-penalty: 1.0
+    min_p: 0.0
+    presence-penalty: 0.0
diff --git a/pytest.ini b/pytest.ini
new file mode 100644
index 0000000..4584de7
--- /dev/null
+++ b/pytest.ini
@@ -0,0 +1,3 @@
+[pytest]
+testpaths = tests
+pythonpath = .