2026-06-15 04:13:36 +03:00
|
|
|
# LLM Model Manifest
|
|
|
|
|
# Each profile defines a named model configuration for llama-server.
|
|
|
|
|
# The sidecar reads this file on every request — no restart needed.
|
|
|
|
|
#
|
|
|
|
|
# Usage:
|
|
|
|
|
# 1. Edit this file with available GGUFs and desired parameters
|
|
|
|
|
# 2. The sidecar automatically picks up changes
|
|
|
|
|
# 3. Use the Hermes model picker to switch models
|
2026-06-15 15:34:46 +03:00
|
|
|
#
|
|
|
|
|
# Hardware: RTX 3090 (24GB VRAM)
|
|
|
|
|
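# All profiles use flash-attn: on and n_gpu_layers: 999 (offload all layers to the GPU)
# NOTE(review): unquoted `on` is parsed as boolean true by YAML 1.1 loaders — confirm the sidecar passes the intended value to llama-server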
|
|
|
|
|
# KV cache quantization (q8_0/q4_0) enables 64K+ context within 24GB VRAM
|
2026-06-15 04:13:36 +03:00
|
|
|
|
2026-06-15 15:34:46 +03:00
|
|
|
# --- Qwen3.6-27B (Q4_K_M, ~10.5 GB) ---
|
|
|
|
|
# Sampling: temp 0.6 (balanced/extended) or 1.0 (thinking), top_p 0.95, top_k 20
|
|
|
|
|
- id: qwen36-27b-balanced-64k
|
|
|
|
|
name: "Qwen3.6-27B Balanced 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-27b-thinking-64k
|
|
|
|
|
name: "Qwen3.6-27B Thinking 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-27b-extended-128k
|
|
|
|
|
name: "Qwen3.6-27B Extended 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-27B-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.05
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
# --- Gemma 4 12B (Q6_K_XL ~8.5 GB) ---
|
2026-06-15 15:34:46 +03:00
|
|
|
# Sampling: temp 1.0, top_p 0.95, top_k 64 (Google official)
|
|
|
|
|
- id: gemma4-12b-standard-q6-64k
|
|
|
|
|
name: "Gemma4 12B Standard Q6 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: gemma4-12b-extended-q6-128k
|
|
|
|
|
name: "Gemma4 12B Extended Q6 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-12b-it-UD-Q6_K_XL.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
# --- Gemma 4 26B-A4B (Q4_K_M ~10.5 GB, IQ4_XS ~6 GB, plus a Q5_K_M profile) ---
|
|
|
|
|
# MoE, 4B active params. Same sampling as the Gemma 4 12B profiles (temp 1.0, top_p 0.95, top_k 64).
|
|
|
|
|
- id: gemma4-26b-balanced-64k
|
|
|
|
|
name: "Gemma4 26B Balanced 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
|
2026-06-15 15:34:46 +03:00
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
- id: gemma4-26b-extended-128k
|
|
|
|
|
name: "Gemma4 26B Extended 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-Q4_K_M.gguf"
|
2026-06-15 15:34:46 +03:00
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
2026-06-15 19:38:17 +03:00
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.15
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: gemma4-26b-ultra-long-iq4-128k
|
|
|
|
|
name: "Gemma4 26B Ultra-Long IQ4 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/gemma-4-26B-A4B-it-UD-IQ4_XS.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
2026-06-15 15:34:46 +03:00
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
- id: gemma4-26b-q5-64k
|
|
|
|
|
name: "Gemma4 26B Q5 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/google_gemma-4-26B-A4B-it-Q5_K_M.gguf"
|
2026-06-15 15:34:46 +03:00
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
# --- Gemma 4 26B Compact (IQ4_XS ~6 GB) ---
|
|
|
|
|
- id: gemma4-26b-compact-iq4-64k
|
|
|
|
|
name: "Gemma4 26B Compact IQ4 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
|
2026-06-15 15:34:46 +03:00
|
|
|
flags:
|
2026-06-15 19:38:17 +03:00
|
|
|
n_ctx: 65536
|
2026-06-15 15:34:46 +03:00
|
|
|
n_gpu_layers: 999
|
2026-06-15 19:38:17 +03:00
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
2026-06-15 15:34:46 +03:00
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
2026-06-15 19:38:17 +03:00
|
|
|
repeat-penalty: 1.0
|
2026-06-15 15:34:46 +03:00
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
- id: gemma4-26b-compact-long-128k
|
|
|
|
|
name: "Gemma4 26B Compact IQ4 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/bartowski-google_gemma-4-26B-A4B-it-IQ4_XS.gguf"
|
2026-06-15 15:34:46 +03:00
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
2026-06-15 19:38:17 +03:00
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
2026-06-15 15:34:46 +03:00
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
# --- Qwen3.6-35B-A3B (UD-Q4_K_M ~14 GB) ---
|
|
|
|
|
# MoE, 3.6B active params. Non-MTP (faster on single GPU per Unsloth benchmark).
|
|
|
|
|
# Sampling: temp 0.6 (fast/extended) or 1.0 (thinking), top_p 0.95, top_k 20
|
|
|
|
|
- id: qwen36-35b-fast-64k
|
|
|
|
|
name: "Qwen3.6-35B Fast 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-35b-thinking-64k
|
|
|
|
|
name: "Qwen3.6-35B Thinking 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-35b-extended-128k
|
|
|
|
|
name: "Qwen3.6-35B Extended 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
# --- Qwen3.6-35B-A3B MTP variant ---
|
|
|
|
|
- id: qwen36-35b-mtp-fast-64k
|
|
|
|
|
name: "Qwen3.6-35B MTP Fast 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-35b-mtp-extended-128k
|
|
|
|
|
name: "Qwen3.6-35B MTP Extended 128K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-MTP-UD-Q4_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 131072
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q4_0
|
|
|
|
|
cache-type-v: q4_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
# --- Uncensored models ---
|
2026-06-15 15:34:46 +03:00
|
|
|
- id: qwen36-35b-hauhau-aggressive-64k
|
|
|
|
|
name: "Qwen3.6-35B HauhauCS Aggressive 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-HauhauCS-Aggressive-Q4_K_P.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
|
|
|
|
- id: qwen36-35b-genesis-apex-64k
|
|
|
|
|
name: "Qwen3.6-35B Genesis APEX 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-APEX.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 19:38:17 +03:00
|
|
|
- id: qwen36-35b-genesis-mtp-apex-64k
|
|
|
|
|
name: "Qwen3.6-35B Genesis MTP APEX 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/qwen3.6/Qwen3.6-35B-A3B-Uncensored-Genesis-MTP-APEX.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 0.6
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 20
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
|
|
|
|
presence-penalty: 0.0
|
|
|
|
|
|
2026-06-15 15:34:46 +03:00
|
|
|
- id: gemma4-26b-hauhau-balanced-64k
|
|
|
|
|
name: "Gemma4 26B HauhauCS Balanced 64K"
|
|
|
|
|
model_path: "/home/bigt/AI/llm/gemma4/Gemma4-26B-A4B-Uncensored-HauhauCS-Balanced-Q5_K_M.gguf"
|
|
|
|
|
flags:
|
|
|
|
|
n_ctx: 65536
|
|
|
|
|
n_gpu_layers: 999
|
|
|
|
|
cache-type-k: q8_0
|
|
|
|
|
cache-type-v: q8_0
|
|
|
|
|
flash-attn: on
|
|
|
|
|
temp: 1.0
|
|
|
|
|
top_p: 0.95
|
|
|
|
|
top_k: 64
|
|
|
|
|
repeat-penalty: 1.0
|
|
|
|
|
min_p: 0.0
|
2026-06-15 19:38:17 +03:00
|
|
|
presence-penalty: 0.0
|