diff --git a/deploy.py b/deploy.py
new file mode 100644
index 0000000..a61856d
--- /dev/null
+++ b/deploy.py
@@ -0,0 +1,93 @@
+"""Upload update.tar.gz to the mission-control host and restart the service.
+
+No credential is stored in this file: the SSH/sudo password is read from the
+DEPLOY_PASSWORD environment variable, or prompted for when it is unset.
+DEPLOY_HOST overrides the default target address.
+
+SECURITY: an earlier revision of this script embedded the account password in
+source. Treat that password as compromised and rotate it.
+"""
+import getpass
+import os
+import sys
+
+import paramiko
+
+HOST = os.environ.get("DEPLOY_HOST", "192.168.178.153")
+USER = "hitonabi"
+
+ARCHIVE = "update.tar.gz"
+REMOTE_ARCHIVE = "/home/hitonabi/update.tar.gz"
+SUDOERS_FILE = "/etc/sudoers.d/mission-control"
+# NOTE(review): NOPASSWD on journalctl without arguments may allow a root shell
+# through its pager — consider narrowing this rule.
+SUDOERS_RULE = (
+    "hitonabi ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart mission-control, "
+    "/usr/bin/systemctl restart llama-swap, /usr/bin/journalctl"
+)
+
+# (command, needs_sudo). Privileged commands run through `sudo -S -p ''` and get
+# the password on stdin, so it never shows up in a command line, in the remote
+# process list or in this script's output.
+COMMANDS = [
+    # Unpack the update
+    (f"cd /home/hitonabi/mission-control && tar -xzf {REMOTE_ARCHIVE}", False),
+    # Set up sudoers for passwordless service restarts
+    (f"sudo -S -p '' bash -c 'echo \"{SUDOERS_RULE}\" > {SUDOERS_FILE}'", True),
+    (f"sudo -S -p '' chmod 440 {SUDOERS_FILE}", True),
+    # Restart the service
+    ("sudo -S -p '' systemctl restart mission-control", True),
+]
+
+
+def run_remote(ssh, command, password=None):
+    """Run ``command`` over ``ssh`` and return (exit_status, stdout, stderr).
+
+    When ``password`` is given it is written to the command's stdin for sudo -S.
+    """
+    stdin, stdout, stderr = ssh.exec_command(command)
+    if password is not None:
+        stdin.write(password + "\n")
+        stdin.flush()
+    stdin.channel.shutdown_write()
+    # Read the output before asking for the exit status; paramiko documents
+    # that waiting on the status first can hang on large output.
+    out = stdout.read().decode()
+    err = stderr.read().decode()
+    return stdout.channel.recv_exit_status(), out, err
+
+
+def main():
+    """Deploy the archive; return a process exit code (0 on success)."""
+    password = os.environ.get("DEPLOY_PASSWORD") or getpass.getpass(f"Password for {USER}@{HOST}: ")
+
+    print("Connecting to server...")
+    with paramiko.SSHClient() as ssh:
+        # Connect only to hosts already listed in known_hosts (add the server
+        # once with a manual ssh). AutoAddPolicy would trust any host key and
+        # let a man-in-the-middle capture the password.
+        ssh.load_system_host_keys()
+        ssh.set_missing_host_key_policy(paramiko.RejectPolicy())
+        ssh.connect(HOST, username=USER, password=password)
+
+        print(f"Uploading {ARCHIVE}...")
+        with ssh.open_sftp() as sftp:
+            sftp.put(ARCHIVE, REMOTE_ARCHIVE)
+
+        for command, needs_sudo in COMMANDS:
+            print(f"Executing: {command}")
+            status, out, err = run_remote(ssh, command, password if needs_sudo else None)
+            print("STDOUT:", out)
+            print("STDERR:", err)
+            if status != 0:
+                # Stop at the first failure instead of restarting the service
+                # on top of a half-applied update.
+                print(f"Error executing {command}")
+                return 1
+
+    print("Deployment complete!")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/deploy_bosgame.sh b/deploy_bosgame.sh
new file mode 100644
index 0000000..e2a26c4
--- /dev/null
+++ b/deploy_bosgame.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Pull the latest mission-control code, sync it to /opt and restart the service.
+#
+# sudo prompts for the password when it needs one; no credential is stored here.
+# SECURITY: an earlier revision embedded the sudo password in this file. Treat
+# that password as compromised and rotate it.
+set -euo pipefail
+
+SUDOERS_FILE=/etc/sudoers.d/mission-control
+SUDOERS_RULE='hitonabi ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart mission-control, /usr/bin/systemctl restart llama-swap, /usr/bin/journalctl'
+
+echo "Pulling latest code..."
+cd ~/mission-control
+git fetch origin
+# The checkout is forced to origin/main; a following `git pull` would add
+# nothing on main and could merge a different upstream on any other branch.
+git reset --hard origin/main
+
+echo "Deploying to /opt/mission-control..."
+rsync -a --exclude='.git' --exclude='.venv' --exclude='__pycache__' --exclude='*.pyc' ~/mission-control/ /opt/mission-control/
+
+echo "Configuring sudoers..."
+# Validate the rule in a temp file before installing it: a syntactically
+# broken file under /etc/sudoers.d can make sudo unusable.
+tmp_rule=$(mktemp)
+trap 'rm -f "$tmp_rule"' EXIT
+printf '%s\n' "$SUDOERS_RULE" > "$tmp_rule"
+sudo visudo -cf "$tmp_rule"
+sudo install -m 0440 -o root -g root "$tmp_rule" "$SUDOERS_FILE"
+
+echo "Restarting service..."
+sudo systemctl restart mission-control
+
+echo "Deployment complete."
diff --git a/fit.py b/fit.py
new file mode 100644
index 0000000..a5a49a7
--- /dev/null
+++ b/fit.py
@@ -0,0 +1,778 @@
+import re
+
+from services.hwfit.models import (
+    params_b, estimate_memory_gb, infer_use_case,
+    get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM,
+    QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY,
+)
+
+GPU_BANDWIDTH = {  # lowercase GPU-name fragment -> memory bandwidth (presumably GB/s, like the Apple tables — confirm)
+    "5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256,
+    "4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272,
+    "3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360,
+    "2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336,
+    "1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128,
+    "h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555,
+    "l40s": 864, "l40": 864, "l4": 300, "a10g": 600, "a10": 600, "t4": 320,
+    "v100 sxm": 900, "v100": 897, "a6000": 768, "a5000": 768, "a4000": 448,
+    "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288,
+    "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224,
+    "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229,
+    "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322,
+    # NVIDIA GB10 Grace-Blackwell superchip (DGX Spark). Unified LPDDR5X memory,
+    # not Apple Silicon, so it lives in the generic GPU table — the Apple-only
+    # lookup never matches it (its name carries no "apple").
+    "gb10": 273,
+}
+
+# Longest keys first, so a substring scan reaches "4070 ti super" before "4070 ti" or "4070".
+_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True)
+
+# Apple Silicon unified-memory bandwidth (GB/s). For chip families with both
+# binned and full variants under the same "Apple Mx Max" brand string, prefer
+# GPU core count when hardware detection provides it; otherwise fall back to the
+# conservative tier so speed estimates do not over-promise.
+APPLE_BANDWIDTH_FIXED = {  # lowercase chip fragment -> bandwidth, for chips listed with a single tier
+    "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68,
+    "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100,
+    "m3 ultra": 800, "m3 pro": 150, "m3": 100,
+    "m4 pro": 273, "m4": 120,
+    "m5 pro": 307, "m5": 153,
+}
+APPLE_BANDWIDTH_BY_CORES = {  # lowercase chip fragment -> {GPU core count: bandwidth}
+    "m3 max": {30: 300, 40: 400},
+    "m4 max": {32: 410, 40: 546},
+    "m5 max": {32: 460, 40: 614},
+}
+_APPLE_FIXED_KEYS_SORTED = sorted(APPLE_BANDWIDTH_FIXED.keys(), key=len, reverse=True)  # longest first: "m1 ultra" before "m1"
+_APPLE_VARIANT_KEYS_SORTED = sorted(APPLE_BANDWIDTH_BY_CORES.keys(), key=len, reverse=True)  # same ordering rule
+
+# Per-backend constant k for the no-bandwidth fallback in _estimate_speed (k / active params * quant speed multiplier).
+# "metal" is a backstop for Apple Silicon chips missing from the explicit tables above (e.g. a future M6): a conservative generic estimate.
+FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90}
+
+USE_CASE_WEIGHTS = {  # use case -> (quality, speed, fit, context) weights of the composite score in analyze_model
+    "general":    (0.45, 0.30, 0.15, 0.10),
+    "coding":     (0.50, 0.20, 0.15, 0.15),
+    "reasoning":  (0.55, 0.15, 0.15, 0.15),
+    "chat":       (0.40, 0.35, 0.15, 0.10),
+    "multimodal": (0.50, 0.20, 0.15, 0.15),
+    "embedding":  (0.30, 0.40, 0.20, 0.10),
+    "tts":        (0.40, 0.35, 0.15, 0.10),
+    "stt":        (0.40, 0.35, 0.15, 0.10),
+}
+
+SPEED_TARGET = {  # use case -> tok/s at which _speed_score reaches 100
+    "general": 40, "coding": 40, "multimodal": 40, "chat": 40,
+    "reasoning": 25, "embedding": 200, "tts": 40, "stt": 40,
+}
+
+CONTEXT_TARGET = {  # use case -> context size that earns the full _context_score
+    "general": 4096, "chat": 4096, "coding": 8192,
+    "reasoning": 8192, "multimodal": 4096, "embedding": 512,
+    "tts": 2048, "stt": 2048,
+}
+
+
+def _lookup_apple_bandwidth(system):
+    """Return Apple Silicon memory bandwidth for ``system``, or None if no Apple chip matches.
+
+    ``system`` is a dict; ``gpu_name`` selects the chip and an int-convertible
+    ``gpu_cores`` picks the tier for chips listed with several GPU-core counts.
+    An unknown or missing core count falls back to that chip's lowest tier.
+    """
+    label = system.get("gpu_name")
+    # The "apple" check keeps non-Apple names that merely contain "m3"/"m4"/"m5"
+    # (e.g. NVIDIA Quadro M4 000) from matching a chip key below.
+    if not isinstance(label, str) or not label or "apple" not in label.lower():
+        return None
+    label = label.lower()
+
+    cores = system.get("gpu_cores")
+    if cores is not None:
+        try:
+            cores = int(cores)
+        except (TypeError, ValueError):
+            cores = None
+
+    variant = next((k for k in _APPLE_VARIANT_KEYS_SORTED if k in label), None)
+    if variant is not None:
+        tiers = APPLE_BANDWIDTH_BY_CORES[variant]
+        return tiers[cores] if cores in tiers else min(tiers.values())
+
+    fixed = next((k for k in _APPLE_FIXED_KEYS_SORTED if k in label), None)
+    return None if fixed is None else APPLE_BANDWIDTH_FIXED[fixed]
+
+
+def _lookup_bandwidth(system):
+    """Return memory bandwidth for a system dict or a bare GPU-name string.
+
+    Returns None when the name is missing, not a string, or matches no table.
+    """
+    is_mapping = isinstance(system, dict)
+    name = system.get("gpu_name") if is_mapping else system
+    if not (isinstance(name, str) and name):
+        return None
+
+    # Apple tiers live only in the Apple-specific table now (#2564), so both
+    # dict and bare-string callers are routed through it first. A bare string
+    # carries no gpu_cores, so the helper answers with the conservative
+    # (lowest) tier for that chip; without this routing,
+    # _lookup_bandwidth("Apple M3 Max") returned None.
+    apple_bw = _lookup_apple_bandwidth(system if is_mapping else {"gpu_name": name})
+    if apple_bw is not None:
+        return apple_bw
+
+    # Generic table: keys are pre-sorted longest-first, so the first substring
+    # hit is the most specific one.
+    lowered = name.lower()
+    hit = next((key for key in _BW_KEYS_SORTED if key in lowered), None)
+    return GPU_BANDWIDTH[hit] if hit is not None else None
+
+
+def _canonical_cpu_backend(system):
+    """Map a detected system to "cpu_x86" or "cpu_arm" for cpu_only estimates.
+
+    GPU backends (CUDA/ROCm/Metal) are translated to the CPU family they run
+    on so a CPU-only run never picks up a discrete-GPU fallback constant.
+    Signals are checked in priority order: ``backend``, then ``cpu_arch``, then
+    Apple hints in ``backend``/``cpu_name``/``gpu_name``; the default is x86.
+    """
+    x86_aliases = {"x86_64", "amd64", "i386", "i686"}
+    arm64_aliases = {"arm64", "aarch64"}  # plain "arm" means 32-bit ARM: excluded
+
+    def _field(key):
+        return (system.get(key) or "").lower()
+
+    backend = _field("backend").strip()
+    arch = _field("cpu_arch").strip()
+    names = f"{_field('cpu_name')} {_field('gpu_name')}"
+
+    # 1. Backend is already canonical, or is a raw CPU-architecture alias.
+    if backend in ("cpu_x86", "cpu_arm"):
+        return backend
+    if backend in x86_aliases:
+        return "cpu_x86"
+    if backend in arm64_aliases:
+        return "cpu_arm"
+
+    # 2. Explicit CPU architecture field (the only place "x86" is accepted).
+    if arch in x86_aliases or arch == "x86":
+        return "cpu_x86"
+    if arch in arm64_aliases:
+        return "cpu_arm"
+    # 3. Apple Silicon enters ranking as backend="metal"; its CPU path is ARM.
+    if backend in ("metal", "mps", "apple") or "apple" in names:
+        return "cpu_arm"
+    # 4. Conservative default for CUDA/ROCm/discrete GPU backends and unknowns.
+    return "cpu_x86"
+
+
+def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0):
+    """Estimate generation speed in tok/s for ``model`` at ``quant`` on ``system``.
+
+    Active parameters are used, so an MoE model is costed by the experts that
+    run per token rather than by its total size.
+
+    offload_frac (0..1) is the share of the weights that spills to system RAM
+    because it does not fit VRAM. Every active weight is read once per token,
+    so the effective bandwidth is a blend of VRAM and system-RAM bandwidth
+    weighted by what lives where — far more accurate than a flat "halve it",
+    which under/over-shoots depending on the amount offloaded.
+    Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with
+    light offload → ~59 t/s est vs 59.8 measured.
+
+    Without a known GPU bandwidth, or for cpu_only runs, the estimate is
+    FALLBACK_K[backend] / active params * the quant's speed multiplier.
+    """
+    active_b = _active_params_b(model)
+    moe_factor = 0.8 if model.get("is_moe", False) else 1.0
+    gpu_bw = _lookup_bandwidth(system)
+
+    # CPU-only inference must never inherit a GPU backend's fallback constant,
+    # even if the detected system happens to report a CUDA/Metal/ROCm backend.
+    if run_mode == "cpu_only":
+        backend = _canonical_cpu_backend(system)
+    else:
+        backend = system.get("backend", "cpu_x86")
+
+    if gpu_bw and run_mode in ("gpu", "cpu_offload"):
+        weights_gb = active_b * QUANT_BYTES_PER_PARAM.get(quant, 0.5)
+        if weights_gb <= 0:
+            return 0.0
+        eff_bw = gpu_bw  # fully on GPU unless blended below
+        if run_mode == "cpu_offload":
+            # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems are higher, but stay
+            # conservative since offloaded MoE is also compute-bound on CPU.
+            ram_bw = 55.0
+            # Legacy callers pass 0 with cpu_offload (fraction unknown): assume
+            # a meaningful spill so the estimate does not come out too high.
+            spill = min(max(offload_frac, 0.0), 1.0)
+            if spill <= 0.0:
+                spill = 0.5
+            # Harmonic-style blend: time = spill/ram_bw + (1-spill)/gpu_bw, so
+            # the slow CPU portion dominates as it grows (matches the steep
+            # real-world drop-off when more experts offload).
+            eff_bw = 1.0 / (spill / ram_bw + (1.0 - spill) / gpu_bw)
+        # 0.55 is the bandwidth-efficiency factor applied to every GPU estimate.
+        return (eff_bw / weights_gb) * 0.55 * moe_factor
+
+    if active_b <= 0:
+        return 0.0
+    return FALLBACK_K.get(backend, 70) / active_b * QUANT_SPEED_MULT.get(quant, 1.0)
+
+
+def _architecture_bonus(model):
+    """Return a small quality bonus (0-9) for newer Qwen model generations.
+
+    The model's name and architecture are scanned, case-insensitively, for a
+    family marker; the most specific marker wins. The bonus is intentionally
+    small: hardware fit and speed still matter, but current families should
+    not score the same as older Qwen2/Llama-era entries of similar size.
+    """
+    arch = (model.get("architecture") or "").lower()
+    haystack = f"{(model.get('name') or '').lower()} {arch}"
+    # Ordered most-specific first: a "qwen3.5" name also contains "qwen3".
+    ladder = (
+        (("qwen3.6", "qwen3_6"), 9),
+        (("qwen3.5", "qwen3_5"), 8),
+        (("qwen3-next", "qwen3_next"), 6),
+        (("qwen3",), 4),
+        (("qwen2.5", "qwen2_5"), 2),
+    )
+    return next((pts for marks, pts in ladder if any(m in haystack for m in marks)), 0)
+
+
+def _quality_score(model, quant, use_case):
+    """Score expected output quality of ``model`` at ``quant`` on a 0-100 scale.
+
+    The score starts from a parameter-count tier, then adds family bonuses,
+    the architecture bonus, the quantization penalty and adjustments for how
+    the model's own specialty matches the requested ``use_case``.
+
+    ``model`` is a catalog dict whose ``name`` may be missing or None;
+    ``quant`` is the label looked up in QUANT_QUALITY_PENALTY.
+    """
+    size_b = params_b(model)
+    # (exclusive upper bound in billions of params, base score); larger is 95.
+    tiers = ((1, 30), (3, 45), (7, 60), (10, 75), (20, 82), (40, 89))
+    score = next((pts for limit, pts in tiers if size_b < limit), 95)
+
+    # A present-but-None name must not crash the scan (the sibling helpers
+    # guard the same way); bonuses stack when several families match.
+    name = (model.get("name") or "").lower()
+    family_bonus = (
+        (("qwen",), 2),
+        (("deepseek",), 3),
+        (("llama",), 2),
+        (("mistral", "mixtral"), 1),
+        (("gemma",), 1),
+    )
+    score += sum(pts for marks, pts in family_bonus if any(m in name for m in marks))
+
+    score += _architecture_bonus(model)
+    score += QUANT_QUALITY_PENALTY.get(quant, 0)
+
+    specialty = infer_use_case(model)
+    if specialty == "coding":
+        if use_case == "coding":
+            score += 6
+        elif use_case in ("general", "chat"):
+            # Coder-specialized models are still useful generally, but they
+            # should not dominate the default scan; the Coding filter is where
+            # they get the boost above.
+            score -= 10
+    elif specialty == "reasoning":
+        if use_case == "reasoning" and size_b >= 13:
+            score += 5
+        elif use_case == "chat":
+            score -= 4
+    elif specialty == "multimodal" and use_case == "multimodal":
+        score += 6
+
+    return max(0, min(100, score))
+
+
+def _speed_score(tps, use_case):
+    """Map tok/s to 0-100, linear up to the use case's SPEED_TARGET (default 40)."""
+    return max(0, min(100, (tps / SPEED_TARGET.get(use_case, 40)) * 100))
+
+
+def _fit_score(required, available):
+    """Score memory headroom on a 0-100 scale; 0 when the model does not fit.
+
+    Using 50-80% of the budget is the sweet spot (100). Lower usage ramps
+    linearly from 60 up to 100; above 80% the score steps down to 70 and,
+    past 90%, to 50.
+    """
+    if required > available or available <= 0:
+        return 0
+    used = required / available
+    if used <= 0.5:
+        return 60 + (used / 0.5) * 40  # ramp: 60 at no usage, 100 at half
+    return 100 if used <= 0.8 else 70 if used <= 0.9 else 50
+
+
+def _context_score(ctx, use_case):
+    """Score the usable context 100/70/30 against CONTEXT_TARGET (default 4096).
+
+    100 at or above the target, 70 from half the target up, 30 below that.
+    """
+    goal = CONTEXT_TARGET.get(use_case, 4096)
+    return 100 if ctx >= goal else (70 if ctx >= goal / 2 else 30)
+
+
+def _try_quant_at(model, quant, ctx, gpu_vram, available_ram):
+    """Find the largest context at which ``quant`` fits, halving down to 1024.
+
+    ``ctx`` itself is always tried first; VRAM is preferred over system RAM.
+    Returns (run_mode, quant, ctx, mem) or None; run_mode is "gpu",
+    "cpu_offload" (GPU present, fits RAM only) or "cpu_only" (no GPU).
+    """
+    has_gpu = gpu_vram > 0
+    ram_mode = "cpu_offload" if has_gpu else "cpu_only"
+    attempt, first = ctx, True
+    # Try the requested context once, then keep halving while it stays >= 1024.
+    while first or attempt >= 1024:
+        mem = estimate_memory_gb(model, quant, attempt)
+        if has_gpu and mem <= gpu_vram:
+            return "gpu", quant, attempt, mem
+        if mem <= available_ram:
+            return ram_mode, quant, attempt, mem
+        attempt, first = attempt // 2, False
+    return None
+
+
+def _quant_bits(q):
+    """Approximate the bit-width of a quant label; 0 when it is unknown.
+
+    Lets GGUF quant tiers (Q4/Q8/…) be matched against prequantized formats
+    (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit…). Case, "-", "_" and spaces in the label
+    are ignored, and None is accepted. Callers treat 0 as "don't filter".
+    """
+    label = (q or "").upper().translate(str.maketrans("", "", "-_ "))
+    # (prefixes, substrings, bits), checked in order: the first rule that hits wins.
+    rules = (
+        (("Q8", "W8"), ("FP8", "INT8"), 8),
+        (("Q4", "IQ4", "W4"), ("FP4", "NF4", "INT4"), 4),
+        (("Q2", "IQ2"), (), 2),
+        (("Q3", "IQ3"), (), 3),
+        (("Q5",), (), 5),
+        (("Q6",), (), 6),
+        (("F16", "BF16", "F32"), (), 16),  # float formats all report 16
+    )
+    for prefixes, fragments, bits in rules:
+        if label.startswith(prefixes) or any(f in label for f in fragments):
+            return bits
+
+    # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...)
+    found = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", label) or re.search(r"(\d{1,2})BIT", label)
+    if found and 2 <= int(found.group(1)) <= 16:
+        return int(found.group(1))
+    return 0
+
+
+def _native_quant(model):
+    """Return the quantization label the model repo natively ships in.
+
+    Format markers in the name/format (NVFP4, FP8, GPTQ, AWQ, MLX, 8bit) win over the
+    catalog ``quantization``, which falls back to "Q4_K_M" when missing, None or empty.
+    """
+    native_quant = model.get("quantization") or "Q4_K_M"
+    text = f"{(model.get('name') or '').lower()} {(model.get('format') or '').lower()}"
+    if "nvfp4" in text:
+        return "NVFP4"
+    if re.search(r"(^|[-_/])fp8($|[-_/\s])", text):
+        return "FP8"
+    # Emit catalog keys ("GPTQ-Int4", "AWQ-4bit", ...): other spellings miss the BPP/penalty maps.
+    if "gptq" in text:
+        m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
+        return f"GPTQ-Int{m.group(1)}" if m else "GPTQ-Int4"
+    if "awq" in text:
+        m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text)
+        return f"AWQ-{m.group(1)}bit" if m else "AWQ-4bit"
+    if "mlx" in text:
+        m = re.search(r"mlx[-_]?(\d{1,2})bit", text)
+        return f"mlx-{m.group(1)}bit" if m else native_quant
+    if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text):
+        return "INT8"
+    return native_quant
+
+
+def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None):
+    """Evaluate how one catalog model fits ``system`` and score it.
+
+    ``target_quant`` is the quant the user filtered on (None = default),
+    ``scoring_use_case`` selects the score weights (default "general") and
+    ``target_context`` is the requested context, capped at the model's own.
+
+    Returns a result row (dict), or None when the model is skipped (no
+    parameter count, or excluded by the quant / multi-GPU rules). A model that
+    does not fit is still returned, flagged "too_tight" with zeroed scores.
+    """
+    pb = params_b(model)
+    if pb <= 0:
+        return None
+
+    model_use_case = infer_use_case(model)
+    score_use_case = scoring_use_case or "general"
+    has_gpu = system.get("has_gpu", False)
+    gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0
+    gpu_count = system.get("gpu_count", 1) or 1
+    single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram
+    available_ram = system.get("available_ram_gb") or 0  # None-safe, like gpu_vram_gb
+    # When the user explicitly picked a GPU config (not RAM mode) they want to
+    # see what runs ON the GPU(s), not big models that only "fit" by spilling to
+    # system RAM. A zero offload budget leaves _try_quant_at only its GPU
+    # branches. Fixes "96 GB GPU still lists a 175 GB model".
+    gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0
+    eff_ram = 0 if gpu_only else available_ram
+    is_moe = model.get("is_moe", False)
+    model_ctx = model.get("context_length", 4096) or 4096
+    try:
+        target_context = int(target_context or 0)
+    except (TypeError, ValueError):
+        target_context = 0
+    ctx = min(model_ctx, target_context) if target_context > 0 else model_ctx
+
+    native_quant = _native_quant(model)
+    preq = is_prequantized(model)
+
+    # llama.cpp can't shard a GGUF/dense build across GPUs, so those are
+    # limited to one GPU's VRAM. Prequantized formats (AWQ/GPTQ/FP8) are served
+    # sharded by vLLM and get the FULL multi-GPU VRAM — even when the model
+    # also lists a GGUF alternate download (gguf_sources).
+    is_gguf = bool(model.get("gguf_sources"))
+    quant_upper = (native_quant or "").upper()
+    is_gguf_quant = any(quant_upper.startswith(p) for p in ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32"))
+    effective_vram = single_gpu_vram if (is_gguf or is_gguf_quant) and not preq else gpu_vram
+
+    native_gpu_only = preq and not native_quant.startswith("mlx-")
+
+    # Determine which quant to evaluate at
+    native_quant_prefixes = (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    )
+
+    if preq:
+        # Native HF/vLLM quantized repos come at a fixed format. If the user
+        # picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit
+        # AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate
+        # serving paths and only appear when explicitly selected or unfiltered.
+        if target_quant:
+            if not any(target_quant.startswith(p) for p in native_quant_prefixes):
+                return None
+            _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant)
+            if _tb and _nb and _tb != _nb:
+                return None
+        quant_to_try = native_quant
+    elif target_quant:
+        quant_to_try = target_quant  # user picked a specific quant
+    elif gpu_count >= 2:
+        # Multi-GPU box: vLLM/SGLang can't serve GGUF Q* quants (those are
+        # llama.cpp-only), so default non-prequantized models to BF16. If BF16
+        # doesn't fit, the model surfaces as too_tight — better than showing a
+        # Q4 row the user can't actually serve with vLLM on >1 GPU.
+        quant_to_try = "BF16"
+    else:
+        # Default: Q4_K_M (user's stated preference) — kept for single-GPU
+        # and RAM modes where llama.cpp serving is the natural path.
+        quant_to_try = "Q4_K_M"
+
+    # Multi-GPU filter: skip the row if the resolved quant is a GGUF tier
+    # (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on
+    # a 2+ GPU rig just clutters the list with unservable candidates.
+    if gpu_count >= 2 and quant_to_try and not target_quant and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")):
+        return None
+
+    result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram)
+
+    # Identity fields shared by every row this function returns.
+    row = {
+        "name": model.get("name"),
+        "provider": model.get("provider"),
+        "parameter_count": model.get("parameter_count"),
+        "params_b": round(pb, 1),
+        "is_moe": is_moe,
+        "use_case": model_use_case,
+    }
+    # Trailing fields, likewise shared. release_date is set on rows that do not
+    # fit as well, so a sort by release date orders them by date too.
+    tail = {
+        "gguf_sources": model.get("gguf_sources", []),
+        "context_length": model_ctx,
+        "release_date": model.get("release_date", ""),
+        "target_context": target_context or None,
+    }
+
+    if result is None:
+        # Model doesn't fit on the user's current hardware. Surface it anyway
+        # with a "too_tight" badge instead of silently dropping it — otherwise
+        # editing the hardware config to try LARGER tiers never reveals the
+        # bigger models. The client already renders too_tight (red row).
+        oversized_required = estimate_memory_gb(model, quant_to_try, ctx)
+        return {
+            **row,
+            "fit_level": "too_tight",
+            "run_mode": "no_fit",
+            "quant": quant_to_try,
+            "context": ctx,
+            "required_gb": round(oversized_required, 1),
+            "speed_tps": 0,
+            "score": 0,
+            "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0},
+            **tail,
+        }
+
+    run_mode, quant, fit_ctx, required_gb = result
+
+    # Determine fit level. NOTE(review): the gpu branch compares against total
+    # gpu_vram while the budget uses effective_vram — confirm for multi-GPU GGUF.
+    budget = effective_vram if run_mode == "gpu" else available_ram
+    if required_gb > budget:
+        return None
+    if run_mode == "gpu":
+        rec = model.get("recommended_ram_gb") or required_gb
+        if rec <= gpu_vram:
+            fit_level = "perfect"
+        elif gpu_vram >= required_gb * 1.2:
+            fit_level = "good"
+        else:
+            fit_level = "marginal"
+    elif run_mode == "cpu_offload":
+        fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal"
+    else:
+        fit_level = "marginal"
+
+    # Fraction of the model that spills to CPU RAM (drives the offload speed
+    # model). When offloading, anything beyond the GPU's VRAM lives in system RAM.
+    offload_frac = 0.0
+    if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0:
+        offload_frac = max(0.0, (required_gb - effective_vram) / required_gb)
+    tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac)
+
+    q_score = _quality_score(model, quant, score_use_case)
+    s_score = _speed_score(tps, score_use_case)
+    f_score = _fit_score(required_gb, budget)
+    c_score = _context_score(fit_ctx, score_use_case)
+
+    wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10))
+    composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc
+
+    return {
+        **row,
+        "fit_level": fit_level,
+        "run_mode": run_mode,
+        "quant": quant,
+        "context": fit_ctx,
+        "required_gb": round(required_gb, 1),
+        "speed_tps": round(tps, 1),
+        "score": round(composite, 1),
+        "scores": {"quality": round(q_score, 1), "speed": round(s_score, 1), "fit": round(f_score, 1), "context": round(c_score, 1)},
+        **tail,
+    }
+
+
+def _version_key(name):
+    """Extract a version number from a model display name for tie-breaking.
+
+    Equal-score rows are ordered in favor of the newer release with this key
+    (e.g. M2.7 > M2.5). The version is the first number that directly follows
+    a letter and is not itself followed by a letter: 'MiniMax-M2.7' -> 2.7,
+    'Qwen3.6-35B' -> 3.6, 'M2' -> 2.0.
+
+    Args:
+        name: model display name; may be empty or None.
+
+    Returns:
+        The version as a float, or 0.0 when the name is empty or holds no
+        recognizable version token.
+    """
+    if not name:
+        return 0.0
+
+    # A letter, then digits with an optional decimal part, then no letter. The
+    # lookahead is there to ignore "B" param-count suffixes (Qwen3-235B should
+    # yield 3, not 235).
+    marker = re.compile(r"[A-Za-z](\d+(?:\.\d+)?)(?![A-Za-z])")
+    candidates = (hit.group(1) for hit in marker.finditer(name))
+    # Bare integers >= 100 are almost certainly param counts, not versions.
+    versions = (float(c) for c in candidates if "." in c or float(c) < 100)
+    return next(versions, 0.0)
+
+
+SORT_KEYS = {  # sort name -> key function over one result row (dict); unknown names fall back to "score" at the call site
+    # Score sort with version-aware tiebreaker — when two rows tie on
+    # composite score (a common case for the SAME base model in different
+    # versions, e.g. MiniMax-M2.5 vs M2.7 both at the same FP8 budget),
+    # prefer the newer version. Without this, ties resolved to whatever
+    # order they came out of the registry, which let older releases land
+    # above newer ones in user-facing lists.
+    "score": lambda r: (r["score"], _version_key(r.get("name") or "")),
+    "speed": lambda r: r["speed_tps"],
+    "vram": lambda r: r["required_gb"],
+    "params": lambda r: r["params_b"],
+    "context": lambda r: r["context"],
+    # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain
+    # string sort is chronological. Missing dates sort last (empty < any date,
+    # and we sort reverse=True for newest, so "" lands at the bottom).
+    "newest": lambda r: r.get("release_date") or "",
+}
+
+
+def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None, fit_only=False):
+    """Rank all models against detected hardware. Returns sorted list of fit results.
+
+    fit_only: when True, drop rows whose fit_level is "too_tight" (model doesn't
+    actually fit on the chosen budget). When False (default), every model is
+    shown — sorting by Param means highest-param PERIOD, even ones that won't
+    run, so the user can see the truth.
+    """
+    models = get_models()
+    results = []
+
+    # Include image gen models only when explicitly filtered
+    if use_case == "image_gen":
+        try:
+            from services.hwfit.image_models import rank_image_models
+        except ImportError:
+            rank_image_models = None
+        if rank_image_models:
+            img_results = rank_image_models(system, search=search)
+        else:
+            img_results = []
+        for im in img_results:
+            fit_map = {"perfect": "perfect", "good": "good", "tight": "marginal", "no_fit": "too_tight", "no_gpu": "too_tight"}
+            results.append({
+                "name": im["id"],
+                "provider": im["provider"],
+                "parameter_count": f"{im['params_b']}B",
+                "params_b": im["params_b"],
+                "is_moe": False,
+                "use_case": "image_gen",
+                "fit_level": fit_map.get(im["fit"], "too_tight"),
+                "run_mode": "gpu" if im["fits"] else "no_fit",
+                "quant": im.get("quant", "BF16"),
+                "context": 0,
+                "context_length": 0,
+                "required_gb": round(im.get("vram_needed") or 0, 1),
+                "speed_tps": 0,
+                "score": float(im["score"]),
+                "scores": {"quality": float(im["quality"]), "speed": float(im["speed"]), "fit": 0, "context": 0},
+                "gguf_sources": [],
+                "is_image_gen": True,
+                "capabilities": im.get("capabilities", []),
+                "description": im.get("description", ""),
+            })
+        if use_case == "image_gen":
+            sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
+            results.sort(key=sort_fn, reverse=True)  # see main path below
+            return results[:limit]
+
+    # If user picked a native prequantized format, filter to only those models.
+    filter_native = quant and any(quant.startswith(p) for p in (
+        "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4",
+        "INT4", "INT8", "W4A16", "W8A8", "W8A16",
+    ))
+
+    system_backend = (system.get("backend") or "").lower()
+    apple_silicon = system_backend in ("mps", "metal", "apple")
+    rocm = system_backend == "rocm"
+    is_windows = system.get("platform") == "windows"
+
+    # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path
+    # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter
+    # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels
+    # are largely unsupported there and FP8 needs out-of-tree patches. So treat
+    # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched.
+    # Unknown family (no rocminfo) is left untouched to avoid hiding models from
+    # a possibly-capable Instinct box on a misdetect.
+    gpu_family = (system.get("gpu_family") or "").lower()
+    consumer_amd = system_backend == "rocm" and gpu_family == "rdna"
+
+    for m in models:
+        native_q = _native_quant(m)
+
+        # MLX needs the mlx_lm runtime, which Odysseus does not generate serve
+        # commands for. Hide it on every backend, including Metal.
+        if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower():
+            continue
+
+        # ROCm support for vLLM/SGLang quantized safetensors is too brittle to
+        # recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable
+        # only when the user explicitly picks that format from the quant filter;
+        # otherwise prefer GGUF/Q* entries that Odysseus can route through
+        # llama.cpp/Ollama without pretending "fits VRAM" means "servable".
+        if rocm and is_prequantized(m) and not filter_native:
+            continue
+
+        # On Apple Silicon the only serving engines are llama.cpp and Ollama,
+        # both GGUF-only (vLLM/SGLang are CUDA/ROCm and don't run on macOS). So
+        # a model is Metal-servable ONLY if it ships a real GGUF. Drop everything
+        # else — raw safetensors repos (which the catalog still tags with a
+        # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without
+        # this the Cookbook recommends models the Mac can't run; on CUDA these
+        # stay visible because vLLM serves safetensors directly.
+        #
+        # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the
+        # servable path, so a model needs a real GGUF to be recommended.
+        # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a
+        # Radeon that can't actually serve them.
+        #
+        # Windows is the same: Odysseus only supports llama.cpp on Windows,
+        # which requires GGUF. vLLM/SGLang are explicitly blocked, so AWQ/GPTQ
+        # models without a GGUF source are unservable there.
+        if (apple_silicon or consumer_amd or is_windows) and not (m.get("is_gguf") or m.get("gguf_sources")):
+            continue
+
+        # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc.
+        if filter_native:
+            if quant == "FP8" and native_q != "FP8":
+                continue
+            if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"):
+                continue
+            if quant.startswith("AWQ") and not native_q.startswith("AWQ"):
+                continue
+            if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"):
+                continue
+            if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"):
+                continue
+            if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant:
+                continue
+
+        if search:
+            name = m.get("name", "").lower()
+            provider = m.get("provider", "").lower()
+            if search.lower() not in name and search.lower() not in provider:
+                continue
+
+        result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context)
+        if result is None:
+            continue
+
+        if use_case:
+            model_uc = infer_use_case(m)
+            if use_case != model_uc and use_case != "general":
+                continue
+
+        results.append(result)
+
+    # Pick the visible SET by the REQUESTED column. Per-user feedback: sorting
+    # by Param should show the highest-param models PERIOD, not just those that
+    # already fit. Same for every other column. Models that don't fit are still
+    # in the list with their fit_level marking the constraint, so the user can
+    # see the truth instead of a quietly-truncated view. Score sort is unchanged
+    # (it's the default ranking and naturally pushes non-fits to the bottom).
+    if fit_only:
+        # Hide rows that definitely don't fit (the "too_tight" badge) — user
+        # explicitly asked for a Fit-only view.
+        results = [r for r in results if r.get("fit_level") != "too_tight"]
+    sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"])
+    # Always sort descending then truncate top-N so each column shows the
+    # global highest by that metric. Before, vram was special-cased
+    # ascending → truncate kept the 50 SMALLEST models and "highest VRAM"
+    # could never appear, breaking the column-click toggle.
+    results.sort(key=sort_fn, reverse=True)
+    results = results[:limit]
+    return results
diff --git a/static/css/base.css b/static/css/base.css
index 2e82699..38f291c 100644
--- a/static/css/base.css
+++ b/static/css/base.css
@@ -20,15 +20,19 @@
   /* Schrift */
   --mono:ui-monospace,"SF Mono",Menlo,Consolas,"Liberation Mono",monospace;
   --sans:system-ui,-apple-system,"Segoe UI",Roboto,sans-serif;
+  /* Typografie Skala */
+  --text-xs: 11.5px; --text-sm: 13px; --text-base: 14px; --text-lg: 16px; --text-xl: 20px; --text-2xl: 24px;
+  /* Spacing Skala */
+  --sp-1: 4px; --sp-2: 8px; --sp-3: 12px; --sp-4: 16px; --sp-5: 20px; --sp-6: 24px; --sp-8: 32px; --sp-10: 40px;
   /* Maße */
-  --side:62px; --radius:14px;
+  --side:62px; --radius:6px;
 }
 
 *{box-sizing:border-box}
 html,body{height:100%}
 body{
   margin:0;background:var(--bg);color:var(--tx);
-  font-family:var(--sans);font-size:14.5px;line-height:1.5;
+  font-family:var(--sans);font-size:var(--text-base);line-height:1.5;
   -webkit-font-smoothing:antialiased;
 }
 a{color:var(--act);text-decoration:none}
@@ -41,7 +45,7 @@ a{color:var(--act);text-decoration:none}
   width:var(--side);flex:0 0 var(--side);
   background:var(--bg2);border-right:1px solid var(--line);
   display:flex;flex-direction:column;align-items:center;
-  padding:14px 0;gap:6px;position:sticky;top:0;height:100vh;
+  padding:var(--sp-4) 0;gap:var(--sp-2);position:sticky;top:0;height:100vh;
 }
 .side-logo{
   width:34px;height:34px;border-radius:9px;margin-bottom:10px;
@@ -51,7 +55,7 @@ a{color:var(--act);text-decoration:none}
 .side-nav{display:flex;flex-direction:column;gap:4px;flex:1}
 .side-foot{margin-top:auto}
 .nav-item{
-  width:40px;height:40px;border-radius:10px;display:grid;place-items:center;
+  width:40px;height:40px;border-radius:var(--radius);display:grid;place-items:center;
   color:var(--mut);cursor:pointer;border:1px solid transparent;transition:.15s;
 }
 .nav-item:hover{color:var(--tx);background:var(--panel)}
@@ -67,50 +71,50 @@ a{color:var(--act);text-decoration:none}
 
 .topbar{
   position:sticky;top:0;z-index:20;
-  display:flex;align-items:center;gap:14px;flex-wrap:wrap;
-  padding:12px 26px;background:rgba(10,13,18,.86);backdrop-filter:blur(8px);
+  display:flex;align-items:center;gap:var(--sp-4);flex-wrap:wrap;
+  padding:var(--sp-3) var(--sp-6);background:rgba(10,13,18,.86);backdrop-filter:blur(8px);
   border-bottom:1px solid var(--line);
 }
 .spacer{flex:1}
 .status-pill{
-  display:inline-flex;align-items:center;gap:8px;font-family:var(--mono);font-size:12.5px;
-  padding:6px 12px;border:1px solid var(--line);border-radius:999px;color:var(--mut);background:var(--panel);
+  display:inline-flex;align-items:center;gap:var(--sp-2);font-family:var(--mono);font-size:var(--text-xs);
+  padding:var(--sp-1) var(--sp-3);border:1px solid var(--line);border-radius:999px;color:var(--mut);background:var(--panel);
 }
 .dot{width:8px;height:8px;border-radius:50%;background:var(--mut);flex:0 0 auto}
 .dot.on{background:var(--on);box-shadow:0 0 0 0 rgba(63,185,80,.5);animation:pulse 2.2s infinite}
 .dot.off{background:var(--err)}
 @keyframes pulse{0%{box-shadow:0 0 0 0 rgba(63,185,80,.45)}70%{box-shadow:0 0 0 7px rgba(63,185,80,0)}100%{box-shadow:0 0 0 0 rgba(63,185,80,0)}}
-.top-stat{font-size:12.5px;color:var(--mut)}
+.top-stat{font-size:var(--text-sm);color:var(--mut)}
 .top-stat b{color:var(--tx);font-family:var(--mono);font-weight:600;margin-left:4px}
-.top-clock{font-family:var(--mono);font-size:12.5px;color:var(--act)}
+.top-clock{font-family:var(--mono);font-size:var(--text-sm);color:var(--act)}
 .tokin{
-  font-family:var(--mono);font-size:12.5px;background:var(--panel);border:1px solid var(--line);
-  color:var(--tx);border-radius:8px;padding:7px 10px;width:128px;
+  font-family:var(--mono);font-size:var(--text-sm);background:var(--panel);border:1px solid var(--line);
+  color:var(--tx);border-radius:var(--radius);padding:var(--sp-2) var(--sp-3);width:128px;
 }
 .tokin:focus{outline:none;border-color:var(--act)}
 
 /* ---- Content-Bereich ---- */
-.content{padding:22px 26px 64px;max-width:1300px;margin:0 auto;width:100%}
+.content{padding:var(--sp-5) var(--sp-6) var(--sp-10);max-width:1300px;margin:0 auto;width:100%}
 .view[hidden]{display:none}
-.view{display:flex;flex-direction:column;gap:18px}
+.view{display:flex;flex-direction:column;gap:var(--sp-4)}
 
 /* Raster-Helfer */
-.grid{display:grid;gap:18px}
-.grid-3{grid-template-columns:1fr 1fr 1fr}
-.grid-2{grid-template-columns:1fr 1fr}
+.grid{display:grid;gap:var(--sp-4)}
+.grid-3{grid-template-columns:repeat(auto-fit, minmax(300px, 1fr))}
+.grid-2{grid-template-columns:repeat(auto-fit, minmax(400px, 1fr))}
 .kpis{grid-template-columns:repeat(5,1fr)}
 @media(max-width:1180px){.grid-3{grid-template-columns:1fr 1fr}.kpis{grid-template-columns:repeat(2,1fr)}}
 @media(max-width:760px){.grid-3,.grid-2,.kpis{grid-template-columns:1fr}}
 
 /* Alert-Banner */
 .alert{
-  margin:14px 26px 0;padding:14px 18px;border-radius:12px;
-  background:linear-gradient(90deg,rgba(229,83,75,.16),rgba(229,83,75,.04));
-  border:1px solid rgba(229,83,75,.32);color:#ffcdc8;
-  display:flex;align-items:center;gap:12px;font-size:13.5px;
+  margin:var(--sp-4) var(--sp-6) 0;padding:var(--sp-3) var(--sp-4);border-radius:var(--radius);
+  background:rgba(229,83,75,.08);
+  border:1px solid rgba(229,83,75,.2);color:#ffcdc8;
+  display:flex;align-items:center;gap:var(--sp-3);font-size:var(--text-sm);
 }
 .alert .a-dot{width:8px;height:8px;border-radius:50%;background:var(--err);flex:0 0 auto}
 .alert b{color:#ffe2de}
-.alert.warn{background:linear-gradient(90deg,rgba(224,163,46,.15),rgba(224,163,46,.03));
-  border-color:rgba(224,163,46,.32);color:#f3dca6}
+.alert.warn{background:rgba(224,163,46,.08);
+  border-color:rgba(224,163,46,.2);color:#f3dca6}
 .alert.warn .a-dot{background:var(--warn)}
diff --git a/static/css/components.css b/static/css/components.css
index a300dfc..8784c9c 100644
--- a/static/css/components.css
+++ b/static/css/components.css
@@ -5,7 +5,7 @@
 /* ---- Karte (Grundbaustein) ---- */
 .card{
   background:var(--panel);border:1px solid var(--line);border-radius:var(--radius);
-  padding:18px 20px;
+  padding:var(--sp-3) var(--sp-4);
 }
 .card-h{display:flex;align-items:center;gap:10px;margin:0 0 14px}
 .card-h h3{font-size:15px;font-weight:600;margin:0;flex:1;color:var(--tx)}
@@ -14,11 +14,9 @@
 
 /* ---- Hero (Overview-Kopf) ---- */
 .hero{
-  background:
-    radial-gradient(120% 140% at 100% 0%,rgba(68,147,224,.10),transparent 60%),
-    var(--panel);
+  background:var(--panel);
   border:1px solid var(--line);border-radius:var(--radius);
-  padding:26px 28px;display:flex;justify-content:space-between;gap:24px;flex-wrap:wrap;
+  padding:var(--sp-4) var(--sp-5);display:flex;justify-content:space-between;gap:var(--sp-6);flex-wrap:wrap;
 }
 .hero .eyebrow{font-size:11.5px;letter-spacing:.22em;text-transform:uppercase;color:var(--mut)}
 .hero h1{font-size:26px;font-weight:650;margin:8px 0 6px}
@@ -34,24 +32,24 @@
 
 /* ---- KPI-Kacheln ---- */
 .kpi{
-  position:relative;border-radius:var(--radius);padding:18px 20px;
+  position:relative;border-radius:var(--radius);padding:var(--sp-3) var(--sp-4);
   border:1px solid var(--line);background:var(--panel);overflow:hidden;
 }
-.kpi .k-h{display:flex;align-items:flex-start;justify-content:space-between;gap:8px}
-.kpi .k-t{font-size:13.5px;color:var(--mut)}
+.kpi .k-h{display:flex;align-items:flex-start;justify-content:space-between;gap:var(--sp-2)}
+.kpi .k-t{font-size:var(--text-sm);color:var(--mut)}
 .kpi .k-ic{color:var(--mut);opacity:.85}
-.kpi .k-ic svg{width:20px;height:20px}
-.kpi .k-v{font-family:var(--mono);font-size:32px;font-weight:600;line-height:1.1;margin:14px 0 4px;color:var(--tx)}
-.kpi .k-v small{font-size:15px;color:var(--mut);font-weight:400}
-.kpi .k-s{font-size:12px;color:var(--mut)}
-/* Farb-Varianten */
-.kpi.green {background:linear-gradient(160deg,var(--t-green),transparent 70%);border-color:var(--b-green)}
+.kpi .k-ic svg{width:18px;height:18px}
+.kpi .k-v{font-family:var(--mono);font-size:var(--text-2xl);font-weight:600;line-height:1.1;margin:var(--sp-3) 0 var(--sp-1);color:var(--tx)}
+.kpi .k-v small{font-size:var(--text-base);color:var(--mut);font-weight:400}
+.kpi .k-s{font-size:var(--text-xs);color:var(--mut)}
+/* Farb-Varianten (flach) */
+.kpi.green {border-top:2px solid var(--on)}
 .kpi.green  .k-v,.kpi.green  .k-ic{color:var(--on)}
-.kpi.blue  {background:linear-gradient(160deg,var(--t-blue),transparent 70%);border-color:var(--b-blue)}
+.kpi.blue  {border-top:2px solid var(--act)}
 .kpi.blue   .k-v,.kpi.blue   .k-ic{color:var(--act)}
-.kpi.purple{background:linear-gradient(160deg,var(--t-purple),transparent 70%);border-color:var(--b-purple)}
+.kpi.purple{border-top:2px solid var(--purple)}
 .kpi.purple .k-v,.kpi.purple .k-ic{color:var(--purple)}
-.kpi.red   {background:linear-gradient(160deg,var(--t-red),transparent 70%);border-color:var(--b-red)}
+.kpi.red   {border-top:2px solid var(--err)}
 .kpi.red    .k-v,.kpi.red    .k-ic{color:var(--err)}
 .kpi.muted .k-v{color:var(--dim)}
 
@@ -157,3 +155,36 @@ button:disabled{opacity:.5;cursor:not-allowed}
 .guide-acc summary::after { content: "▼"; font-size: 10px; color: var(--mut); transition: transform 0.2s; }
 .guide-acc[open] summary::after { transform: rotate(180deg); }
 .guide-acc .acc-body { padding: 0 20px 20px 20px; }
+
+/* ---- Utilities & Layout Helpers ---- */
+.flex { display: flex; }
+.flex-col { display: flex; flex-direction: column; }
+.items-center { align-items: center; }
+.justify-between { justify-content: space-between; }
+.gap-2 { gap: var(--sp-2); }
+.gap-3 { gap: var(--sp-3); }
+.mt-2 { margin-top: var(--sp-2); }
+.mt-3 { margin-top: var(--sp-3); }
+.mt-4 { margin-top: var(--sp-4); }
+.mt-6 { margin-top: var(--sp-6); }
+.mb-2 { margin-bottom: var(--sp-2); }
+.mb-4 { margin-bottom: var(--sp-4); }
+.text-sm { font-size: var(--text-sm); }
+.text-xs { font-size: var(--text-xs); }
+.text-mut { color: var(--mut); }
+.text-act { color: var(--act); }
+
+/* ---- Interactive cards (A11y) ---- */
+.card-btn {
+  display: block; width: 100%; text-align: left;
+  background: var(--panel); border: 1px solid var(--line); border-radius: var(--radius);
+  padding: var(--sp-3) var(--sp-4); cursor: pointer; transition: border-color 0.15s, background 0.15s;
+  color: var(--tx); font-family: var(--sans);
+}
+/* Hover and keyboard focus share the accent border; focus additionally keeps a visible ring instead of outline:none. */
+.card-btn:hover { border-color: var(--act); background: var(--bg2); }
+.card-btn:focus-visible {
+  border-color: var(--act); background: var(--bg2); outline: 2px solid var(--act); outline-offset: 2px;
+}
+.card-btn h3 { margin: 0; font-size: var(--text-lg); font-weight: 600; }
+.card-btn p { margin: var(--sp-2) 0 0; font-size: var(--text-sm); color: var(--mut); }
diff --git a/static/index.html b/static/index.html
index a4b3fe0..00eacf6 100644
--- a/static/index.html
+++ b/static/index.html
@@ -43,9 +43,9 @@
       <section class="view" data-view="overview">
         <div id="hero"></div>
         
-        <div class="grid grid-3" id="ov-quick" style="margin-top: 24px"></div>
+        <div class="grid grid-3 mt-6" id="ov-quick"></div>
         
-        <div class="grid grid-2" style="margin-top: 24px; align-items: start;">
+        <div class="grid grid-2 mt-6" style="align-items: start;">
           <div class="card" id="ov-models"></div>
           <div class="card" id="ov-recent-jobs"></div>
         </div>
@@ -86,10 +86,10 @@
     <button id="sm-close" class="ghost" style="position:absolute; top:12px; right:12px;">Schließen</button>
     <h2 style="margin-top:0">Einstellungen</h2>
     
-    <div style="margin-top:24px">
+    <div class="mt-6">
       <label>API Token (Authentifizierung)</label>
       <input id="token" class="tokin" placeholder="Optionales API Token..." autocomplete="off">
-      <div class="meta" style="font-size:12px; margin-top:4px;">Wird für die WebSockets und API Calls genutzt, falls der Server geschützt ist.</div>
+      <div class="meta text-xs mt-2">Wird für die WebSockets und API Calls genutzt, falls der Server geschützt ist.</div>
     </div>
   </div>
 </div>
diff --git a/static/js/panels/models.js b/static/js/panels/models.js
index ffb1eb2..33d6138 100644
--- a/static/js/panels/models.js
+++ b/static/js/panels/models.js
@@ -20,7 +20,7 @@ function mount() {
 
   $("#m-table").innerHTML = `
     <div class="card-h"><h3>Modelle &amp; Ports</h3><span class="meta" id="m-count"></span></div>
-    <div class="hint" style="margin-bottom: 16px;">
+    <div class="hint mb-4">
       💡 <b>Modelle werden automatisch geladen</b>, sobald eine Chat-Anfrage an sie gestellt wird. Du musst sie nicht manuell starten.
     </div>
     <table>
@@ -36,13 +36,13 @@ function mount() {
         <h2 style="margin-top:0">Modell konfigurieren</h2>
         <p class="meta" id="cfg-model-name"></p>
         
-        <div style="margin-top:24px">
+        <div class="mt-6">
           <label>Context-Size (Tokens)</label>
           <input id="cfg-ctx" type="number" class="tokin" value="8192">
-          <div class="meta" style="font-size:12px; margin-top:4px;">Höhere Werte erlauben längere Dokumente, brauchen aber mehr VRAM.</div>
+          <div class="meta text-xs mt-2">Höhere Werte erlauben längere Dokumente, brauchen aber mehr VRAM.</div>
         </div>
 
-        <button class="primary" id="cfg-save" style="width:100%; margin-top:24px; padding:12px">Speichern</button>
+        <button class="primary mt-6" id="cfg-save" style="width:100%; padding:var(--sp-3)">Speichern</button>
       </div>
     </div>
   `;
diff --git a/static/js/panels/overview.js b/static/js/panels/overview.js
index d5c788d..d14f592 100644
--- a/static/js/panels/overview.js
+++ b/static/js/panels/overview.js
@@ -29,29 +29,29 @@ function renderHero() {
 function renderQuickActions() {
   // 3 Kacheln (Cookbook, Server-Status, Aktivität/Guides)
   $("#ov-quick").innerHTML = `
-    <div class="card" style="cursor:pointer; display:flex; flex-direction:column; gap:12px; transition:border-color 0.2s" onclick="document.querySelector('.nav-item[data-view=\\'cookbook\\']').click()" onmouseover="this.style.borderColor='var(--act)'" onmouseout="this.style.borderColor='var(--line)'">
-      <div style="display:flex; justify-content:space-between; align-items:center;">
-        <h3 style="margin:0; font-size:16px;">Modell finden</h3>
-        <span style="color:var(--act)">${icon("book")}</span>
+    <button class="card-btn" onclick="document.querySelector('.nav-item[data-view=\\'cookbook\\']').click()">
+      <div class="flex justify-between items-center">
+        <h3>Modell finden</h3>
+        <span class="text-act">${icon("book")}</span>
       </div>
-      <p style="margin:0; font-size:13px; color:var(--mut);">Durchsuche HuggingFace nach neuen Modellen im Cookbook.</p>
-    </div>
+      <p>Durchsuche HuggingFace nach neuen Modellen im Cookbook.</p>
+    </button>
     
-    <div class="card" style="cursor:pointer; display:flex; flex-direction:column; gap:12px; transition:border-color 0.2s" onclick="document.querySelector('.nav-item[data-view=\\'activity\\']').click()" onmouseover="this.style.borderColor='var(--act)'" onmouseout="this.style.borderColor='var(--line)'">
-      <div style="display:flex; justify-content:space-between; align-items:center;">
-        <h3 style="margin:0; font-size:16px;">Live Metriken</h3>
-        <span style="color:var(--act)">${icon("pulse")}</span>
+    <button class="card-btn" onclick="document.querySelector('.nav-item[data-view=\\'activity\\']').click()">
+      <div class="flex justify-between items-center">
+        <h3>Live Metriken</h3>
+        <span class="text-act">${icon("pulse")}</span>
       </div>
-      <p style="margin:0; font-size:13px; color:var(--mut);">${SYS ? `System läuft (RAM: ${SYS.ram.percent.toFixed(0)}%, CPU: ${SYS.cpu.percent.toFixed(0)}%)` : 'Lade Metriken...'}</p>
-    </div>
+      <p>${SYS ? `System läuft (RAM: ${SYS.ram.percent.toFixed(0)}%, CPU: ${SYS.cpu.percent.toFixed(0)}%)` : 'Lade Metriken...'}</p>
+    </button>
 
-    <div class="card" style="cursor:pointer; display:flex; flex-direction:column; gap:12px; transition:border-color 0.2s" onclick="document.querySelector('.nav-item[data-view=\\'server\\']').click()" onmouseover="this.style.borderColor='var(--act)'" onmouseout="this.style.borderColor='var(--line)'">
-      <div style="display:flex; justify-content:space-between; align-items:center;">
-        <h3 style="margin:0; font-size:16px;">Wartung</h3>
-        <span style="color:var(--act)">${icon("server")}</span>
+    <button class="card-btn" onclick="document.querySelector('.nav-item[data-view=\\'server\\']').click()">
+      <div class="flex justify-between items-center">
+        <h3>Wartung</h3>
+        <span class="text-act">${icon("server")}</span>
       </div>
-      <p style="margin:0; font-size:13px; color:var(--mut);">Server neustarten, VRAM leeren oder Engine aktualisieren.</p>
-    </div>
+      <p>Server neustarten, VRAM leeren oder Engine aktualisieren.</p>
+    </button>
   `;
 }
 
@@ -107,9 +107,9 @@ function renderRecentJobs() {
     ${latest.length
       ? `<div class="list">
           ${latest.map(j => `
-            <div class="li" style="padding:10px 4px">
+            <div class="li">
               <div class="li-main">
-                <div class="li-id" style="font-size:12.5px">${esc(j.label)}</div>
+                <div class="li-id text-sm">${esc(j.label)}</div>
               </div>
               <div class="li-right">${statusBadge(j.state)}</div>
             </div>
diff --git a/update.tar.gz b/update.tar.gz
new file mode 100644
index 0000000..312463b
Binary files /dev/null and b/update.tar.gz differ