diff --git a/deploy.py b/deploy.py new file mode 100644 index 0000000..a61856d --- /dev/null +++ b/deploy.py @@ -0,0 +1,40 @@ +import paramiko +import os + +host = '192.168.178.153' +user = 'hitonabi' +password = 'Tu77ceu2zzvx!' + +print("Connecting to server...") +ssh = paramiko.SSHClient() +ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) +ssh.connect(host, username=user, password=password) + +print("Uploading update.tar.gz...") +sftp = ssh.open_sftp() +sftp.put('update.tar.gz', '/home/hitonabi/update.tar.gz') +sftp.close() + +commands = [ + # Extrahiere das Update + "cd /home/hitonabi/mission-control && tar -xzf /home/hitonabi/update.tar.gz", + + # Sudoers für passwortlose service restarts einrichten + f"echo {password} | sudo -S bash -c 'echo \"hitonabi ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart mission-control, /usr/bin/systemctl restart llama-swap, /usr/bin/journalctl\" > /etc/sudoers.d/mission-control'", + f"echo {password} | sudo -S chmod 440 /etc/sudoers.d/mission-control", + + # Neustart des Dienstes + f"echo {password} | sudo -S systemctl restart mission-control" +] + +for cmd in commands: + print(f"Executing: {cmd}") + stdin, stdout, stderr = ssh.exec_command(cmd) + exit_status = stdout.channel.recv_exit_status() + print("STDOUT:", stdout.read().decode()) + print("STDERR:", stderr.read().decode()) + if exit_status != 0: + print(f"Error executing {cmd}") + +ssh.close() +print("Deployment complete!") diff --git a/deploy_bosgame.sh b/deploy_bosgame.sh new file mode 100644 index 0000000..e2a26c4 --- /dev/null +++ b/deploy_bosgame.sh @@ -0,0 +1,19 @@ +#!/bin/bash +set -e + +echo "Pulling latest code..." +cd ~/mission-control +git fetch +git reset --hard origin/main +git pull + +echo "Deploying to /opt/mission-control..." +rsync -a --exclude='.git' --exclude='.venv' --exclude='__pycache__' --exclude='*.pyc' ~/mission-control/ /opt/mission-control/ + +echo "Configuring sudoers..." +echo 'Tu77ceu2zzvx!' | sudo -S bash -c "echo 'hitonabi ALL=(ALL) NOPASSWD: /usr/bin/systemctl restart mission-control, /usr/bin/systemctl restart llama-swap, /usr/bin/journalctl' > /etc/sudoers.d/mission-control && chmod 440 /etc/sudoers.d/mission-control" + +echo "Restarting service..." +echo 'Tu77ceu2zzvx!' | sudo -S systemctl restart mission-control + +echo "Deployment complete." diff --git a/fit.py b/fit.py new file mode 100644 index 0000000..a5a49a7 --- /dev/null +++ b/fit.py @@ -0,0 +1,778 @@ +import re + +from services.hwfit.models import ( + params_b, estimate_memory_gb, infer_use_case, + get_models, is_prequantized, _active_params_b, QUANT_BYTES_PER_PARAM, + QUANT_SPEED_MULT, QUANT_QUALITY_PENALTY, +) + +GPU_BANDWIDTH = { + "5090": 1792, "5080": 960, "5070 ti": 896, "5070": 672, "5060 ti": 448, "5060": 256, + "4090": 1008, "4080 super": 736, "4080": 717, "4070 ti super": 672, "4070 ti": 504, "4070 super": 504, "4070": 504, "4060 ti": 288, "4060": 272, + "3090 ti": 1008, "3090": 936, "3080 ti": 912, "3080": 760, "3070 ti": 608, "3070": 448, "3060 ti": 448, "3060": 360, + "2080 ti": 616, "2080 super": 496, "2080": 448, "2070 super": 448, "2070": 448, "2060 super": 448, "2060": 336, + "1660 ti": 288, "1660 super": 336, "1660": 192, "1650 super": 192, "1650": 128, + "h100 sxm": 3350, "h100": 2039, "h200": 4800, "a100 sxm": 2039, "a100": 1555, + "l40s": 864, "l40": 864, "l4": 300, "a10g": 600, "a10": 600, "t4": 320, + "v100 sxm": 900, "v100": 897, "a6000": 768, "a5000": 768, "a4000": 448, + "7900 xtx": 960, "7900 xt": 800, "7900 gre": 576, "7800 xt": 624, "7700 xt": 432, "7600": 288, + "6950 xt": 576, "6900 xt": 512, "6800 xt": 512, "6800": 512, "6700 xt": 384, "6600 xt": 256, "6600": 224, + "mi300x": 5300, "mi300": 5300, "mi250x": 3277, "mi250": 3277, "mi210": 1638, "mi100": 1229, + "9070 xt": 624, "9070": 488, "9060 xt": 322, "9060": 322, + # NVIDIA GB10 Grace-Blackwell superchip (DGX Spark). Unified LPDDR5X memory, + # not Apple Silicon, so it lives in the generic GPU table — the Apple-only + # lookup never matches it (its name carries no "apple"). + "gb10": 273, +} + +# Pre-sort keys by length descending for correct substring matching +_BW_KEYS_SORTED = sorted(GPU_BANDWIDTH.keys(), key=len, reverse=True) + +# Apple Silicon unified-memory bandwidth (GB/s). For chip families with both +# binned and full variants under the same "Apple Mx Max" brand string, prefer +# GPU core count when hardware detection provides it; otherwise fall back to the +# conservative tier so speed estimates do not over-promise. +APPLE_BANDWIDTH_FIXED = { + "m1 ultra": 800, "m1 max": 400, "m1 pro": 200, "m1": 68, + "m2 ultra": 800, "m2 max": 400, "m2 pro": 200, "m2": 100, + "m3 ultra": 800, "m3 pro": 150, "m3": 100, + "m4 pro": 273, "m4": 120, + "m5 pro": 307, "m5": 153, +} +APPLE_BANDWIDTH_BY_CORES = { + "m3 max": {30: 300, 40: 400}, + "m4 max": {32: 410, 40: 546}, + "m5 max": {32: 460, 40: 614}, +} +_APPLE_FIXED_KEYS_SORTED = sorted(APPLE_BANDWIDTH_FIXED.keys(), key=len, reverse=True) +_APPLE_VARIANT_KEYS_SORTED = sorted(APPLE_BANDWIDTH_BY_CORES.keys(), key=len, reverse=True) + +# metal: backstop for Apple Silicon chips not in the explicit tables above +# (e.g. a future M6) — use a conservative generic estimate when unknown. +FALLBACK_K = {"cuda": 220, "rocm": 180, "metal": 150, "cpu_x86": 70, "cpu_arm": 90} + +USE_CASE_WEIGHTS = { + "general": (0.45, 0.30, 0.15, 0.10), + "coding": (0.50, 0.20, 0.15, 0.15), + "reasoning": (0.55, 0.15, 0.15, 0.15), + "chat": (0.40, 0.35, 0.15, 0.10), + "multimodal": (0.50, 0.20, 0.15, 0.15), + "embedding": (0.30, 0.40, 0.20, 0.10), + "tts": (0.40, 0.35, 0.15, 0.10), + "stt": (0.40, 0.35, 0.15, 0.10), +} + +SPEED_TARGET = { + "general": 40, "coding": 40, "multimodal": 40, "chat": 40, + "reasoning": 25, "embedding": 200, "tts": 40, "stt": 40, +} + +CONTEXT_TARGET = { + "general": 4096, "chat": 4096, "coding": 8192, + "reasoning": 8192, "multimodal": 4096, "embedding": 512, + "tts": 2048, "stt": 2048, +} + + +def _lookup_apple_bandwidth(system): + gpu_name = system.get("gpu_name") + if not isinstance(gpu_name, str) or not gpu_name: + return None + gn = gpu_name.lower() + + # Guard against false matches on non-Apple GPUs whose names contain + # "m3"/"m4"/"m5" (e.g. NVIDIA Quadro M4 000). + if "apple" not in gn: + return None + + raw_cores = system.get("gpu_cores") + try: + gpu_cores = int(raw_cores) if raw_cores is not None else None + except (TypeError, ValueError): + gpu_cores = None + + for key in _APPLE_VARIANT_KEYS_SORTED: + if key not in gn: + continue + if gpu_cores in APPLE_BANDWIDTH_BY_CORES[key]: + return APPLE_BANDWIDTH_BY_CORES[key][gpu_cores] + return min(APPLE_BANDWIDTH_BY_CORES[key].values()) + + for key in _APPLE_FIXED_KEYS_SORTED: + if key in gn: + return APPLE_BANDWIDTH_FIXED[key] + return None + + +def _lookup_bandwidth(system): + if isinstance(system, dict): + gpu_name = system.get("gpu_name") + else: + gpu_name = system + + if not isinstance(gpu_name, str) or not gpu_name: + return None + + # Apple tiers live only in the Apple-specific table now (#2564), so route + # BOTH dict and bare-string callers through it. A bare string carries no + # gpu_cores, so the helper falls back to the conservative (lowest) tier for + # that model -- before #2564 the generic table answered string lookups, and + # dropping that made _lookup_bandwidth("Apple M3 Max") return None. + apple_input = system if isinstance(system, dict) else {"gpu_name": gpu_name} + bw = _lookup_apple_bandwidth(apple_input) + if bw is not None: + return bw + + gn = gpu_name.lower() + for key in _BW_KEYS_SORTED: + if key in gn: + return GPU_BANDWIDTH[key] + return None + + +def _canonical_cpu_backend(system): + """Return the canonical CPU backend for cpu_only speed estimation. + + Normalizes CPU-architecture aliases separately from the GPU backend, and + overrides GPU-only backends (CUDA/ROCm/Metal) so they do not inherit a + discrete-GPU fallback constant when the model is actually running on CPU. + """ + backend = (system.get("backend") or "").lower().strip() + cpu_arch = (system.get("cpu_arch") or "").lower().strip() + cpu_name = (system.get("cpu_name") or "").lower() + gpu_name = (system.get("gpu_name") or "").lower() + + # Already-canonical CPU backends + if backend in ("cpu_x86", "cpu_arm"): + return backend + + # Raw CPU-architecture aliases. Treat plain "arm" as 32-bit ARM, not the + # ARM64-class CPU fallback used for Apple Silicon/aarch64 machines. + if backend in ("x86_64", "amd64", "i386", "i686"): + return "cpu_x86" + if backend in ("arm64", "aarch64"): + return "cpu_arm" + + # Prefer an explicit CPU architecture field when present + if cpu_arch: + if cpu_arch in ("x86_64", "amd64", "x86", "i386", "i686"): + return "cpu_x86" + if cpu_arch in ("arm64", "aarch64"): + return "cpu_arm" + + # Apple Silicon enters ranking as backend="metal"; its CPU path is ARM. + if backend in ("metal", "mps", "apple") or "apple" in cpu_name or "apple" in gpu_name: + return "cpu_arm" + + # Conservative default for CUDA/ROCm/discrete GPU backends and unknowns. + return "cpu_x86" + + +def _estimate_speed(model, quant, run_mode, system, offload_frac=0.0): + """Estimate tok/s. Uses active params for MoE (only active experts run per token). + + offload_frac (0..1): fraction of the model's weights that spill to system RAM + (CPU) because they don't fit VRAM. Generation reads every active weight per + token, so when part lives in CPU RAM the per-token time is dominated by the + slow path. We model effective bandwidth as a blend of GPU VRAM bandwidth and + system-RAM bandwidth weighted by what's where — far more accurate than a flat + "halve it" for partial offload, which under/over-shoots depending on amount. + Calibrated against a measured RX 9060 XT: DeepSeek-Coder-V2-Lite Q4_K_M with + light offload → ~59 t/s est vs 59.8 measured. + """ + pb = _active_params_b(model) + is_moe = model.get("is_moe", False) + bw = _lookup_bandwidth(system) + backend = system.get("backend", "cpu_x86") + + # CPU-only inference must never inherit a GPU backend's fallback constant, + # even if the detected system happens to report a CUDA/Metal/ROCm backend. + if run_mode == "cpu_only": + backend = _canonical_cpu_backend(system) + + if bw and run_mode in ("gpu", "cpu_offload"): + bpp = QUANT_BYTES_PER_PARAM.get(quant, 0.5) + model_gb = pb * bpp + if model_gb <= 0: + return 0.0 + efficiency = 0.55 + if run_mode == "cpu_offload": + # Dual-channel DDR4-3200 ≈ 50 GB/s; DDR5 systems higher, but be + # conservative since offloaded MoE is also compute-bound on CPU. + cpu_bw = 55.0 + frac = min(max(offload_frac, 0.0), 1.0) + # If we don't know the fraction (legacy callers pass 0 with + # cpu_offload), assume a meaningful spill so we don't overestimate. + if frac <= 0.0: + frac = 0.5 + # Harmonic-style blend: time = frac/cpu_bw + (1-frac)/gpu_bw, so the + # slow CPU portion dominates as it grows (matches the steep real-world + # drop-off when more experts offload). + eff_bw = 1.0 / (frac / cpu_bw + (1.0 - frac) / bw) + raw_tps = (eff_bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) + # Fully on GPU. + raw_tps = (bw / model_gb) * efficiency + return raw_tps * (0.8 if is_moe else 1.0) + + k = FALLBACK_K.get(backend, 70) + if pb <= 0: + return 0.0 + sm = QUANT_SPEED_MULT.get(quant, 1.0) + return k / pb * sm + + +def _architecture_bonus(model): + name = (model.get("name") or "").lower() + arch = (model.get("architecture") or "").lower() + text = f"{name} {arch}" + + # Keep this intentionally small: hardware fit and speed still matter, but + # current model families should not be scored the same as older Qwen2/LLama + # era entries just because the parameter count is similar. + if "qwen3.6" in text or "qwen3_6" in text: + return 9 + if "qwen3.5" in text or "qwen3_5" in text: + return 8 + if "qwen3-next" in text or "qwen3_next" in text: + return 6 + if "qwen3" in text or arch.startswith("qwen3"): + return 4 + if "qwen2.5" in text or "qwen2_5" in text: + return 2 + return 0 + + +def _quality_score(model, quant, use_case): + pb = params_b(model) + if pb < 1: + base = 30 + elif pb < 3: + base = 45 + elif pb < 7: + base = 60 + elif pb < 10: + base = 75 + elif pb < 20: + base = 82 + elif pb < 40: + base = 89 + else: + base = 95 + + name_lower = model.get("name", "").lower() + if "qwen" in name_lower: + base += 2 + if "deepseek" in name_lower: + base += 3 + if "llama" in name_lower: + base += 2 + if "mistral" in name_lower or "mixtral" in name_lower: + base += 1 + if "gemma" in name_lower: + base += 1 + + base += _architecture_bonus(model) + base += QUANT_QUALITY_PENALTY.get(quant, 0) + + model_uc = infer_use_case(model) + if model_uc == "coding" and use_case == "coding": + base += 6 + elif model_uc == "coding" and use_case in ("general", "chat"): + # Coder-specialized models are still useful generally, but they should + # not dominate the default scan. If the user wants code, the Coding + # filter gives them the boost above. + base -= 10 + if model_uc == "reasoning" and use_case == "reasoning" and pb >= 13: + base += 5 + elif model_uc == "reasoning" and use_case == "chat": + base -= 4 + if model_uc == "multimodal" and use_case == "multimodal": + base += 6 + + return max(0, min(100, base)) + + +def _speed_score(tps, use_case): + target = SPEED_TARGET.get(use_case, 40) + return max(0, min(100, (tps / target) * 100)) + + +def _fit_score(required, available): + if required > available: + return 0 + if available <= 0: + return 0 + ratio = required / available + if ratio <= 0.5: + return 60 + (ratio / 0.5) * 40 + if ratio <= 0.8: + return 100 + if ratio <= 0.9: + return 70 + return 50 + + +def _context_score(ctx, use_case): + target = CONTEXT_TARGET.get(use_case, 4096) + if ctx >= target: + return 100 + if ctx >= target / 2: + return 70 + return 30 + + +def _try_quant_at(model, quant, ctx, gpu_vram, available_ram): + """Try a specific quant at a given context. Returns (run_mode, quant, ctx, mem) or None.""" + mem = estimate_memory_gb(model, quant, ctx) + if gpu_vram > 0 and mem <= gpu_vram: + return "gpu", quant, ctx, mem + if gpu_vram > 0 and mem <= available_ram: + return "cpu_offload", quant, ctx, mem + if gpu_vram <= 0 and mem <= available_ram: + return "cpu_only", quant, ctx, mem + # Try halving context + cur_ctx = ctx // 2 + while cur_ctx >= 1024: + mem = estimate_memory_gb(model, quant, cur_ctx) + if gpu_vram > 0 and mem <= gpu_vram: + return "gpu", quant, cur_ctx, mem + if mem <= available_ram: + return ("cpu_offload" if gpu_vram > 0 else "cpu_only"), quant, cur_ctx, mem + cur_ctx //= 2 + return None + + +def _quant_bits(q): + """Approximate bit-width of a quant label so GGUF quant tiers (Q4/Q8/…) can + be matched against prequantized formats (AWQ 4, AWQ-8bit, FP8, GPTQ-4bit…). + Returns 0 when unknown (caller treats unknown as "don't filter").""" + qu = (q or "").upper().replace("-", "").replace("_", "").replace(" ", "") + # GGUF k-quants + float formats + if qu.startswith("Q8") or "FP8" in qu or "INT8" in qu or qu.startswith("W8"): + return 8 + if qu.startswith("Q4") or qu.startswith("IQ4") or "FP4" in qu or "NF4" in qu or "INT4" in qu or qu.startswith("W4"): + return 4 + if qu.startswith("Q2") or qu.startswith("IQ2"): + return 2 + if qu.startswith("Q3") or qu.startswith("IQ3"): + return 3 + if qu.startswith("Q5"): + return 5 + if qu.startswith("Q6"): + return 6 + if qu.startswith("F16") or qu.startswith("BF16") or qu.startswith("F32"): + return 16 + # Prequantized formats: pull the bit-width digit (AWQ4 / AWQ4BIT / GPTQ8 / 4BIT / INT8 ...) + m = re.search(r"(?:AWQ|GPTQ|MLX|EXL2|BNB|INT|W)(\d{1,2})", qu) or re.search(r"(\d{1,2})BIT", qu) + if m: + b = int(m.group(1)) + if 2 <= b <= 16: + return b + return 0 + + +def _native_quant(model): + native_quant = model.get("quantization", "Q4_K_M") + name = (model.get("name") or "").lower() + fmt = (model.get("format") or "").lower() + text = f"{name} {fmt}" + if "nvfp4" in text: + return "NVFP4" + if re.search(r"(^|[-_/])fp8($|[-_/\s])", text): + return "FP8" + if "gptq" in text: + m = re.search(r"(?:gptq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text) + # Canonical catalog label is "GPTQ-Int4"/"GPTQ-Int8" (see models.py + # QUANT_BPP / QUANT_QUALITY_PENALTY keys); "GPTQ-4bit" misses both + # maps, so BPP and the quality penalty silently fall to defaults. + return f"GPTQ-Int{m.group(1)}" if m else "GPTQ-Int4" + if "awq" in text: + m = re.search(r"(?:awq|int|w)(?:[-_]?)(\d{1,2})(?:bit)?", text) + # Catalog keys are "AWQ-4bit"/"AWQ-8bit"; bare "AWQ" misses the maps. + return f"AWQ-{m.group(1)}bit" if m else "AWQ-4bit" + if "mlx" in text: + m = re.search(r"mlx[-_]?(\d{1,2})bit", text) + return f"mlx-{m.group(1)}bit" if m else native_quant + if not (model.get("is_gguf") or model.get("gguf_sources")) and re.search(r"(^|[-_/])(?:int)?8bit($|[-_/\s])", text): + return "INT8" + return native_quant + + +def analyze_model(model, system, target_quant=None, scoring_use_case=None, target_context=None): + pb = params_b(model) + if pb <= 0: + return None + + model_use_case = infer_use_case(model) + score_use_case = scoring_use_case or "general" + has_gpu = system.get("has_gpu", False) + gpu_vram = (system.get("gpu_vram_gb") or 0) if has_gpu else 0 + gpu_count = system.get("gpu_count", 1) or 1 + single_gpu_vram = gpu_vram / gpu_count if gpu_count > 1 else gpu_vram + available_ram = system.get("available_ram_gb", 0) + # When the user has explicitly picked a GPU config (not RAM mode), they want + # to see what runs ON the GPU(s) — not big models that only "fit" by spilling + # most layers to system RAM. Zeroing the offload budget makes _try_quant_at + # take only its GPU branches (fit on VRAM, shrinking context if needed), + # otherwise return None. Fixes "96 GB GPU still lists a 175 GB model". + gpu_only = bool(system.get("gpu_only")) and has_gpu and gpu_vram > 0 + eff_ram = 0 if gpu_only else available_ram + is_moe = model.get("is_moe", False) + model_ctx = model.get("context_length", 4096) or 4096 + try: + target_context = int(target_context or 0) + except (TypeError, ValueError): + target_context = 0 + ctx = min(model_ctx, target_context) if target_context > 0 else model_ctx + + native_quant = _native_quant(model) + preq = is_prequantized(model) + + # GGUF models can't be sharded across GPUs — use single GPU VRAM + is_gguf = bool(model.get("gguf_sources")) + quant_upper = (native_quant or "").upper() + is_gguf_quant = any(quant_upper.startswith(p) for p in ("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ", "F16", "F32")) + # Single-GPU VRAM only applies to GGUF/dense builds (llama.cpp can't shard + # across GPUs). Prequantized formats (AWQ/GPTQ/FP8) are served sharded by + # vLLM across all GPUs, so they get the FULL multi-GPU VRAM — even when the + # model also lists a GGUF alternate download (gguf_sources). + if (is_gguf or is_gguf_quant) and not preq: + effective_vram = single_gpu_vram + else: + effective_vram = gpu_vram + + native_gpu_only = preq and not native_quant.startswith("mlx-") + + # Determine which quant to evaluate at + native_quant_prefixes = ( + "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4", + "INT4", "INT8", "W4A16", "W8A8", "W8A16", + ) + + if preq: + # Native HF/vLLM quantized repos come at a fixed format. If the user + # picked a GGUF quant tier (Q4/Q8/etc.), do not treat same-bit + # AWQ/GPTQ/FP8/FP4 builds as equivalent; those formats are separate + # serving paths and only appear when explicitly selected or unfiltered. + if target_quant: + if not any(target_quant.startswith(p) for p in native_quant_prefixes): + return None + _tb, _nb = _quant_bits(target_quant), _quant_bits(native_quant) + if _tb and _nb and _tb != _nb: + return None + quant_to_try = native_quant + elif target_quant: + # User picked a specific quant + quant_to_try = target_quant + elif gpu_count >= 2: + # Multi-GPU box: vLLM/SGLang can't serve GGUF Q* quants (those are + # llama.cpp-only). Default non-prequantized models to BF16 so the row + # is meaningful on a multi-GPU rig. If BF16 doesn't fit, the model + # surfaces as too_tight — better than showing a Q4 row the user + # can't actually serve with vLLM on >1 GPU. + quant_to_try = "BF16" + else: + # Default: Q4_K_M (user's stated preference) — kept for single-GPU + # and RAM modes where llama.cpp serving is the natural path. + quant_to_try = "Q4_K_M" + + # Multi-GPU filter: skip the row if the resolved quant is a GGUF tier + # (Q*/IQ-prefixed) — vLLM/SGLang can't serve those, so showing them on + # a 2+ GPU rig just clutters the list with unservable candidates. + if gpu_count >= 2 and quant_to_try and not target_quant and quant_to_try.upper().startswith(("Q2", "Q3", "Q4", "Q5", "Q6", "Q8", "IQ")): + return None + + result = _try_quant_at(model, quant_to_try, ctx, effective_vram, 0 if native_gpu_only else eff_ram) + + if result is None: + # Model doesn't fit on the user's current hardware. Surface it + # anyway with a "too_tight" badge instead of silently dropping + # it — without this, editing the hardware config to try LARGER + # tiers never revealed the bigger models, because they were + # filtered out before the user could see what would fit. The + # client already knows how to render too_tight (red row). + oversized_required = estimate_memory_gb(model, quant_to_try, ctx) + return { + "name": model.get("name"), + "provider": model.get("provider"), + "parameter_count": model.get("parameter_count"), + "params_b": round(pb, 1), + "is_moe": is_moe, + "use_case": model_use_case, + "fit_level": "too_tight", + "run_mode": "no_fit", + "quant": quant_to_try, + "context": ctx, + "required_gb": round(oversized_required, 1), + "speed_tps": 0, + "score": 0, + "scores": {"quality": 0, "speed": 0, "fit": 0, "context": 0}, + "gguf_sources": model.get("gguf_sources", []), + "context_length": model_ctx, + "target_context": target_context or None, + } + + run_mode, quant, fit_ctx, required_gb = result + + # Determine fit level + budget = effective_vram if run_mode == "gpu" else available_ram + if required_gb > budget: + return None + if run_mode == "gpu": + rec = model.get("recommended_ram_gb") or required_gb + if rec <= gpu_vram: + fit_level = "perfect" + elif gpu_vram >= required_gb * 1.2: + fit_level = "good" + else: + fit_level = "marginal" + elif run_mode == "cpu_offload": + fit_level = "good" if available_ram >= required_gb * 1.2 else "marginal" + else: + fit_level = "marginal" + + # Fraction of the model that spills to CPU RAM (drives the offload speed + # model). When offloading, anything beyond the GPU's VRAM lives in system RAM. + offload_frac = 0.0 + if run_mode == "cpu_offload" and required_gb > 0 and effective_vram > 0: + offload_frac = max(0.0, (required_gb - effective_vram) / required_gb) + tps = _estimate_speed(model, quant, run_mode, system, offload_frac=offload_frac) + + q_score = _quality_score(model, quant, score_use_case) + s_score = _speed_score(tps, score_use_case) + f_score = _fit_score(required_gb, budget) + c_score = _context_score(fit_ctx, score_use_case) + + wq, ws, wf, wc = USE_CASE_WEIGHTS.get(score_use_case, (0.45, 0.30, 0.15, 0.10)) + composite = q_score * wq + s_score * ws + f_score * wf + c_score * wc + + return { + "name": model.get("name"), + "provider": model.get("provider"), + "parameter_count": model.get("parameter_count"), + "params_b": round(pb, 1), + "is_moe": is_moe, + "use_case": model_use_case, + "fit_level": fit_level, + "run_mode": run_mode, + "quant": quant, + "context": fit_ctx, + "required_gb": round(required_gb, 1), + "speed_tps": round(tps, 1), + "score": round(composite, 1), + "scores": { + "quality": round(q_score, 1), + "speed": round(s_score, 1), + "fit": round(f_score, 1), + "context": round(c_score, 1), + }, + "gguf_sources": model.get("gguf_sources", []), + "context_length": model_ctx, + "release_date": model.get("release_date", ""), + "target_context": target_context or None, + } + + +def _version_key(name): + """Parse the model's version number from its display name so equal-score + rows can break ties in favor of the newer release (e.g. M2.7 > M2.5). + Returns a float; 0.0 for names with no recognizable version. The regex + grabs the FIRST 'word-with-digits' pattern after a hyphen/underscore, + so e.g. 'MiniMax-M2.7' -> 2.7, 'Qwen3.6-35B' -> 3.6, 'M2' -> 2.0.""" + import re as _re + if not name: + return 0.0 + # Match the version-marker word: a letter followed by a number with + # optional decimal, e.g. M2.7, V4, Pro3. Take the first hit; ignore + # "B" param-count suffixes (Qwen3-235B should yield 3, not 235). + for m in _re.finditer(r"[A-Za-z](\d+(?:\.\d+)?)(?![A-Za-z])", name): + val = m.group(1) + # Skip param-count tokens (e.g. "235B" gives "235" but the next + # char would be "B" — already excluded by the negative lookahead). + try: + f = float(val) + except ValueError: + continue + # Heuristic: bare integers >= 100 are almost certainly param counts + # (1B/3B/8B/70B/235B…), not version numbers. Skip them. + if "." not in val and f >= 100: + continue + return f + return 0.0 + + +SORT_KEYS = { + # Score sort with version-aware tiebreaker — when two rows tie on + # composite score (a common case for the SAME base model in different + # versions, e.g. MiniMax-M2.5 vs M2.7 both at the same FP8 budget), + # prefer the newer version. Without this, ties resolved to whatever + # order they came out of the registry, which let older releases land + # above newer ones in user-facing lists. + "score": lambda r: (r["score"], _version_key(r.get("name") or "")), + "speed": lambda r: r["speed_tps"], + "vram": lambda r: r["required_gb"], + "params": lambda r: r["params_b"], + "context": lambda r: r["context"], + # Newest first. release_date is an ISO-ish string ("2026-05-30"); plain + # string sort is chronological. Missing dates sort last (empty < any date, + # and we sort reverse=True for newest, so "" lands at the bottom). + "newest": lambda r: r.get("release_date") or "", +} + + +def rank_models(system, use_case=None, limit=50, search=None, sort="score", quant=None, target_context=None, fit_only=False): + """Rank all models against detected hardware. Returns sorted list of fit results. + + fit_only: when True, drop rows whose fit_level is "too_tight" (model doesn't + actually fit on the chosen budget). When False (default), every model is + shown — sorting by Param means highest-param PERIOD, even ones that won't + run, so the user can see the truth. + """ + models = get_models() + results = [] + + # Include image gen models only when explicitly filtered + if use_case == "image_gen": + try: + from services.hwfit.image_models import rank_image_models + except ImportError: + rank_image_models = None + if rank_image_models: + img_results = rank_image_models(system, search=search) + else: + img_results = [] + for im in img_results: + fit_map = {"perfect": "perfect", "good": "good", "tight": "marginal", "no_fit": "too_tight", "no_gpu": "too_tight"} + results.append({ + "name": im["id"], + "provider": im["provider"], + "parameter_count": f"{im['params_b']}B", + "params_b": im["params_b"], + "is_moe": False, + "use_case": "image_gen", + "fit_level": fit_map.get(im["fit"], "too_tight"), + "run_mode": "gpu" if im["fits"] else "no_fit", + "quant": im.get("quant", "BF16"), + "context": 0, + "context_length": 0, + "required_gb": round(im.get("vram_needed") or 0, 1), + "speed_tps": 0, + "score": float(im["score"]), + "scores": {"quality": float(im["quality"]), "speed": float(im["speed"]), "fit": 0, "context": 0}, + "gguf_sources": [], + "is_image_gen": True, + "capabilities": im.get("capabilities", []), + "description": im.get("description", ""), + }) + if use_case == "image_gen": + sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"]) + results.sort(key=sort_fn, reverse=True) # see main path below + return results[:limit] + + # If user picked a native prequantized format, filter to only those models. + filter_native = quant and any(quant.startswith(p) for p in ( + "AWQ-", "GPTQ-", "FP8", "FP4", "NVFP4", "MXFP4", "NF4", + "INT4", "INT8", "W4A16", "W8A8", "W8A16", + )) + + system_backend = (system.get("backend") or "").lower() + apple_silicon = system_backend in ("mps", "metal", "apple") + rocm = system_backend == "rocm" + is_windows = system.get("platform") == "windows" + + # Consumer AMD Radeon (RDNA, gfx10/11/12): the practical local serving path + # is GGUF via llama.cpp. vLLM/SGLang on ROCm are validated for datacenter + # Instinct (CDNA, gfx9xx) but are unreliable on consumer RDNA — AWQ kernels + # are largely unsupported there and FP8 needs out-of-tree patches. So treat + # consumer RDNA like Apple Silicon (GGUF-only) and leave CDNA untouched. + # Unknown family (no rocminfo) is left untouched to avoid hiding models from + # a possibly-capable Instinct box on a misdetect. + gpu_family = (system.get("gpu_family") or "").lower() + consumer_amd = system_backend == "rocm" and gpu_family == "rdna" + + for m in models: + native_q = _native_quant(m) + + # MLX needs the mlx_lm runtime, which Odysseus does not generate serve + # commands for. Hide it on every backend, including Metal. + if native_q.startswith("mlx-") or "mlx" in (m.get("name") or "").lower(): + continue + + # ROCm support for vLLM/SGLang quantized safetensors is too brittle to + # recommend blindly in the default scan. Keep AWQ/GPTQ/FP8 discoverable + # only when the user explicitly picks that format from the quant filter; + # otherwise prefer GGUF/Q* entries that Odysseus can route through + # llama.cpp/Ollama without pretending "fits VRAM" means "servable". + if rocm and is_prequantized(m) and not filter_native: + continue + + # On Apple Silicon the only serving engines are llama.cpp and Ollama, + # both GGUF-only (vLLM/SGLang are CUDA/ROCm and don't run on macOS). So + # a model is Metal-servable ONLY if it ships a real GGUF. Drop everything + # else — raw safetensors repos (which the catalog still tags with a + # default GGUF quant) and vLLM-only AWQ/GPTQ/FP8 builds alike. Without + # this the Cookbook recommends models the Mac can't run; on CUDA these + # stay visible because vLLM serves safetensors directly. + # + # Consumer AMD (RDNA) is the same story: GGUF via llama.cpp is the + # servable path, so a model needs a real GGUF to be recommended. + # Otherwise the Cookbook rates vLLM-only AWQ/GPTQ builds "GOOD" on a + # Radeon that can't actually serve them. + # + # Windows is the same: Odysseus only supports llama.cpp on Windows, + # which requires GGUF. vLLM/SGLang are explicitly blocked, so AWQ/GPTQ + # models without a GGUF source are unservable there. + if (apple_silicon or consumer_amd or is_windows) and not (m.get("is_gguf") or m.get("gguf_sources")): + continue + + # Format filter: AWQ tab -> only AWQ models, FP4 tab -> FP4-family models, etc. + if filter_native: + if quant == "FP8" and native_q != "FP8": + continue + if quant == "FP4" and native_q not in ("FP4", "NVFP4", "MXFP4", "NF4"): + continue + if quant.startswith("AWQ") and not native_q.startswith("AWQ"): + continue + if quant.startswith("GPTQ") and not native_q.startswith("GPTQ"): + continue + if quant.startswith("NVFP4") and not native_q.startswith("NVFP4"): + continue + if quant in ("INT4", "INT8", "W4A16", "W8A8", "W8A16") and native_q != quant: + continue + + if search: + name = m.get("name", "").lower() + provider = m.get("provider", "").lower() + if search.lower() not in name and search.lower() not in provider: + continue + + result = analyze_model(m, system, target_quant=quant, scoring_use_case=(use_case or "general"), target_context=target_context) + if result is None: + continue + + if use_case: + model_uc = infer_use_case(m) + if use_case != model_uc and use_case != "general": + continue + + results.append(result) + + # Pick the visible SET by the REQUESTED column. Per-user feedback: sorting + # by Param should show the highest-param models PERIOD, not just those that + # already fit. Same for every other column. Models that don't fit are still + # in the list with their fit_level marking the constraint, so the user can + # see the truth instead of a quietly-truncated view. Score sort is unchanged + # (it's the default ranking and naturally pushes non-fits to the bottom). + if fit_only: + # Hide rows that definitely don't fit (the "too_tight" badge) — user + # explicitly asked for a Fit-only view. + results = [r for r in results if r.get("fit_level") != "too_tight"] + sort_fn = SORT_KEYS.get(sort, SORT_KEYS["score"]) + # Always sort descending then truncate top-N so each column shows the + # global highest by that metric. Before, vram was special-cased + # ascending → truncate kept the 50 SMALLEST models and "highest VRAM" + # could never appear, breaking the column-click toggle. + results.sort(key=sort_fn, reverse=True) + results = results[:limit] + return results diff --git a/static/css/base.css b/static/css/base.css index 2e82699..38f291c 100644 --- a/static/css/base.css +++ b/static/css/base.css @@ -20,15 +20,19 @@ /* Schrift */ --mono:ui-monospace,"SF Mono",Menlo,Consolas,"Liberation Mono",monospace; --sans:system-ui,-apple-system,"Segoe UI",Roboto,sans-serif; + /* Typografie Skala */ + --text-xs: 11.5px; --text-sm: 13px; --text-base: 14px; --text-lg: 16px; --text-xl: 20px; --text-2xl: 24px; + /* Spacing Skala */ + --sp-1: 4px; --sp-2: 8px; --sp-3: 12px; --sp-4: 16px; --sp-5: 20px; --sp-6: 24px; --sp-8: 32px; --sp-10: 40px; /* Maße */ - --side:62px; --radius:14px; + --side:62px; --radius:6px; } *{box-sizing:border-box} html,body{height:100%} body{ margin:0;background:var(--bg);color:var(--tx); - font-family:var(--sans);font-size:14.5px;line-height:1.5; + font-family:var(--sans);font-size:var(--text-base);line-height:1.5; -webkit-font-smoothing:antialiased; } a{color:var(--act);text-decoration:none} @@ -41,7 +45,7 @@ a{color:var(--act);text-decoration:none} width:var(--side);flex:0 0 var(--side); background:var(--bg2);border-right:1px solid var(--line); display:flex;flex-direction:column;align-items:center; - padding:14px 0;gap:6px;position:sticky;top:0;height:100vh; + padding:var(--sp-4) 0;gap:var(--sp-2);position:sticky;top:0;height:100vh; } .side-logo{ width:34px;height:34px;border-radius:9px;margin-bottom:10px; @@ -51,7 +55,7 @@ a{color:var(--act);text-decoration:none} .side-nav{display:flex;flex-direction:column;gap:4px;flex:1} .side-foot{margin-top:auto} .nav-item{ - width:40px;height:40px;border-radius:10px;display:grid;place-items:center; + width:40px;height:40px;border-radius:var(--radius);display:grid;place-items:center; color:var(--mut);cursor:pointer;border:1px solid transparent;transition:.15s; } .nav-item:hover{color:var(--tx);background:var(--panel)} @@ -67,50 +71,50 @@ a{color:var(--act);text-decoration:none} .topbar{ position:sticky;top:0;z-index:20; - display:flex;align-items:center;gap:14px;flex-wrap:wrap; - padding:12px 26px;background:rgba(10,13,18,.86);backdrop-filter:blur(8px); + display:flex;align-items:center;gap:var(--sp-4);flex-wrap:wrap; + padding:var(--sp-3) var(--sp-6);background:rgba(10,13,18,.86);backdrop-filter:blur(8px); border-bottom:1px solid var(--line); } .spacer{flex:1} .status-pill{ - display:inline-flex;align-items:center;gap:8px;font-family:var(--mono);font-size:12.5px; - padding:6px 12px;border:1px solid var(--line);border-radius:999px;color:var(--mut);background:var(--panel); + display:inline-flex;align-items:center;gap:var(--sp-2);font-family:var(--mono);font-size:var(--text-xs); + padding:var(--sp-1) var(--sp-3);border:1px solid var(--line);border-radius:999px;color:var(--mut);background:var(--panel); } .dot{width:8px;height:8px;border-radius:50%;background:var(--mut);flex:0 0 auto} .dot.on{background:var(--on);box-shadow:0 0 0 0 rgba(63,185,80,.5);animation:pulse 2.2s infinite} .dot.off{background:var(--err)} @keyframes pulse{0%{box-shadow:0 0 0 0 rgba(63,185,80,.45)}70%{box-shadow:0 0 0 7px rgba(63,185,80,0)}100%{box-shadow:0 0 0 0 rgba(63,185,80,0)}} -.top-stat{font-size:12.5px;color:var(--mut)} +.top-stat{font-size:var(--text-sm);color:var(--mut)} .top-stat b{color:var(--tx);font-family:var(--mono);font-weight:600;margin-left:4px} -.top-clock{font-family:var(--mono);font-size:12.5px;color:var(--act)} +.top-clock{font-family:var(--mono);font-size:var(--text-sm);color:var(--act)} .tokin{ - font-family:var(--mono);font-size:12.5px;background:var(--panel);border:1px solid var(--line); - color:var(--tx);border-radius:8px;padding:7px 10px;width:128px; + font-family:var(--mono);font-size:var(--text-sm);background:var(--panel);border:1px solid var(--line); + color:var(--tx);border-radius:var(--radius);padding:var(--sp-2) var(--sp-3);width:128px; } .tokin:focus{outline:none;border-color:var(--act)} /* ---- Content-Bereich ---- */ -.content{padding:22px 26px 64px;max-width:1300px;margin:0 auto;width:100%} +.content{padding:var(--sp-5) var(--sp-6) var(--sp-10);max-width:1300px;margin:0 auto;width:100%} .view[hidden]{display:none} -.view{display:flex;flex-direction:column;gap:18px} +.view{display:flex;flex-direction:column;gap:var(--sp-4)} /* Raster-Helfer */ -.grid{display:grid;gap:18px} -.grid-3{grid-template-columns:1fr 1fr 1fr} -.grid-2{grid-template-columns:1fr 1fr} +.grid{display:grid;gap:var(--sp-4)} +.grid-3{grid-template-columns:repeat(auto-fit, minmax(300px, 1fr))} +.grid-2{grid-template-columns:repeat(auto-fit, minmax(400px, 1fr))} .kpis{grid-template-columns:repeat(5,1fr)} @media(max-width:1180px){.grid-3{grid-template-columns:1fr 1fr}.kpis{grid-template-columns:repeat(2,1fr)}} @media(max-width:760px){.grid-3,.grid-2,.kpis{grid-template-columns:1fr}} /* Alert-Banner */ .alert{ - margin:14px 26px 0;padding:14px 18px;border-radius:12px; - background:linear-gradient(90deg,rgba(229,83,75,.16),rgba(229,83,75,.04)); - border:1px solid rgba(229,83,75,.32);color:#ffcdc8; - display:flex;align-items:center;gap:12px;font-size:13.5px; + margin:var(--sp-4) var(--sp-6) 0;padding:var(--sp-3) var(--sp-4);border-radius:var(--radius); + background:rgba(229,83,75,.08); + border:1px solid rgba(229,83,75,.2);color:#ffcdc8; + display:flex;align-items:center;gap:var(--sp-3);font-size:var(--text-sm); } .alert .a-dot{width:8px;height:8px;border-radius:50%;background:var(--err);flex:0 0 auto} .alert b{color:#ffe2de} -.alert.warn{background:linear-gradient(90deg,rgba(224,163,46,.15),rgba(224,163,46,.03)); - border-color:rgba(224,163,46,.32);color:#f3dca6} +.alert.warn{background:rgba(224,163,46,.08); + border-color:rgba(224,163,46,.2);color:#f3dca6} .alert.warn .a-dot{background:var(--warn)} diff --git a/static/css/components.css b/static/css/components.css index a300dfc..8784c9c 100644 --- a/static/css/components.css +++ b/static/css/components.css @@ -5,7 +5,7 @@ /* ---- Karte (Grundbaustein) ---- */ .card{ background:var(--panel);border:1px solid var(--line);border-radius:var(--radius); - padding:18px 20px; + padding:var(--sp-3) var(--sp-4); } .card-h{display:flex;align-items:center;gap:10px;margin:0 0 14px} .card-h h3{font-size:15px;font-weight:600;margin:0;flex:1;color:var(--tx)} @@ -14,11 +14,9 @@ /* ---- Hero (Overview-Kopf) ---- */ .hero{ - background: - radial-gradient(120% 140% at 100% 0%,rgba(68,147,224,.10),transparent 60%), - var(--panel); + background:var(--panel); border:1px solid var(--line);border-radius:var(--radius); - padding:26px 28px;display:flex;justify-content:space-between;gap:24px;flex-wrap:wrap; + padding:var(--sp-4) var(--sp-5);display:flex;justify-content:space-between;gap:var(--sp-6);flex-wrap:wrap; } .hero .eyebrow{font-size:11.5px;letter-spacing:.22em;text-transform:uppercase;color:var(--mut)} .hero h1{font-size:26px;font-weight:650;margin:8px 0 6px} @@ -34,24 +32,24 @@ /* ---- KPI-Kacheln ---- */ .kpi{ - position:relative;border-radius:var(--radius);padding:18px 20px; + position:relative;border-radius:var(--radius);padding:var(--sp-3) var(--sp-4); border:1px solid var(--line);background:var(--panel);overflow:hidden; } -.kpi .k-h{display:flex;align-items:flex-start;justify-content:space-between;gap:8px} -.kpi .k-t{font-size:13.5px;color:var(--mut)} +.kpi .k-h{display:flex;align-items:flex-start;justify-content:space-between;gap:var(--sp-2)} +.kpi .k-t{font-size:var(--text-sm);color:var(--mut)} .kpi .k-ic{color:var(--mut);opacity:.85} -.kpi .k-ic svg{width:20px;height:20px} -.kpi .k-v{font-family:var(--mono);font-size:32px;font-weight:600;line-height:1.1;margin:14px 0 4px;color:var(--tx)} -.kpi .k-v small{font-size:15px;color:var(--mut);font-weight:400} -.kpi .k-s{font-size:12px;color:var(--mut)} -/* Farb-Varianten */ -.kpi.green {background:linear-gradient(160deg,var(--t-green),transparent 70%);border-color:var(--b-green)} +.kpi .k-ic svg{width:18px;height:18px} +.kpi .k-v{font-family:var(--mono);font-size:var(--text-2xl);font-weight:600;line-height:1.1;margin:var(--sp-3) 0 var(--sp-1);color:var(--tx)} +.kpi .k-v small{font-size:var(--text-base);color:var(--mut);font-weight:400} +.kpi .k-s{font-size:var(--text-xs);color:var(--mut)} +/* Farb-Varianten (flach) */ +.kpi.green {border-top:2px solid var(--on)} .kpi.green .k-v,.kpi.green .k-ic{color:var(--on)} -.kpi.blue {background:linear-gradient(160deg,var(--t-blue),transparent 70%);border-color:var(--b-blue)} +.kpi.blue {border-top:2px solid var(--act)} .kpi.blue .k-v,.kpi.blue .k-ic{color:var(--act)} -.kpi.purple{background:linear-gradient(160deg,var(--t-purple),transparent 70%);border-color:var(--b-purple)} +.kpi.purple{border-top:2px solid var(--purple)} .kpi.purple .k-v,.kpi.purple .k-ic{color:var(--purple)} -.kpi.red {background:linear-gradient(160deg,var(--t-red),transparent 70%);border-color:var(--b-red)} +.kpi.red {border-top:2px solid var(--err)} .kpi.red .k-v,.kpi.red .k-ic{color:var(--err)} .kpi.muted .k-v{color:var(--dim)} @@ -157,3 +155,36 @@ button:disabled{opacity:.5;cursor:not-allowed} .guide-acc summary::after { content: "▼"; font-size: 10px; color: var(--mut); transition: transform 0.2s; } .guide-acc[open] summary::after { transform: rotate(180deg); } .guide-acc .acc-body { padding: 0 20px 20px 20px; } + +/* ---- Utilities & Layout Helpers ---- */ +.flex { display: flex; } +.flex-col { display: flex; flex-direction: column; } +.items-center { align-items: center; } +.justify-between { justify-content: space-between; } +.gap-2 { gap: var(--sp-2); } +.gap-3 { gap: var(--sp-3); } +.mt-2 { margin-top: var(--sp-2); } +.mt-3 { margin-top: var(--sp-3); } +.mt-4 { margin-top: var(--sp-4); } +.mt-6 { margin-top: var(--sp-6); } +.mb-2 { margin-bottom: var(--sp-2); } +.mb-4 { margin-bottom: var(--sp-4); } +.text-sm { font-size: var(--text-sm); } +.text-xs { font-size: var(--text-xs); } +.text-mut { color: var(--mut); } +.text-act { color: var(--act); } + +/* ---- Interaktive Karten (A11y) ---- */ +.card-btn { + display: block; width: 100%; text-align: left; + background: var(--panel); border: 1px solid var(--line); border-radius: var(--radius); + padding: var(--sp-3) var(--sp-4); cursor: pointer; transition: border-color 0.15s, background 0.15s; + color: var(--tx); font-family: var(--sans); +} +.card-btn:hover, .card-btn:focus { + border-color: var(--act); + background: var(--bg2); + outline: none; +} +.card-btn h3 { margin: 0; font-size: var(--text-lg); font-weight: 600; } +.card-btn p { margin: var(--sp-2) 0 0; font-size: var(--text-sm); color: var(--mut); } diff --git a/static/index.html b/static/index.html index a4b3fe0..00eacf6 100644 --- a/static/index.html +++ b/static/index.html @@ -43,9 +43,9 @@
-
+
-
+
@@ -86,10 +86,10 @@

Einstellungen

-
+
-
Wird für die WebSockets und API Calls genutzt, falls der Server geschützt ist.
+
Wird für die WebSockets und API Calls genutzt, falls der Server geschützt ist.
diff --git a/static/js/panels/models.js b/static/js/panels/models.js index ffb1eb2..33d6138 100644 --- a/static/js/panels/models.js +++ b/static/js/panels/models.js @@ -20,7 +20,7 @@ function mount() { $("#m-table").innerHTML = `

Modelle & Ports

-
+
💡 Modelle werden automatisch geladen, sobald eine Chat-Anfrage an sie gestellt wird. Du musst sie nicht manuell starten.
@@ -36,13 +36,13 @@ function mount() {

Modell konfigurieren

-
+
-
Höhere Werte erlauben längere Dokumente, brauchen aber mehr VRAM.
+
Höhere Werte erlauben längere Dokumente, brauchen aber mehr VRAM.
- +
`; diff --git a/static/js/panels/overview.js b/static/js/panels/overview.js index d5c788d..d14f592 100644 --- a/static/js/panels/overview.js +++ b/static/js/panels/overview.js @@ -29,29 +29,29 @@ function renderHero() { function renderQuickActions() { // 3 Kacheln (Cookbook, Server-Status, Aktivität/Guides) $("#ov-quick").innerHTML = ` -
-
-

Modell finden

- ${icon("book")} +
+

Durchsuche HuggingFace nach neuen Modellen im Cookbook.

+ -
-
-

Live Metriken

- ${icon("pulse")} +
+

${SYS ? `System läuft (RAM: ${SYS.ram.percent.toFixed(0)}%, CPU: ${SYS.cpu.percent.toFixed(0)}%)` : 'Lade Metriken...'}

+ -
-
-

Wartung

- ${icon("server")} +
+

Server neustarten, VRAM leeren oder Engine aktualisieren.

+ `; } @@ -107,9 +107,9 @@ function renderRecentJobs() { ${latest.length ? `
${latest.map(j => ` -
+
-
${esc(j.label)}
+
${esc(j.label)}
${statusBadge(j.state)}
diff --git a/update.tar.gz b/update.tar.gz new file mode 100644 index 0000000..312463b Binary files /dev/null and b/update.tar.gz differ