"""
Memory and throughput estimates for running GGUF models on unified-memory APUs.

The formulas were extracted from the Odysseus project and are tuned for
machines where CPU and GPU share a single memory pool (Bosgame M5 /
Strix Halo). Every number in this module is a rough heuristic, not a
measurement.
"""

# Assumed bytes per parameter for each GGUF quantization type.
QUANT_BYTES_PER_PARAM = {
    "Q2_K": 0.35,
    "Q3_K_S": 0.38,
    "Q3_K_M": 0.42,
    "Q3_K_L": 0.45,
    "Q4_0": 0.50,
    "Q4_1": 0.55,
    "Q4_K_S": 0.50,
    "Q4_K_M": 0.55,
    "Q5_0": 0.62,
    "Q5_1": 0.68,
    "Q5_K_S": 0.62,
    "Q5_K_M": 0.65,
    "Q6_K": 0.75,
    "Q8_0": 1.00,
    "F16": 2.00,
    "BF16": 2.00,
    "F32": 4.00,
}

# Alternative spellings found in file names, mapped to their table key.
# Without this, "FP16" would miss the table and be sized like a 5-bit quant.
_QUANT_ALIASES = {
    "FP16": "F16",
    "FP32": "F32",
}

# Used for unknown formats: deliberately the Q5_K_M value, to stay on the
# cautious side of the common 4-bit quants.
_DEFAULT_BYTES_PER_PARAM = 0.65

# Context heuristic: 8k tokens on a 7B model cost roughly 0.8 GB.
_CTX_REFERENCE_TOKENS = 8192
_CTX_REFERENCE_PARAMS_B = 7
_CTX_REFERENCE_GB = 0.8

# Bandwidth assumptions in GB/s: Strix Halo unified memory vs. a small box.
_APU_BANDWIDTH_GBPS = 273
_FALLBACK_BANDWIDTH_GBPS = 70
_SMALL_SYSTEM_RAM_GB = 8
_BANDWIDTH_EFFICIENCY = 0.55

# RAM kept free for the OS and other processes, and the share of the
# remaining RAM above which a fit is reported as merely "marginal".
_OS_RESERVE_GB = 4.0
_MARGINAL_FRACTION = 0.8


def estimate_memory_gb(params_b: float, quant: str, ctx: int) -> float:
    """Estimate the memory needed for weights plus context, in GB.

    Args:
        params_b: Model size in billions of parameters. Negative values
            are treated as 0.
        quant: GGUF quantization name, case-insensitive. "FP16"/"FP32"
            are accepted as aliases of "F16"/"F32"; unknown names use the
            Q5_K_M value (0.65 bytes per parameter).
        ctx: Context length in tokens. Negative values are treated as 0.

    Returns:
        The estimated requirement in GB (never negative).
    """
    key = quant.strip().upper()
    key = _QUANT_ALIASES.get(key, key)
    bytes_per_param = QUANT_BYTES_PER_PARAM.get(key, _DEFAULT_BYTES_PER_PARAM)

    # Clamp so that bad input cannot produce a negative requirement, which
    # would otherwise be reported as a perfect fit.
    params_b = max(params_b, 0.0)
    ctx = max(ctx, 0)

    weights_gb = params_b * bytes_per_param

    # Context cost scales linearly with context length and with model size,
    # but models below 7B are charged like a 7B model.
    size_factor = max(params_b, _CTX_REFERENCE_PARAMS_B) / _CTX_REFERENCE_PARAMS_B
    context_gb = (ctx / _CTX_REFERENCE_TOKENS) * size_factor * _CTX_REFERENCE_GB

    return weights_gb + context_gb


def estimate_speed(req_gb: float, sys_ram_gb: float) -> float:
    """Estimate generation speed in tokens per second.

    The model is assumed to be memory-bandwidth bound: every token reads
    the whole model once, at 55 % of the nominal bandwidth.

    Args:
        req_gb: Memory footprint of the model in GB.
        sys_ram_gb: Total system RAM in GB. Systems with more than 8 GB
            are assumed to be the 273 GB/s APU, others get 70 GB/s.

    Returns:
        Estimated tokens per second, or 0.0 when ``req_gb`` is not positive.
    """
    if req_gb <= 0:
        return 0.0

    if sys_ram_gb > _SMALL_SYSTEM_RAM_GB:
        bandwidth = _APU_BANDWIDTH_GBPS
    else:
        bandwidth = _FALLBACK_BANDWIDTH_GBPS

    return (bandwidth / req_gb) * _BANDWIDTH_EFFICIENCY


def evaluate_fit(params_b: float, quant: str, ctx: int, sys_ram_gb: float) -> dict:
    """Classify how well a model fits into a shared-memory (APU) system.

    Args:
        params_b: Model size in billions of parameters.
        quant: GGUF quantization name.
        ctx: Context length in tokens.
        sys_ram_gb: Total system RAM in GB.

    Returns:
        A dict with the keys ``level`` ("perfect", "marginal" or
        "too_tight"), ``text`` (user-facing label), ``req_gb`` (requirement
        rounded to one decimal) and ``tps`` (tokens/s rounded to a whole
        number, still a float).
    """
    req_gb = estimate_memory_gb(params_b, quant, ctx)
    tps = estimate_speed(req_gb, sys_ram_gb)

    # The OS and other processes need RAM too, so a fixed buffer is held back.
    usable_ram = max(sys_ram_gb - _OS_RESERVE_GB, 0)

    if req_gb > usable_ram:
        fit_level = "too_tight"
        text = "Zu groß (OOM)"
    elif req_gb > usable_ram * _MARGINAL_FRACTION:
        fit_level = "marginal"
        text = "Könnte knapp werden"
    else:
        fit_level = "perfect"
        text = "Passt perfekt"

    return {
        "level": fit_level,
        "text": text,
        "req_gb": round(req_gb, 1),
        "tps": round(tps, 0),
    }
# Quantization tag inside a GGUF file name. The float tags also accept the
# llama.cpp spelling without "P" (f16 / f32); those two require a
# non-alphanumeric character in front so they do not fire inside other tokens.
_QUANT_PATTERN = re.compile(
    r"(Q\d_[A-Z0-9_]+|IQ\d_[A-Z0-9_]+|FP16|BF16|FP32|(?<![A-Z0-9])F(?:16|32))",
    re.IGNORECASE,
)

# Spellings normalised to the key used by the bytes-per-parameter table.
_QUANT_SPELLINGS = {"FP16": "F16", "FP32": "F32"}


def extract_quant(filename: str) -> str:
    """Extract the quantization tag from a GGUF file name.

    Recognises ``Q<d>_*`` and ``IQ<d>_*`` tags as well as the float formats
    F16/FP16, BF16 and F32/FP32, case-insensitively. The first tag found
    wins.

    Args:
        filename: File name or repository path of the GGUF file.

    Returns:
        The upper-cased tag, with "FP16"/"FP32" normalised to "F16"/"F32".
        When no tag is found, "Q4_K_M" is returned as the default guess.
    """
    found = _QUANT_PATTERN.search(filename)
    if found is None:
        return "Q4_K_M"
    tag = found.group(1).upper()
    return _QUANT_SPELLINGS.get(tag, tag)
+ priority = 0 + if fit["level"] == "perfect": + priority += 10 + if quant == "Q4_K_M": priority += 5 + elif quant.startswith("Q4"): priority += 4 + elif quant.startswith("Q5"): priority += 3 + + results.append({ + "filename": f, + "quant": quant, + "fit": fit, + "priority": priority + }) + + # Sortieren: Highest priority first, dann nach tps (schnellste zuerst) + results.sort(key=lambda x: (x["priority"], x["fit"]["tps"]), reverse=True) + + return { + "repo": req.repo_id, + "params_b": params_b, + "sys_ram_gb": round(ram_gb, 1), + "files": results + } + +@router.post("/evaluate") +def evaluate_single(req: EvaluateRequest): + ram_gb = psutil.virtual_memory().total / (1024**3) + fit = evaluate_fit(req.params_b, req.quant, req.ctx, ram_gb) + return fit + diff --git a/static/js/panels/cookbook.js b/static/js/panels/cookbook.js index e6a7957..5e2fc89 100644 --- a/static/js/panels/cookbook.js +++ b/static/js/panels/cookbook.js @@ -37,38 +37,7 @@ const CURATED_MODELS = [ } ]; -function estimateMemoryGB(params_b, quant, ctx) { - const bpp = 0.6; - const weights = params_b * bpp; - const context = (ctx / 8192) * (params_b / 7) * 0.8; - return weights + context; -} - -function estimateSpeed(req_gb, vram_gb) { - // Heuristic for speed in tokens/s - // Bosgame APU (Strix Halo) has unified memory with ~273 GB/s bandwidth. - // We approximate bandwidth: if huge VRAM/GTT, it's the APU. - const bw = (vram_gb > 32) ? 
250 : 70; // 250 GB/s for APU, 70 GB/s for standard CPU - if (req_gb <= 0) return 0; - return (bw / req_gb) * 0.55; // 55% efficiency -} - -function getFit(m, sys) { - const req = estimateMemoryGB(m.params_b, m.quant, m.ctx); - const vram_bytes = (sys?.gpu?.vram?.total || 0) + (sys?.gpu?.gtt?.total || 0); - const vram = vram_bytes / (1024 ** 3); - const ram_bytes = sys?.ram?.total || 0; - const ram_used = sys?.ram?.used || 0; - const ram = ram_bytes / (1024 ** 3); - const freeRam = (ram_bytes - ram_used) / (1024 ** 3); - - const tps = estimateSpeed(req, vram); - - if (vram === 0 && ram === 0) return { level: "perfect", class: "b-run", text: "Lade...", req, tps }; - if (vram > 0 && req <= vram) return { level: "perfect", class: "b-run", text: "Passt in VRAM", req, tps }; - if (req <= (vram + freeRam)) return { level: "good", class: "b-load", text: "RAM Offload", req, tps }; - return { level: "too_tight", class: "b-err", text: "Zu groß (OOM)", req, tps }; -} +// Lokale Mathe entfernt. Wir nutzen jetzt das Backend. let lastSys = null; let currentResults = []; @@ -134,43 +103,58 @@ function mount() { $("#cb-m-download").addEventListener("click", doDownload); $("#cb-m-files").addEventListener("change", updateLiveFit); - $("#cb-m-ctx").addEventListener("input", updateLiveFit); + $("#cb-m-ctx").addEventListener("change", reanalyzeCtx); renderCurated(); } -function extractParamsB(name) { - const moe = name.match(/(\d+)x(\d+(?:\.\d+)?)[bB]/i); - if (moe) return parseInt(moe[1]) * parseFloat(moe[2]); - const m = name.match(/(\d+(?:\.\d+)?)[bB](?![a-zA-Z])/i); - if (m) return parseFloat(m[1]); - return 7; // Fallback -} - -function extractQuant(filename) { - const m = filename.match(/(Q\d_[A-Z0-9_]+|IQ\d_[A-Z0-9_]+|FP16|BF16)/i); - return m ? 
m[1].toUpperCase() : "Q4_K_M"; -} +// Aktuelle Analyse-Daten vom Backend +let currentAnalysis = null; function updateLiveFit() { - const repo = $("#cb-m-repo").textContent; const file = $("#cb-m-files").value; - const ctx = parseInt($("#cb-m-ctx").value) || 8192; - - if (!repo || !file) { + if (!currentAnalysis || !file) { $("#cb-m-fit-container").style.display = "none"; return; } - const params_b = extractParamsB(repo); - const quant = extractQuant(file); + const fData = currentAnalysis.files.find(f => f.filename === file); + if (!fData) return; - const m = { params_b, quant, ctx }; - const fit = getFit(m, lastSys); + const fit = fData.fit; + const cls = fit.level === "perfect" ? "b-run" : (fit.level === "marginal" ? "b-load" : "b-err"); $("#cb-m-fit-container").style.display = "flex"; - $("#cb-m-fit-text").innerHTML = `Geschätzter Bedarf: ~${fit.req.toFixed(1)} GB RAM/VRAM
${params_b}B Params · ${quant} · ~${Math.round(fit.tps)} t/s`; - $("#cb-m-fit-badge").innerHTML = `${fit.text}`; + $("#cb-m-fit-text").innerHTML = `Geschätzter Bedarf: ~${fit.req_gb.toFixed(1)} GB RAM/VRAM
${currentAnalysis.params_b}B Params · ${fData.quant} · ~${Math.round(fit.tps)} t/s`; + $("#cb-m-fit-badge").innerHTML = `${fit.text}`; + + // Wenn "too_tight", machen wir den Download-Button gelb zur Warnung, erlauben ihn aber + const btn = $("#cb-m-download"); + if (fit.level === "too_tight") { + btn.className = "primary warn"; + btn.innerHTML = "Trotzdem herunterladen (OOM Risiko!)"; + } else { + btn.className = "primary"; + btn.innerHTML = "Herunterladen & Einpflegen"; + } +} + +async function reanalyzeCtx() { + if (!currentAnalysis) return; + const ctx = parseInt($("#cb-m-ctx").value) || 8192; + const repo = currentAnalysis.repo; + const file = $("#cb-m-files").value; + + $("#cb-m-fit-text").innerHTML = "Berechne neues Context-Limit..."; + try { + const res = await api("/api/cookbook/analyze", { + method: "POST", body: JSON.stringify({ repo_id: repo, ctx }) + }); + currentAnalysis = res; + // Auswahl beibehalten + $("#cb-m-files").value = file; + updateLiveFit(); + } catch(e) {} } async function doSearch() { @@ -226,27 +210,37 @@ window.openModelModal = async (index) => { $("#cb-m-files").style.display = "none"; $("#cb-m-loading").style.display = "block"; + $("#cb-m-download").disabled = true; try { - const url = `https://huggingface.co/api/models/${m.id}/tree/main`; - const r = await fetch(url); - const tree = await r.json(); - const files = tree.filter(f => f.path.endsWith('.gguf')).map(f => f.path); + const ctx = parseInt($("#cb-m-ctx").value) || 8192; + const res = await api("/api/cookbook/analyze", { + method: "POST", body: JSON.stringify({ repo_id: m.id, ctx }) + }); + + currentAnalysis = res; $("#cb-m-loading").style.display = "none"; $("#cb-m-files").style.display = "block"; - if (files.length === 0) { - $("#cb-m-files").innerHTML = ""; - $("#cb-m-download").disabled = true; + if (!res.files || res.files.length === 0) { + $("#cb-m-files").innerHTML = ""; $("#cb-m-fit-container").style.display = "none"; } else { - $("#cb-m-files").innerHTML = files.map(f 
=> ``).join(""); + // Optische Indikatoren im Dropdown + $("#cb-m-files").innerHTML = res.files.map(f => { + let mark = ""; + if (f.fit.level === "perfect") mark = "🟢"; + else if (f.fit.level === "marginal") mark = "🟡"; + else mark = "🔴"; + return ``; + }).join(""); + $("#cb-m-download").disabled = false; updateLiveFit(); } } catch(e) { - $("#cb-m-loading").textContent = "Fehler beim Laden der Dateien."; + $("#cb-m-loading").textContent = "Fehler: " + e.message; } }; @@ -287,31 +281,46 @@ async function doDownload() { $("#cb-m-download").textContent = "Herunterladen & Einpflegen"; } -function renderCurated() { +async function renderCurated() { $("#cb-section-title").textContent = "Kuratierte Empfehlungen"; const grid = $("#cb-grid"); if (!grid) return; - grid.innerHTML = CURATED_MODELS.map((m, i) => { - const fit = getFit(m, lastSys); - return ` -
-
-

${esc(m.name)}

- ${fit.text} + + grid.innerHTML = "
Berechne Hardware-Fit für Empfehlungen...
"; + + try { + let html = ""; + for (let i = 0; i < CURATED_MODELS.length; i++) { + const m = CURATED_MODELS[i]; + const fit = await api("/api/cookbook/evaluate", { + method: "POST", body: JSON.stringify({ params_b: m.params_b, quant: m.quant, ctx: m.ctx }) + }); + + const cls = fit.level === "perfect" ? "b-run" : (fit.level === "marginal" ? "b-load" : "b-err"); + + html += ` +
+
+

${esc(m.name)}

+ ${fit.text} +
+
+ ${m.desc} +
+
+ ~${fit.req_gb.toFixed(1)} GB RAM/VRAM · ~${Math.round(fit.tps)} t/s + ${m.quant} +
-
- ${m.desc} -
-
- ~${fit.req.toFixed(1)} GB RAM · ~${Math.round(fit.tps)} t/s - ${m.quant} -
-
- `; - }).join(""); + `; + } + grid.innerHTML = html; + } catch (e) { + grid.innerHTML = `
Fehler beim Laden der Empfehlungen: ${e.message}
`; + } } -window.openCuratedModal = (index) => { +window.openCuratedModal = async (index) => { const m = CURATED_MODELS[index]; if (!m) return; $("#cb-modal").style.display = "flex"; @@ -323,7 +332,19 @@ window.openCuratedModal = (index) => { $("#cb-m-alias").value = m.alias; $("#cb-m-ctx").value = m.ctx; $("#cb-m-download").disabled = false; - updateLiveFit(); + + // Wir nutzen die neue API Struktur auch für das simulierte Modal + try { + const fit = await api("/api/cookbook/evaluate", { + method: "POST", body: JSON.stringify({ params_b: m.params_b, quant: m.quant, ctx: m.ctx }) + }); + currentAnalysis = { + repo: m.repo, + params_b: m.params_b, + files: [{ filename: m.file, quant: m.quant, fit: fit }] + }; + updateLiveFit(); + } catch(e) {} }; function onSystem(sys) {