feat: smart cookbook MVP mit odysseus fit logik

This commit is contained in:
Hitonabi
2026-06-20 23:13:05 +02:00
parent c76bcc7293
commit 0a81a9fe99
4 changed files with 278 additions and 85 deletions
+2 -1
View File
@@ -20,7 +20,7 @@ from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse, JSONResponse from fastapi.responses import FileResponse, JSONResponse
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from routers import jobs, maintenance, models, system from routers import jobs, maintenance, models, system, cookbook
app = FastAPI(title="Mission Control") app = FastAPI(title="Mission Control")
@@ -28,6 +28,7 @@ app.include_router(models.router)
app.include_router(jobs.router) app.include_router(jobs.router)
app.include_router(maintenance.router) app.include_router(maintenance.router)
app.include_router(system.router) app.include_router(system.router)
app.include_router(cookbook.router)
_STATIC = Path(__file__).parent / "static" _STATIC = Path(__file__).parent / "static"
+71
View File
@@ -0,0 +1,71 @@
"""
Extrahierte Mathematik aus dem Odysseus Projekt zur VRAM/RAM Berechnung.
Abgestimmt auf APUs mit Unified Memory (Bosgame M5 / Strix Halo).
"""
# Approximate bytes per parameter for each GGUF quantisation type.
# Keys are upper-case because lookups normalise the quant name with
# ``str.upper()`` before consulting this table.
QUANT_BYTES_PER_PARAM = {
    "Q2_K": 0.35,
    "Q3_K_S": 0.38,
    "Q3_K_M": 0.42,
    "Q3_K_L": 0.45,
    "Q4_0": 0.50,
    "Q4_1": 0.55,
    "Q4_K_S": 0.50,
    "Q4_K_M": 0.55,
    "Q5_0": 0.62,
    "Q5_1": 0.68,
    "Q5_K_S": 0.62,
    "Q5_K_M": 0.65,
    "Q6_K": 0.75,
    "Q8_0": 1.00,
    "F16": 2.00,
    # Alias: quant names parsed from file names can be spelled "FP16".
    # Without this entry they would silently be sized with the generic
    # fallback instead of the 2 bytes a half-precision weight occupies.
    "FP16": 2.00,
    "BF16": 2.00,
    # Full precision: 32-bit floats take 4 bytes per parameter.
    "F32": 4.00,
}
def estimate_memory_gb(params_b: float, quant: str, ctx: int) -> float:
    """Estimate the memory footprint in GB (model weights plus context).

    Args:
        params_b: Parameter count in billions.
        quant: Quantisation name; matched case-insensitively against
            ``QUANT_BYTES_PER_PARAM``.
        ctx: Context length in tokens.

    Returns:
        Estimated size of the weights plus the context buffer, in GB.
    """
    # Unknown formats are sized like Q5_K_M (0.65) to err on the safe side.
    bytes_per_param = QUANT_BYTES_PER_PARAM.get(quant.upper(), 0.65)

    # Context heuristic: an 8k context on a 7B model costs about 0.8 GB.
    # Models below 7B are treated as 7B, so the context share never
    # shrinks below that baseline.
    ctx_factor = ctx / 8192
    size_factor = max(params_b, 7) / 7

    return params_b * bytes_per_param + ctx_factor * size_factor * 0.8
def estimate_speed(req_gb: float, sys_ram_gb: float) -> float:
    """Estimate tokens per second from memory bandwidth and model size.

    Args:
        req_gb: Memory the model needs, in GB.
        sys_ram_gb: Total system RAM in GB; used only to pick the
            bandwidth figure.

    Returns:
        Estimated tokens/s, or ``0.0`` when ``req_gb`` is not positive.
    """
    # More than 8 GB of RAM is taken as the unified-memory APU (273 GB/s);
    # anything else is assumed to have 70 GB/s.
    if sys_ram_gb > 8:
        bandwidth_gbs = 273
    else:
        bandwidth_gbs = 70

    if req_gb <= 0:
        return 0.0

    # Bandwidth divided by model size, scaled by an efficiency factor of 0.55.
    return (bandwidth_gbs / req_gb) * 0.55
def evaluate_fit(params_b: float, quant: str, ctx: int, sys_ram_gb: float) -> dict:
    """Rate how well a model fits a shared-memory (APU) system.

    Args:
        params_b: Parameter count in billions.
        quant: Quantisation name.
        ctx: Context length in tokens.
        sys_ram_gb: Total system RAM in GB.

    Returns:
        A dict with ``level`` ("perfect", "marginal" or "too_tight"),
        a display ``text``, the required memory ``req_gb`` rounded to one
        decimal, and the estimated ``tps`` rounded to a whole number.
    """
    needed_gb = estimate_memory_gb(params_b, quant, ctx)
    tokens_per_s = estimate_speed(needed_gb, sys_ram_gb)

    # Keep 4 GB aside for the OS and other processes.
    budget_gb = max(sys_ram_gb - 4.0, 0)

    if needed_gb > budget_gb:
        verdict = ("too_tight", "Zu groß (OOM)")
    elif needed_gb > budget_gb * 0.8:
        # Fits, but uses more than 80 % of the usable memory.
        verdict = ("marginal", "Könnte knapp werden")
    else:
        verdict = ("perfect", "Passt perfekt")

    level, label = verdict
    return {
        "level": level,
        "text": label,
        "req_gb": round(needed_gb, 1),
        "tps": round(tokens_per_s, 0),
    }
+100
View File
@@ -0,0 +1,100 @@
"""
Cookbook Router: Verbindet die HuggingFace API mit der Odysseus-Hardware-Berechnung.
"""
import httpx
import re
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
import psutil
from auth import auth
from hw_math import evaluate_fit
# All cookbook routes are mounted under /api/cookbook and declare the
# imported ``auth`` callable as a dependency.
router = APIRouter(prefix="/api/cookbook", dependencies=[Depends(auth)])
# Request body for POST /api/cookbook/analyze.
class AnalyzeRequest(BaseModel):
    # Repository identifier; inserted into the HuggingFace tree API URL.
    repo_id: str
    # Context length handed to the fit calculation; defaults to 8192.
    ctx: int = 8192
# Request body for POST /api/cookbook/evaluate; all three fields are
# forwarded unchanged to ``evaluate_fit``.
class EvaluateRequest(BaseModel):
    # Parameter count in billions.
    params_b: float
    # Quantisation name, e.g. "Q4_K_M".
    quant: str
    # Context length.
    ctx: int
def extract_params_b(repo_id: str) -> float:
    """Extract the parameter count (in billions) from a repository name.

    A mixture-of-experts marker such as ``8x7B`` is multiplied out (56).
    Otherwise the first ``<number>B`` token not followed by a letter is
    used, e.g. ``Qwen2.5-Coder-32B`` gives 32. When neither is found the
    function falls back to ``7.0``.
    """
    moe_match = re.search(r"(\d+)x(\d+(?:\.\d+)?)[bB]", repo_id)
    if moe_match is not None:
        experts, per_expert = moe_match.groups()
        return float(experts) * float(per_expert)

    # The lookahead rejects tokens like "4bit", where the "b" starts a word.
    dense_match = re.search(r"(\d+(?:\.\d+)?)[bB](?![a-zA-Z])", repo_id)
    return float(dense_match.group(1)) if dense_match is not None else 7.0
def extract_quant(filename: str) -> str:
    """Extract the quantisation name from a GGUF file name.

    Recognises ``Q<d>_*`` and ``IQ<d>_*`` quants as well as the
    half-precision spellings ``F16``, ``FP16`` and ``BF16``, matched
    case-insensitively. ``FP16`` is reported as ``F16`` so both spellings
    of the same format yield one name.

    Args:
        filename: File name or repository path of the GGUF file.

    Returns:
        The upper-cased quant name, or ``"Q4_K_M"`` when none is found.
    """
    m = re.search(
        # The lookbehind keeps a bare "F16" from matching inside a longer
        # alphanumeric token; "BF16" is still caught by its own alternative.
        r"(Q\d_[A-Z0-9_]+|IQ\d_[A-Z0-9_]+|FP16|BF16|(?<![A-Z0-9])F16)",
        filename,
        re.IGNORECASE,
    )
    if m is None:
        return "Q4_K_M"
    quant = m.group(1).upper()
    return "F16" if quant == "FP16" else quant
# Accepts "name" or "namespace/name" built from letters, digits, "_", "-"
# and ".". Each segment must start with a letter, digit or "_", so values
# such as "../x" or anything containing "?", "#" or extra slashes cannot
# alter the path of the upstream request.
_REPO_ID_RE = re.compile(
    r"[A-Za-z0-9_][A-Za-z0-9._-]*(?:/[A-Za-z0-9_][A-Za-z0-9._-]*)?"
)


def _file_priority(fit_level: str, quant: str) -> int:
    """Score a GGUF file for ordering; only comfortably fitting files score."""
    if fit_level != "perfect":
        return 0
    score = 10
    # Q4_K_M is treated as the sweet spot, then other Q4 variants, then Q5.
    if quant == "Q4_K_M":
        score += 5
    elif quant.startswith("Q4"):
        score += 4
    elif quant.startswith("Q5"):
        score += 3
    return score


@router.post("/analyze")
async def analyze_repo(req: AnalyzeRequest):
    """List a repo's GGUF files from HuggingFace and rate their hardware fit.

    Args:
        req: Repository id and the context length to evaluate.

    Returns:
        A dict with ``repo``, ``params_b``, ``sys_ram_gb`` and ``files``.
        ``files`` is sorted best-first and is empty when the repository's
        top-level listing contains no ``.gguf`` file.

    Raises:
        HTTPException: 400 for an invalid ``repo_id`` or non-positive
            ``ctx``; 404 when HuggingFace reports the repo as missing;
            502 for any other upstream or transport failure.
    """
    # repo_id is untrusted and becomes part of a URL path: validate it first.
    if not _REPO_ID_RE.fullmatch(req.repo_id):
        raise HTTPException(status_code=400, detail="Ungültige Repo-ID")
    # A non-positive context would yield a negative memory estimate.
    if req.ctx <= 0:
        raise HTTPException(status_code=400, detail="ctx muss größer als 0 sein")

    url = f"https://huggingface.co/api/models/{req.repo_id}/tree/main"
    try:
        async with httpx.AsyncClient() as client:
            resp = await client.get(url, timeout=10.0)
            resp.raise_for_status()
            tree = resp.json()
    except httpx.HTTPStatusError as e:
        # Only "not found" is passed through. Other upstream statuses
        # (including 401/403) are reported as a gateway error so they are
        # not mistaken for this API's own auth failures.
        status = 404 if e.response.status_code == 404 else 502
        raise HTTPException(
            status_code=status, detail=f"HuggingFace Fehler: {str(e)}"
        ) from e
    except (httpx.HTTPError, ValueError) as e:
        # Transport errors/timeouts, or a body that is not valid JSON.
        raise HTTPException(
            status_code=502, detail=f"HuggingFace Fehler: {str(e)}"
        ) from e

    if not isinstance(tree, list):
        raise HTTPException(
            status_code=502, detail="HuggingFace Fehler: unerwartete Antwort"
        )

    gguf_files = [
        entry["path"]
        for entry in tree
        if isinstance(entry, dict) and str(entry.get("path", "")).endswith(".gguf")
    ]

    params_b = extract_params_b(req.repo_id)
    # Total system RAM is the budget because the APU shares memory.
    ram_gb = psutil.virtual_memory().total / (1024**3)

    results = []
    for path in gguf_files:
        quant = extract_quant(path)
        fit = evaluate_fit(params_b, quant, req.ctx, ram_gb)
        results.append({
            "filename": path,
            "quant": quant,
            "fit": fit,
            "priority": _file_priority(fit["level"], quant),
        })

    # Highest priority first; ties are broken by estimated speed.
    results.sort(key=lambda x: (x["priority"], x["fit"]["tps"]), reverse=True)

    # The empty case uses the same shape so clients can always read "repo".
    return {
        "repo": req.repo_id,
        "params_b": params_b,
        "sys_ram_gb": round(ram_gb, 1),
        "files": results,
    }
@router.post("/evaluate")
def evaluate_single(req: EvaluateRequest):
    """Rate one parameter/quant/context combination against this machine.

    Total system RAM is read via ``psutil`` and passed to ``evaluate_fit``
    together with the request fields; the resulting dict is returned as is.
    """
    total_ram_gb = psutil.virtual_memory().total / (1024**3)
    return evaluate_fit(req.params_b, req.quant, req.ctx, total_ram_gb)
+93 -72
View File
@@ -37,38 +37,7 @@ const CURATED_MODELS = [
} }
]; ];
function estimateMemoryGB(params_b, quant, ctx) { // Lokale Mathe entfernt. Wir nutzen jetzt das Backend.
const bpp = 0.6;
const weights = params_b * bpp;
const context = (ctx / 8192) * (params_b / 7) * 0.8;
return weights + context;
}
function estimateSpeed(req_gb, vram_gb) {
// Heuristic for speed in tokens/s
// Bosgame APU (Strix Halo) has unified memory with ~273 GB/s bandwidth.
// We approximate bandwidth: if huge VRAM/GTT, it's the APU.
const bw = (vram_gb > 32) ? 250 : 70; // 250 GB/s for APU, 70 GB/s for standard CPU
if (req_gb <= 0) return 0;
return (bw / req_gb) * 0.55; // 55% efficiency
}
function getFit(m, sys) {
const req = estimateMemoryGB(m.params_b, m.quant, m.ctx);
const vram_bytes = (sys?.gpu?.vram?.total || 0) + (sys?.gpu?.gtt?.total || 0);
const vram = vram_bytes / (1024 ** 3);
const ram_bytes = sys?.ram?.total || 0;
const ram_used = sys?.ram?.used || 0;
const ram = ram_bytes / (1024 ** 3);
const freeRam = (ram_bytes - ram_used) / (1024 ** 3);
const tps = estimateSpeed(req, vram);
if (vram === 0 && ram === 0) return { level: "perfect", class: "b-run", text: "Lade...", req, tps };
if (vram > 0 && req <= vram) return { level: "perfect", class: "b-run", text: "Passt in VRAM", req, tps };
if (req <= (vram + freeRam)) return { level: "good", class: "b-load", text: "RAM Offload", req, tps };
return { level: "too_tight", class: "b-err", text: "Zu groß (OOM)", req, tps };
}
let lastSys = null; let lastSys = null;
let currentResults = []; let currentResults = [];
@@ -134,43 +103,58 @@ function mount() {
$("#cb-m-download").addEventListener("click", doDownload); $("#cb-m-download").addEventListener("click", doDownload);
$("#cb-m-files").addEventListener("change", updateLiveFit); $("#cb-m-files").addEventListener("change", updateLiveFit);
$("#cb-m-ctx").addEventListener("input", updateLiveFit); $("#cb-m-ctx").addEventListener("change", reanalyzeCtx);
renderCurated(); renderCurated();
} }
function extractParamsB(name) { // Aktuelle Analyse-Daten vom Backend
const moe = name.match(/(\d+)x(\d+(?:\.\d+)?)[bB]/i); let currentAnalysis = null;
if (moe) return parseInt(moe[1]) * parseFloat(moe[2]);
const m = name.match(/(\d+(?:\.\d+)?)[bB](?![a-zA-Z])/i);
if (m) return parseFloat(m[1]);
return 7; // Fallback
}
function extractQuant(filename) {
const m = filename.match(/(Q\d_[A-Z0-9_]+|IQ\d_[A-Z0-9_]+|FP16|BF16)/i);
return m ? m[1].toUpperCase() : "Q4_K_M";
}
function updateLiveFit() { function updateLiveFit() {
const repo = $("#cb-m-repo").textContent;
const file = $("#cb-m-files").value; const file = $("#cb-m-files").value;
const ctx = parseInt($("#cb-m-ctx").value) || 8192; if (!currentAnalysis || !file) {
if (!repo || !file) {
$("#cb-m-fit-container").style.display = "none"; $("#cb-m-fit-container").style.display = "none";
return; return;
} }
const params_b = extractParamsB(repo); const fData = currentAnalysis.files.find(f => f.filename === file);
const quant = extractQuant(file); if (!fData) return;
const m = { params_b, quant, ctx }; const fit = fData.fit;
const fit = getFit(m, lastSys); const cls = fit.level === "perfect" ? "b-run" : (fit.level === "marginal" ? "b-load" : "b-err");
$("#cb-m-fit-container").style.display = "flex"; $("#cb-m-fit-container").style.display = "flex";
$("#cb-m-fit-text").innerHTML = `Geschätzter Bedarf: <b>~${fit.req.toFixed(1)} GB RAM/VRAM</b> <br><small class="meta">${params_b}B Params · ${quant} · ~${Math.round(fit.tps)} t/s</small>`; $("#cb-m-fit-text").innerHTML = `Geschätzter Bedarf: <b>~${fit.req_gb.toFixed(1)} GB RAM/VRAM</b> <br><small class="meta">${currentAnalysis.params_b}B Params · ${fData.quant} · ~${Math.round(fit.tps)} t/s</small>`;
$("#cb-m-fit-badge").innerHTML = `<span class="badge ${fit.class}">${fit.text}</span>`; $("#cb-m-fit-badge").innerHTML = `<span class="badge ${cls}">${fit.text}</span>`;
// Wenn "too_tight", machen wir den Download-Button gelb zur Warnung, erlauben ihn aber
const btn = $("#cb-m-download");
if (fit.level === "too_tight") {
btn.className = "primary warn";
btn.innerHTML = "Trotzdem herunterladen (OOM Risiko!)";
} else {
btn.className = "primary";
btn.innerHTML = "Herunterladen & Einpflegen";
}
}
async function reanalyzeCtx() {
if (!currentAnalysis) return;
const ctx = parseInt($("#cb-m-ctx").value) || 8192;
const repo = currentAnalysis.repo;
const file = $("#cb-m-files").value;
$("#cb-m-fit-text").innerHTML = "Berechne neues Context-Limit...";
try {
const res = await api("/api/cookbook/analyze", {
method: "POST", body: JSON.stringify({ repo_id: repo, ctx })
});
currentAnalysis = res;
// Auswahl beibehalten
$("#cb-m-files").value = file;
updateLiveFit();
} catch(e) {}
} }
async function doSearch() { async function doSearch() {
@@ -226,27 +210,37 @@ window.openModelModal = async (index) => {
$("#cb-m-files").style.display = "none"; $("#cb-m-files").style.display = "none";
$("#cb-m-loading").style.display = "block"; $("#cb-m-loading").style.display = "block";
$("#cb-m-download").disabled = true;
try { try {
const url = `https://huggingface.co/api/models/${m.id}/tree/main`; const ctx = parseInt($("#cb-m-ctx").value) || 8192;
const r = await fetch(url); const res = await api("/api/cookbook/analyze", {
const tree = await r.json(); method: "POST", body: JSON.stringify({ repo_id: m.id, ctx })
const files = tree.filter(f => f.path.endsWith('.gguf')).map(f => f.path); });
currentAnalysis = res;
$("#cb-m-loading").style.display = "none"; $("#cb-m-loading").style.display = "none";
$("#cb-m-files").style.display = "block"; $("#cb-m-files").style.display = "block";
if (files.length === 0) { if (!res.files || res.files.length === 0) {
$("#cb-m-files").innerHTML = "<option value=''>Keine GGUF-Dateien im Hauptverzeichnis gefunden.</option>"; $("#cb-m-files").innerHTML = "<option value=''>Keine GGUF-Dateien gefunden.</option>";
$("#cb-m-download").disabled = true;
$("#cb-m-fit-container").style.display = "none"; $("#cb-m-fit-container").style.display = "none";
} else { } else {
$("#cb-m-files").innerHTML = files.map(f => `<option value="${esc(f)}">${esc(f)}</option>`).join(""); // Optische Indikatoren im Dropdown
$("#cb-m-files").innerHTML = res.files.map(f => {
let mark = "";
if (f.fit.level === "perfect") mark = "🟢";
else if (f.fit.level === "marginal") mark = "🟡";
else mark = "🔴";
return `<option value="${esc(f.filename)}">${mark} ${esc(f.filename)}</option>`;
}).join("");
$("#cb-m-download").disabled = false; $("#cb-m-download").disabled = false;
updateLiveFit(); updateLiveFit();
} }
} catch(e) { } catch(e) {
$("#cb-m-loading").textContent = "Fehler beim Laden der Dateien."; $("#cb-m-loading").textContent = "Fehler: " + e.message;
} }
}; };
@@ -287,31 +281,46 @@ async function doDownload() {
$("#cb-m-download").textContent = "Herunterladen & Einpflegen"; $("#cb-m-download").textContent = "Herunterladen & Einpflegen";
} }
function renderCurated() { async function renderCurated() {
$("#cb-section-title").textContent = "Kuratierte Empfehlungen"; $("#cb-section-title").textContent = "Kuratierte Empfehlungen";
const grid = $("#cb-grid"); const grid = $("#cb-grid");
if (!grid) return; if (!grid) return;
grid.innerHTML = CURATED_MODELS.map((m, i) => {
const fit = getFit(m, lastSys); grid.innerHTML = "<div class='meta' style='grid-column:1/-1;text-align:center;padding:40px'>Berechne Hardware-Fit für Empfehlungen...</div>";
return `
try {
let html = "";
for (let i = 0; i < CURATED_MODELS.length; i++) {
const m = CURATED_MODELS[i];
const fit = await api("/api/cookbook/evaluate", {
method: "POST", body: JSON.stringify({ params_b: m.params_b, quant: m.quant, ctx: m.ctx })
});
const cls = fit.level === "perfect" ? "b-run" : (fit.level === "marginal" ? "b-load" : "b-err");
html += `
<div class="card" style="display:flex; flex-direction:column; cursor:pointer" onclick="window.openCuratedModal(${i})"> <div class="card" style="display:flex; flex-direction:column; cursor:pointer" onclick="window.openCuratedModal(${i})">
<div style="display:flex; justify-content:space-between; align-items:center;"> <div style="display:flex; justify-content:space-between; align-items:center;">
<h3 style="margin:0; font-size:16px">${esc(m.name)}</h3> <h3 style="margin:0; font-size:16px">${esc(m.name)}</h3>
<span class="badge ${fit.class}">${fit.text}</span> <span class="badge ${cls}">${fit.text}</span>
</div> </div>
<div style="font-size:13px; color:var(--mut); margin-top:12px; flex:1; line-height:1.5;"> <div style="font-size:13px; color:var(--mut); margin-top:12px; flex:1; line-height:1.5;">
${m.desc} ${m.desc}
</div> </div>
<div style="display:flex; justify-content:space-between; margin-top:16px; font-size:12px" class="meta"> <div style="display:flex; justify-content:space-between; margin-top:16px; font-size:12px" class="meta">
<span>~${fit.req.toFixed(1)} GB RAM · ~${Math.round(fit.tps)} t/s</span> <span>~${fit.req_gb.toFixed(1)} GB RAM/VRAM · ~${Math.round(fit.tps)} t/s</span>
<span>${m.quant}</span> <span>${m.quant}</span>
</div> </div>
</div> </div>
`; `;
}).join(""); }
grid.innerHTML = html;
} catch (e) {
grid.innerHTML = `<div class="alert err" style="grid-column:1/-1">Fehler beim Laden der Empfehlungen: ${e.message}</div>`;
}
} }
window.openCuratedModal = (index) => { window.openCuratedModal = async (index) => {
const m = CURATED_MODELS[index]; const m = CURATED_MODELS[index];
if (!m) return; if (!m) return;
$("#cb-modal").style.display = "flex"; $("#cb-modal").style.display = "flex";
@@ -323,7 +332,19 @@ window.openCuratedModal = (index) => {
$("#cb-m-alias").value = m.alias; $("#cb-m-alias").value = m.alias;
$("#cb-m-ctx").value = m.ctx; $("#cb-m-ctx").value = m.ctx;
$("#cb-m-download").disabled = false; $("#cb-m-download").disabled = false;
// Wir nutzen die neue API Struktur auch für das simulierte Modal
try {
const fit = await api("/api/cookbook/evaluate", {
method: "POST", body: JSON.stringify({ params_b: m.params_b, quant: m.quant, ctx: m.ctx })
});
currentAnalysis = {
repo: m.repo,
params_b: m.params_b,
files: [{ filename: m.file, quant: m.quant, fit: fit }]
};
updateLiveFit(); updateLiveFit();
} catch(e) {}
}; };
function onSystem(sys) { function onSystem(sys) {