perf: improve streaming generation pipeline

Add CUDA inference hot-path optimizations, safer attention fallback handling, and generation profiling hooks. Improve SSE streaming, browser buffering telemetry, and playback recovery while preserving default audio quality settings.
This commit is contained in:
2026-04-30 18:54:14 +01:00
parent a39ec536fd
commit 75b84b211b
9 changed files with 459 additions and 48 deletions
+6 -1
View File
@@ -1,5 +1,8 @@
import { NextRequest, NextResponse } from "next/server";
export const dynamic = "force-dynamic";
export const runtime = "nodejs";
export async function POST(request: NextRequest) {
const pythonServerUrl = process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";
@@ -24,6 +27,7 @@ export async function POST(request: NextRequest) {
cfg_scale: body.cfg_scale ?? 1.5,
inference_steps: body.inference_steps ?? 10,
}),
signal: request.signal,
});
if (!upstream.ok) {
@@ -36,8 +40,9 @@ export async function POST(request: NextRequest) {
status: 200,
headers: {
"Content-Type": "text/event-stream",
- "Cache-Control": "no-cache",
+ "Cache-Control": "no-cache, no-transform",
"Connection": "keep-alive",
"X-Content-Type-Options": "nosniff",
"X-Accel-Buffering": "no",
},
});