Create VibePod TTS podcast generator application

Agent-Logs-Url: https://github.com/JezzWTF/vibepod/sessions/a78fcf03-e979-4777-a428-18cc8eccc095

Co-authored-by: LyAhn <27559362+LyAhn@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-04-27 15:41:46 +00:00
committed by GitHub
parent ee85bece74
commit 3974a4cf69
26 changed files with 3083 additions and 0 deletions
+41
View File
@@ -0,0 +1,41 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
# dependencies
/node_modules
/.pnp
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/versions
# testing
/coverage
# next.js
/.next/
/out/
# production
/build
# misc
.DS_Store
*.pem
# debug
npm-debug.log*
yarn-debug.log*
yarn-error.log*
.pnpm-debug.log*
# env files (can opt-in for committing if needed)
.env*
# vercel
.vercel
# typescript
*.tsbuildinfo
next-env.d.ts
+36
View File
@@ -0,0 +1,36 @@
This is a [Next.js](https://nextjs.org) project bootstrapped with [`create-next-app`](https://nextjs.org/docs/app/api-reference/cli/create-next-app).
## Getting Started
First, run the development server:
```bash
npm run dev
# or
yarn dev
# or
pnpm dev
# or
bun dev
```
Open [http://localhost:3000](http://localhost:3000) with your browser to see the result.
You can start editing the page by modifying `app/page.tsx`. The page auto-updates as you edit the file.
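This project currently renders text with the system font stacks defined in `app/globals.css`. If you want a custom typeface, [`next/font`](https://nextjs.org/docs/app/building-your-application/optimizing/fonts) can automatically optimize and load fonts such as [Geist](https://vercel.com/font), a font family from Vercel.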
## Learn More
To learn more about Next.js, take a look at the following resources:
- [Next.js Documentation](https://nextjs.org/docs) - learn about Next.js features and API.
- [Learn Next.js](https://nextjs.org/learn) - an interactive Next.js tutorial.
You can check out [the Next.js GitHub repository](https://github.com/vercel/next.js) - your feedback and contributions are welcome!
## Deploy on Vercel
The easiest way to deploy your Next.js app is to use the [Vercel Platform](https://vercel.com/new?utm_medium=default-template&filter=next.js&utm_source=create-next-app&utm_campaign=create-next-app-readme) from the creators of Next.js.
Check out our [Next.js deployment documentation](https://nextjs.org/docs/app/building-your-application/deploying) for more details.
+55
View File
@@ -0,0 +1,55 @@
import { NextRequest, NextResponse } from "next/server";
/**
 * POST /api/generate
 *
 * Validates the JSON payload, forwards it to the VibeVoice server's
 * `/generate` endpoint and relays the returned bytes to the caller.
 *
 * Expected body: `{ text: string, cfg_scale?: number, inference_steps?: number }`.
 * Missing (or `null`) tuning values fall back to 2.5 and 20 respectively.
 *
 * Responses:
 * - 200: the upstream body, served as an `audio/wav` attachment
 * - 400: body is not valid JSON / not an object, `text` is missing or blank,
 *        or a tuning value is present but not a finite number
 * - upstream status: the VibeVoice server answered with a non-2xx status
 * - 502: the VibeVoice server could not be reached or its body could not be read
 */
export async function POST(request: NextRequest) {
  // A malformed body is the client's fault; report it as 400 rather than
  // letting it fall through to the "upstream unreachable" 502 path.
  let body: unknown;
  try {
    body = await request.json();
  } catch {
    return NextResponse.json(
      { error: "Request body must be valid JSON" },
      { status: 400 }
    );
  }

  // `null`, numbers, strings, ... are valid JSON but cannot carry our fields.
  if (typeof body !== "object" || body === null) {
    return NextResponse.json(
      { error: "Request body must be a JSON object" },
      { status: 400 }
    );
  }

  const { text, cfg_scale, inference_steps } = body as Record<string, unknown>;

  if (typeof text !== "string" || text.trim().length === 0) {
    return NextResponse.json(
      { error: "Missing or empty text field" },
      { status: 400 }
    );
  }

  // Tuning values are optional, but when supplied they must be real numbers
  // so that garbage is rejected here instead of being forwarded upstream.
  const isOptionalNumber = (value: unknown): value is number | null | undefined =>
    value == null || (typeof value === "number" && Number.isFinite(value));

  if (!isOptionalNumber(cfg_scale) || !isOptionalNumber(inference_steps)) {
    return NextResponse.json(
      { error: "cfg_scale and inference_steps must be finite numbers when provided" },
      { status: 400 }
    );
  }

  const pythonServerUrl =
    process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";

  try {
    const upstream = await fetch(`${pythonServerUrl}/generate`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({
        text: text.trim(),
        cfg_scale: cfg_scale ?? 2.5,
        inference_steps: inference_steps ?? 20,
      }),
    });

    if (!upstream.ok) {
      // Surface the upstream error text and keep its status code.
      const errorText = await upstream.text().catch(() => "Unknown error");
      return NextResponse.json(
        { error: `VibeVoice server error: ${errorText}` },
        { status: upstream.status }
      );
    }

    const audioBuffer = await upstream.arrayBuffer();

    return new NextResponse(audioBuffer, {
      status: 200,
      headers: {
        "Content-Type": "audio/wav",
        "Content-Disposition": 'attachment; filename="vibepod-output.wav"',
        "Cache-Control": "no-store",
      },
    });
  } catch (err) {
    // Network-level failure talking to the VibeVoice server.
    const message =
      err instanceof Error ? err.message : "Failed to connect to VibeVoice server";
    return NextResponse.json({ error: message }, { status: 502 });
  }
}
+20
View File
@@ -0,0 +1,20 @@
import { NextResponse } from "next/server";
/**
 * GET /api/health
 *
 * Probes the VibeVoice server's `/health` endpoint (4 second timeout) and
 * reports `{ status: "online" }` when it answers with a 2xx status, or
 * `{ status: "offline" }` for any other status or any fetch failure.
 */
export async function GET() {
  const baseUrl = process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";

  let reachable = false;
  try {
    const probe = await fetch(`${baseUrl}/health`, {
      method: "GET",
      signal: AbortSignal.timeout(4000),
    });
    reachable = probe.ok;
  } catch {
    // Timeout, refused connection, DNS failure, ... all count as offline.
    reachable = false;
  }

  return NextResponse.json({ status: reachable ? "online" : "offline" });
}
Binary file not shown.

After

Width:  |  Height:  |  Size: 25 KiB

+87
View File
@@ -0,0 +1,87 @@
@import "tailwindcss";
/* Design tokens: colours and font stacks that the rules below and the
   components reference through var(--…). */
:root {
--background: #0d1117;
--foreground: #e2e8f0;
--card-bg: #161b22;
--border: #21262d;
--accent-teal: #2dd4bf;
--accent-violet: #a78bfa;
--accent-teal-dim: #0d9488;
--accent-violet-dim: #7c3aed;
--muted: #64748b;
--success: #22c55e;
--error: #ef4444;
--font-sans: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
--font-mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, "Liberation Mono", monospace;
}
/* Maps the :root tokens onto Tailwind theme variables. */
/* NOTE(review): --font-sans and --font-mono are assigned var() references to
   variables of the same name; confirm this resolves to the :root font stacks
   as intended rather than forming a cycle. */
@theme inline {
--color-background: var(--background);
--color-foreground: var(--foreground);
--font-sans: var(--font-sans);
--font-mono: var(--font-mono);
}
/* Page defaults: token-driven background, text colour and font, and a body
   that is at least one viewport tall. */
body {
background: var(--background);
color: var(--foreground);
font-family: var(--font-sans);
min-height: 100vh;
}
/* Custom scrollbar: these rules use the ::-webkit-scrollbar pseudo-elements
   only, giving a 6px bar whose track, thumb and hovered thumb are coloured
   from the card, border and muted tokens. */
::-webkit-scrollbar {
width: 6px;
height: 6px;
}
::-webkit-scrollbar-track {
background: var(--card-bg);
}
::-webkit-scrollbar-thumb {
background: var(--border);
border-radius: 3px;
}
::-webkit-scrollbar-thumb:hover {
background: var(--muted);
}
/* Range input styling.
   The native appearance is removed and the track/thumb are redrawn for both
   WebKit (::-webkit-slider-*) and Gecko (::-moz-range-*).
   The thumb colour can be overridden per slider by setting the --thumb-color
   custom property on the input; it falls back to the teal accent. */
input[type="range"] {
  -webkit-appearance: none;
  appearance: none;
  background: transparent;
  cursor: pointer;
}

input[type="range"]::-webkit-slider-runnable-track {
  background: var(--border);
  height: 4px;
  border-radius: 2px;
}

input[type="range"]::-webkit-slider-thumb {
  -webkit-appearance: none;
  appearance: none;
  width: 16px;
  height: 16px;
  border-radius: 50%;
  background: var(--thumb-color, var(--accent-teal));
  /* Centre the 16px thumb on the 4px track: (4 - 16) / 2 = -6px. */
  margin-top: -6px;
  box-shadow: 0 0 6px rgba(45, 212, 191, 0.4);
  transition: box-shadow 0.15s ease;
}

input[type="range"]:hover::-webkit-slider-thumb {
  box-shadow: 0 0 10px rgba(45, 212, 191, 0.7);
}

input[type="range"]::-moz-range-track {
  background: var(--border);
  height: 4px;
  border-radius: 2px;
}

input[type="range"]::-moz-range-thumb {
  width: 16px;
  height: 16px;
  border-radius: 50%;
  background: var(--thumb-color, var(--accent-teal));
  border: none;
  box-shadow: 0 0 6px rgba(45, 212, 191, 0.4);
  transition: box-shadow 0.15s ease;
}

/* Same hover glow as the WebKit thumb, for Gecko. */
input[type="range"]:hover::-moz-range-thumb {
  box-shadow: 0 0 10px rgba(45, 212, 191, 0.7);
}
+21
View File
@@ -0,0 +1,21 @@
import type { Metadata } from "next";
import "./globals.css";
/** Document metadata (title and description) exported from the root layout. */
export const metadata: Metadata = {
title: "VibePod — TTS Podcast Generator",
description: "Generate podcast audio using Microsoft VibeVoice 0.5B",
};
/**
 * Root layout: wraps every page in the `<html lang="en">` / `<body>` shell
 * and applies the background and foreground colour tokens to the body.
 */
export default function RootLayout(
  props: Readonly<{
    children: React.ReactNode;
  }>
) {
  const bodyStyle = {
    background: "var(--background)",
    color: "var(--foreground)",
  };

  return (
    <html lang="en">
      <body style={bodyStyle}>{props.children}</body>
    </html>
  );
}
+168
View File
@@ -0,0 +1,168 @@
"use client";
import { useReducer, useCallback } from "react";
import Header from "@/components/Header";
import TextInputPanel from "@/components/TextInputPanel";
import GenerationControls from "@/components/GenerationControls";
import AudioPlayer from "@/components/AudioPlayer";
import StatusLog from "@/components/StatusLog";
/** Everything the home page keeps in its reducer. */
interface AppState {
  script: string;
  cfgScale: number;
  inferenceSteps: number;
  isGenerating: boolean;
  audioUrl: string | null;
  logs: string[];
}

/** Actions understood by {@link appReducer}. */
type AppAction =
  | { type: "SET_SCRIPT"; payload: string }
  | { type: "SET_CFG_SCALE"; payload: number }
  | { type: "SET_INFERENCE_STEPS"; payload: number }
  | { type: "START_GENERATION" }
  | { type: "GENERATION_SUCCESS"; payload: string }
  | { type: "GENERATION_ERROR"; payload: string }
  | { type: "ADD_LOG"; payload: string };

/**
 * Pure state transition for the home page.
 *
 * - `SET_*` actions overwrite the matching field.
 * - `START_GENERATION` flags a run in progress and clears the previous audio
 *   URL and log lines.
 * - `GENERATION_SUCCESS` stores the new audio URL and ends the run.
 * - `GENERATION_ERROR` only ends the run; its payload is not stored.
 * - `ADD_LOG` appends one line to the log.
 * - Any other action returns the state object unchanged.
 */
function appReducer(state: AppState, action: AppAction): AppState {
  if (action.type === "SET_SCRIPT") {
    return { ...state, script: action.payload };
  }
  if (action.type === "SET_CFG_SCALE") {
    return { ...state, cfgScale: action.payload };
  }
  if (action.type === "SET_INFERENCE_STEPS") {
    return { ...state, inferenceSteps: action.payload };
  }
  if (action.type === "START_GENERATION") {
    return { ...state, isGenerating: true, audioUrl: null, logs: [] };
  }
  if (action.type === "GENERATION_SUCCESS") {
    return { ...state, isGenerating: false, audioUrl: action.payload };
  }
  if (action.type === "GENERATION_ERROR") {
    return { ...state, isGenerating: false };
  }
  if (action.type === "ADD_LOG") {
    return { ...state, logs: [...state.logs, action.payload] };
  }
  return state;
}
// Initial reducer state: empty script, default tuning values (CFG scale 2.5,
// 20 inference steps), no generation running, no audio and an empty log.
const initialState: AppState = {
script: "",
cfgScale: 2.5,
inferenceSteps: 20,
isGenerating: false,
audioUrl: null,
logs: [],
};
/**
 * Home page: script editor, generation controls, status log and, once audio
 * has been generated, the audio player.
 *
 * Generation posts the script and tuning values to `/api/generate`, turns the
 * returned bytes into a blob URL and hands that URL to the player.
 */
export default function HomePage() {
  const [state, dispatch] = useReducer(appReducer, initialState);

  const trimmedScript = state.script.trim();
  const wordCount = trimmedScript === "" ? 0 : trimmedScript.split(/\s+/).length;

  const addLog = useCallback((msg: string) => {
    dispatch({ type: "ADD_LOG", payload: msg });
  }, []);

  const handleGenerate = useCallback(async () => {
    if (!state.script.trim() || state.isGenerating) return;

    // The previous result is about to be discarded (START_GENERATION nulls
    // audioUrl); release its blob so repeated generations do not accumulate
    // audio buffers in memory.
    if (state.audioUrl) {
      URL.revokeObjectURL(state.audioUrl);
    }

    dispatch({ type: "START_GENERATION" });
    addLog("Connecting to VibeVoice server...");

    try {
      addLog(`Sending script (${wordCount} words) for synthesis...`);
      addLog(
        `Settings: CFG=${state.cfgScale.toFixed(1)}, Steps=${state.inferenceSteps}`
      );

      const res = await fetch("/api/generate", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
          text: state.script,
          cfg_scale: state.cfgScale,
          inference_steps: state.inferenceSteps,
        }),
      });

      if (!res.ok) {
        // Error responses are JSON `{ error }`; fall back to the status text
        // when the body cannot be parsed.
        const err = await res.json().catch(() => ({ error: res.statusText }));
        throw new Error(err.error ?? `HTTP ${res.status}`);
      }

      addLog("Generating audio...");
      const blob = await res.blob();
      const url = URL.createObjectURL(blob);

      const sizeMB = (blob.size / 1024 / 1024).toFixed(2);
      addLog(`Audio received — ${sizeMB} MB`);
      addLog("Done — audio ready for playback.");

      dispatch({ type: "GENERATION_SUCCESS", payload: url });
    } catch (err) {
      const message =
        err instanceof Error ? err.message : "Unknown error occurred";
      addLog(`Error: ${message}`);
      dispatch({ type: "GENERATION_ERROR", payload: message });
    }
  }, [
    state.script,
    state.cfgScale,
    state.inferenceSteps,
    state.isGenerating,
    state.audioUrl,
    wordCount,
    addLog,
  ]);

  return (
    <div
      className="min-h-screen flex flex-col"
      style={{ background: "var(--background)" }}
    >
      <Header />
      <main className="flex-1 container mx-auto px-4 py-6 max-w-6xl">
        <div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
          {/* Left column: script input */}
          <div className="lg:col-span-2 flex flex-col gap-6">
            <TextInputPanel
              value={state.script}
              onChange={(text) =>
                dispatch({ type: "SET_SCRIPT", payload: text })
              }
            />
            {state.audioUrl && <AudioPlayer audioUrl={state.audioUrl} />}
          </div>
          {/* Right column: controls + log */}
          <div className="flex flex-col gap-6">
            <GenerationControls
              cfgScale={state.cfgScale}
              onCfgScaleChange={(v) =>
                dispatch({ type: "SET_CFG_SCALE", payload: v })
              }
              inferenceSteps={state.inferenceSteps}
              onInferenceStepsChange={(v) =>
                dispatch({ type: "SET_INFERENCE_STEPS", payload: v })
              }
              onGenerate={handleGenerate}
              isGenerating={state.isGenerating}
              wordCount={wordCount}
            />
            <StatusLog messages={state.logs} />
          </div>
        </div>
      </main>
    </div>
  );
}
+195
View File
@@ -0,0 +1,195 @@
"use client";
import { useAudioPlayer } from "@/hooks/useAudioPlayer";
/** Props accepted by the AudioPlayer component. */
interface AudioPlayerProps {
/** URL of the audio to play; when it is null the player renders nothing. */
audioUrl: string | null;
}
/**
 * Formats a duration in seconds as `m:ss` (minutes are not capped at 59).
 *
 * Non-finite values (NaN, ±Infinity) and negative values render as "0:00",
 * so an unloaded or unknown duration never shows up as "NaN:NaN" or "-1:-1".
 *
 * @param seconds Duration in seconds; fractions are truncated.
 * @returns The formatted time, e.g. `125` -> `"2:05"`.
 */
function formatTime(seconds: number): string {
  // Number.isFinite already rejects NaN, so no separate isNaN check is needed.
  if (!Number.isFinite(seconds) || seconds <= 0) return "0:00";
  const wholeSeconds = Math.floor(seconds);
  const m = Math.floor(wholeSeconds / 60);
  const s = wholeSeconds % 60;
  return `${m}:${s.toString().padStart(2, "0")}`;
}
/**
 * Audio player card: download button, seek bar with elapsed/total time,
 * play/pause button and volume slider. Playback state comes from
 * `useAudioPlayer`. Renders nothing while `audioUrl` is null.
 *
 * The seek bar is exposed as an ARIA slider: it is focusable, reports its
 * position, and ArrowLeft / ArrowRight move playback by five seconds.
 */
export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
  const {
    isPlaying,
    currentTime,
    duration,
    volume,
    toggle,
    seek,
    setVolume,
  } = useAudioPlayer(audioUrl);

  if (!audioUrl) return null;

  // The reported duration may not be a usable number; treat anything
  // non-finite as "unknown" so it never reaches seek() or the ARIA values.
  const safeDuration = Number.isFinite(duration) && duration > 0 ? duration : 0;
  const progress = safeDuration > 0 ? (currentTime / safeDuration) * 100 : 0;
  const seekStepSeconds = 5;

  const handleDownload = () => {
    const a = document.createElement("a");
    a.href = audioUrl;
    a.download = "vibepod-output.wav";
    a.click();
  };

  // Seek to an absolute position, clamped to [0, safeDuration].
  const seekTo = (target: number) => {
    if (safeDuration === 0 || !Number.isFinite(target)) return;
    seek(Math.min(Math.max(target, 0), safeDuration));
  };

  return (
    <div
      className="rounded-xl border p-5 flex flex-col gap-4"
      style={{ background: "var(--card-bg)", borderColor: "var(--border)" }}
    >
      <div className="flex items-center justify-between">
        <h2
          className="text-sm font-semibold uppercase tracking-wider"
          style={{ color: "var(--accent-teal)" }}
        >
          Audio Player
        </h2>
        <button
          type="button"
          onClick={handleDownload}
          className="flex items-center gap-2 text-xs px-3 py-1.5 rounded-lg border transition-colors cursor-pointer"
          style={{
            borderColor: "var(--accent-teal-dim)",
            color: "var(--accent-teal)",
            background: "rgba(45, 212, 191, 0.05)",
          }}
          onMouseEnter={(e) => {
            e.currentTarget.style.background = "rgba(45, 212, 191, 0.15)";
          }}
          onMouseLeave={(e) => {
            e.currentTarget.style.background = "rgba(45, 212, 191, 0.05)";
          }}
        >
          <svg
            className="w-3.5 h-3.5"
            viewBox="0 0 24 24"
            fill="none"
            stroke="currentColor"
            strokeWidth="2"
            aria-hidden="true"
          >
            <path d="M21 15v4a2 2 0 01-2 2H5a2 2 0 01-2-2v-4" />
            <polyline points="7 10 12 15 17 10" />
            <line x1="12" y1="15" x2="12" y2="3" />
          </svg>
          Download WAV
        </button>
      </div>
      {/* Waveform / progress bar */}
      <div className="flex flex-col gap-2">
        <div
          role="slider"
          tabIndex={0}
          aria-label="Seek"
          aria-valuemin={0}
          aria-valuemax={Math.round(safeDuration)}
          aria-valuenow={Math.round(Math.min(currentTime, safeDuration))}
          aria-valuetext={`${formatTime(currentTime)} of ${formatTime(duration)}`}
          className="relative h-2 rounded-full cursor-pointer overflow-hidden"
          style={{ background: "var(--border)" }}
          onClick={(e) => {
            const rect = e.currentTarget.getBoundingClientRect();
            // A zero-width bar would produce NaN; nothing sensible to seek to.
            if (rect.width <= 0) return;
            const ratio = (e.clientX - rect.left) / rect.width;
            seekTo(ratio * safeDuration);
          }}
          onKeyDown={(e) => {
            if (e.key === "ArrowRight") {
              e.preventDefault();
              seekTo(currentTime + seekStepSeconds);
            } else if (e.key === "ArrowLeft") {
              e.preventDefault();
              seekTo(currentTime - seekStepSeconds);
            }
          }}
        >
          <div
            className="absolute inset-y-0 left-0 rounded-full transition-all"
            style={{
              width: `${progress}%`,
              background:
                "linear-gradient(90deg, var(--accent-teal-dim), var(--accent-violet-dim))",
            }}
          />
        </div>
        <div
          className="flex items-center justify-between text-xs font-mono"
          style={{ color: "var(--muted)" }}
        >
          <span>{formatTime(currentTime)}</span>
          <span>{formatTime(duration)}</span>
        </div>
      </div>
      {/* Controls row */}
      <div className="flex items-center gap-4">
        {/* Play/Pause */}
        <button
          type="button"
          onClick={toggle}
          className="w-10 h-10 rounded-full flex items-center justify-center transition-transform active:scale-95 cursor-pointer"
          style={{
            background:
              "linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
            boxShadow: "0 4px 12px rgba(45, 212, 191, 0.3)",
          }}
          aria-label={isPlaying ? "Pause" : "Play"}
        >
          {isPlaying ? (
            <svg
              className="w-4 h-4 text-white"
              viewBox="0 0 24 24"
              fill="currentColor"
              aria-hidden="true"
            >
              <rect x="6" y="4" width="4" height="16" />
              <rect x="14" y="4" width="4" height="16" />
            </svg>
          ) : (
            <svg
              className="w-4 h-4 text-white"
              viewBox="0 0 24 24"
              fill="currentColor"
              aria-hidden="true"
            >
              <polygon points="5 3 19 12 5 21 5 3" />
            </svg>
          )}
        </button>
        {/* Duration info */}
        <div className="flex-1 flex items-center gap-1 text-sm">
          <span style={{ color: "var(--foreground)" }}>
            {formatTime(currentTime)}
          </span>
          <span style={{ color: "var(--muted)" }}>/</span>
          <span style={{ color: "var(--muted)" }}>{formatTime(duration)}</span>
        </div>
        {/* Volume control */}
        <div className="flex items-center gap-2">
          <svg
            className="w-4 h-4 flex-shrink-0"
            style={{ color: "var(--muted)" }}
            viewBox="0 0 24 24"
            fill="none"
            stroke="currentColor"
            strokeWidth="2"
            aria-hidden="true"
          >
            {volume === 0 ? (
              <>
                <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
                <line x1="23" y1="9" x2="17" y2="15" />
                <line x1="17" y1="9" x2="23" y2="15" />
              </>
            ) : volume < 0.5 ? (
              <>
                <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
                <path d="M15.54 8.46a5 5 0 010 7.07" />
              </>
            ) : (
              <>
                <polygon points="11 5 6 9 2 9 2 15 6 15 11 19 11 5" />
                <path d="M19.07 4.93a10 10 0 010 14.14M15.54 8.46a5 5 0 010 7.07" />
              </>
            )}
          </svg>
          <input
            type="range"
            min={0}
            max={1}
            step={0.05}
            value={volume}
            onChange={(e) => setVolume(parseFloat(e.target.value))}
            className="w-20"
            aria-label="Volume"
          />
        </div>
      </div>
    </div>
  );
}
@@ -0,0 +1,193 @@
"use client";
/** Props for {@link GenerationControls}. */
interface GenerationControlsProps {
  /** Current CFG scale; the slider spans 1.0 to 3.0. */
  cfgScale: number;
  onCfgScaleChange: (v: number) => void;
  /** Current number of inference steps; the slider spans 10 to 30. */
  inferenceSteps: number;
  onInferenceStepsChange: (v: number) => void;
  /** Invoked when the generate button is pressed. */
  onGenerate: () => void;
  /** While true the button is disabled and shows a spinner. */
  isGenerating: boolean;
  /** Word count of the script; 0 disables the button and hides the estimate. */
  wordCount: number;
}

/**
 * Renders the rough generation-time estimate for a script: "—" for an empty
 * script, "~Ns" below one minute, otherwise "~Mm Ss". The estimate assumes
 * 50 words per second of generation time.
 */
function formatEstimatedTime(wordCount: number): string {
  if (wordCount === 0) return "—";
  const estimatedSeconds = Math.ceil(wordCount / 50);
  if (estimatedSeconds < 60) return `~${estimatedSeconds}s`;
  return `~${Math.floor(estimatedSeconds / 60)}m ${estimatedSeconds % 60}s`;
}

/**
 * Settings card: CFG-scale and inference-step sliders, an estimated
 * generation time and the generate button.
 *
 * Both sliders carry an `aria-label`, because the visible `<label>` elements
 * are not programmatically associated with the inputs.
 */
export default function GenerationControls({
  cfgScale,
  onCfgScaleChange,
  inferenceSteps,
  onInferenceStepsChange,
  onGenerate,
  isGenerating,
  wordCount,
}: GenerationControlsProps) {
  const estimatedDisplay = formatEstimatedTime(wordCount);
  const generateDisabled = isGenerating || wordCount === 0;

  return (
    <div
      className="rounded-xl border p-5 flex flex-col gap-5"
      style={{ background: "var(--card-bg)", borderColor: "var(--border)" }}
    >
      <h2
        className="text-sm font-semibold uppercase tracking-wider"
        style={{ color: "var(--accent-teal)" }}
      >
        Generation Settings
      </h2>
      {/* CFG Scale slider */}
      <div className="flex flex-col gap-2">
        <div className="flex items-center justify-between">
          <label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
            Voice Expressiveness
          </label>
          <span
            className="text-sm font-mono px-2 py-0.5 rounded"
            style={{
              background: "var(--background)",
              color: "var(--accent-teal)",
            }}
          >
            {cfgScale.toFixed(1)}
          </span>
        </div>
        <input
          type="range"
          min={1.0}
          max={3.0}
          step={0.1}
          value={cfgScale}
          onChange={(e) => onCfgScaleChange(parseFloat(e.target.value))}
          className="w-full"
          aria-label="Voice Expressiveness"
        />
        <div
          className="flex items-center justify-between text-xs"
          style={{ color: "var(--muted)" }}
        >
          <span>Flat (1.0)</span>
          <span>CFG Scale</span>
          <span>Expressive (3.0)</span>
        </div>
      </div>
      {/* Inference Steps slider */}
      <div className="flex flex-col gap-2">
        <div className="flex items-center justify-between">
          <label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
            Quality vs Speed
          </label>
          <span
            className="text-sm font-mono px-2 py-0.5 rounded"
            style={{
              background: "var(--background)",
              color: "var(--accent-violet)",
            }}
          >
            {inferenceSteps}
          </span>
        </div>
        <input
          type="range"
          min={10}
          max={30}
          step={1}
          value={inferenceSteps}
          onChange={(e) => onInferenceStepsChange(parseInt(e.target.value, 10))}
          className="w-full"
          aria-label="Quality vs Speed"
          style={
            {
              "--thumb-color": "var(--accent-violet)",
            } as React.CSSProperties
          }
        />
        <div
          className="flex items-center justify-between text-xs"
          style={{ color: "var(--muted)" }}
        >
          <span>Faster (10)</span>
          <span>Inference Steps</span>
          <span>Higher quality (30)</span>
        </div>
      </div>
      {/* Estimated time */}
      <div
        className="flex items-center justify-between px-3 py-2 rounded-lg text-sm"
        style={{
          background: "var(--background)",
          border: "1px solid var(--border)",
        }}
      >
        <span style={{ color: "var(--muted)" }}>Estimated generation time</span>
        <span
          className="font-mono font-medium"
          style={{ color: "var(--accent-teal)" }}
        >
          {estimatedDisplay}
        </span>
      </div>
      {/* Generate button */}
      <button
        type="button"
        onClick={onGenerate}
        disabled={generateDisabled}
        className="w-full py-3 rounded-xl font-semibold text-sm transition-all cursor-pointer disabled:cursor-not-allowed flex items-center justify-center gap-2"
        style={
          generateDisabled
            ? {
                background: "var(--border)",
                color: "var(--muted)",
              }
            : {
                background:
                  "linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
                color: "#fff",
                boxShadow: "0 4px 15px rgba(45, 212, 191, 0.2)",
              }
        }
      >
        {isGenerating ? (
          <>
            <svg
              className="animate-spin w-4 h-4"
              viewBox="0 0 24 24"
              fill="none"
              aria-hidden="true"
            >
              <circle
                className="opacity-25"
                cx="12"
                cy="12"
                r="10"
                stroke="currentColor"
                strokeWidth="4"
              />
              <path
                className="opacity-75"
                fill="currentColor"
                d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
              />
            </svg>
            Generating audio...
          </>
        ) : (
          <>
            <svg
              className="w-4 h-4"
              viewBox="0 0 24 24"
              fill="none"
              stroke="currentColor"
              strokeWidth="2"
              aria-hidden="true"
            >
              <polygon points="5 3 19 12 5 21 5 3" />
            </svg>
            Generate Podcast Audio
          </>
        )}
      </button>
    </div>
  );
}
+106
View File
@@ -0,0 +1,106 @@
"use client";
import { useEffect, useState } from "react";
type ServerStatus = "checking" | "online" | "offline";

/**
 * App header: logo, title and a pill showing whether the VibeVoice server is
 * reachable. The status is fetched from `/api/health` on mount and then every
 * 30 seconds; any failure to fetch or parse the response counts as offline.
 */
export default function Header() {
  const [status, setStatus] = useState<ServerStatus>("checking");

  useEffect(() => {
    // Resolves to the status to display; never rejects.
    const probeServer = async (): Promise<ServerStatus> => {
      try {
        const response = await fetch("/api/health");
        const payload = await response.json();
        return payload.status === "online" ? "online" : "offline";
      } catch {
        return "offline";
      }
    };
    const refresh = () => {
      void probeServer().then(setStatus);
    };

    refresh();
    const timer = setInterval(refresh, 30000);
    return () => clearInterval(timer);
  }, []);

  // Presentation of each status: dot colour, label and whether the dot pings.
  const statusConfig = {
    checking: {
      color: "bg-yellow-500",
      label: "Checking...",
      textColor: "text-yellow-400",
      pulse: true,
    },
    online: {
      color: "bg-green-500",
      label: "Server Online",
      textColor: "text-green-400",
      pulse: false,
    },
    offline: {
      color: "bg-red-500",
      label: "Server Offline",
      textColor: "text-red-400",
      pulse: false,
    },
  };
  const cfg = statusConfig[status];

  // The ping halo is only shown while the first check is still pending.
  const haloClass = cfg.pulse
    ? "animate-ping absolute inline-flex h-full w-full rounded-full opacity-75 " + cfg.color
    : "hidden";
  const dotClass = `relative inline-flex rounded-full h-2 w-2 ${cfg.color}`;

  return (
    <header
      className="border-b px-6 py-4 flex items-center justify-between"
      style={{
        background: "var(--card-bg)",
        borderColor: "var(--border)",
      }}
    >
      <div className="flex items-center gap-4">
        <div className="flex items-center gap-3">
          <div
            className="w-9 h-9 rounded-xl flex items-center justify-center text-lg font-bold"
            style={{
              background:
                "linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
            }}
          >
            🎙
          </div>
          <div>
            <h1
              className="text-xl font-bold tracking-tight"
              style={{
                background:
                  "linear-gradient(135deg, var(--accent-teal), var(--accent-violet))",
                WebkitBackgroundClip: "text",
                WebkitTextFillColor: "transparent",
              }}
            >
              VibePod
            </h1>
            <p className="text-xs" style={{ color: "var(--muted)" }}>
              Powered by VibeVoice 0.5B
            </p>
          </div>
        </div>
      </div>
      <div
        className="flex items-center gap-2 px-3 py-1.5 rounded-full text-xs font-medium border"
        style={{
          background: "var(--background)",
          borderColor: "var(--border)",
        }}
      >
        <span className="relative flex h-2 w-2">
          <span className={`${haloClass}`} />
          <span className={dotClass} />
        </span>
        <span style={{ color: "var(--foreground)" }}>{cfg.label}</span>
      </div>
    </header>
  );
}
+76
View File
@@ -0,0 +1,76 @@
"use client";
import { useEffect, useRef } from "react";
/** Props for {@link StatusLog}. */
interface StatusLogProps {
  /** Log lines in display order. */
  messages: string[];
}

/**
 * Terminal-style log panel. Lines are numbered from 01 and coloured by
 * content: lines containing "error" or "failed" use the error colour, lines
 * containing "done", "complete" or "ready" use the success colour (matching
 * is case-insensitive, error wins), everything else uses the foreground
 * colour. The panel scrolls to its end whenever `messages` changes.
 */
export default function StatusLog({ messages }: StatusLogProps) {
  const bottomRef = useRef<HTMLDivElement>(null);

  useEffect(() => {
    bottomRef.current?.scrollIntoView({ behavior: "smooth" });
  }, [messages]);

  // Picks the text colour for one log line.
  const colorFor = (line: string): string => {
    const lowered = line.toLowerCase();
    if (lowered.includes("error") || lowered.includes("failed")) {
      return "var(--error)";
    }
    if (
      lowered.includes("done") ||
      lowered.includes("complete") ||
      lowered.includes("ready")
    ) {
      return "var(--success)";
    }
    return "var(--foreground)";
  };

  const hasMessages = messages.length !== 0;

  return (
    <div
      className="rounded-xl border p-5 flex flex-col gap-3"
      style={{ background: "var(--card-bg)", borderColor: "var(--border)" }}
    >
      <div className="flex items-center gap-2">
        <h2
          className="text-sm font-semibold uppercase tracking-wider"
          style={{ color: "var(--accent-teal)" }}
        >
          Status Log
        </h2>
        <div className="flex gap-1 ml-auto">
          <span className="w-2.5 h-2.5 rounded-full bg-red-500 opacity-70" />
          <span className="w-2.5 h-2.5 rounded-full bg-yellow-500 opacity-70" />
          <span className="w-2.5 h-2.5 rounded-full bg-green-500 opacity-70" />
        </div>
      </div>
      <div
        className="rounded-lg p-4 h-40 overflow-y-auto font-mono text-xs leading-relaxed"
        style={{
          background: "var(--background)",
          border: "1px solid var(--border)",
        }}
      >
        {hasMessages ? (
          messages.map((line, index) => (
            <div key={index} className="flex items-start gap-2">
              <span style={{ color: "var(--muted)" }} className="select-none">
                {String(index + 1).padStart(2, "0")}
              </span>
              <span style={{ color: colorFor(line) }}>{line}</span>
            </div>
          ))
        ) : (
          <p style={{ color: "var(--muted)" }}>
            Waiting for input...
            <span className="animate-pulse"></span>
          </p>
        )}
        <div ref={bottomRef} />
      </div>
    </div>
  );
}
+112
View File
@@ -0,0 +1,112 @@
"use client";
// Demo script placed into the editor by the "Load sample script" button.
const SAMPLE_SCRIPT = `Welcome to VibePod, your gateway to the future of audio content creation. Today, we're diving deep into the world of artificial intelligence and how it's transforming the way we produce and consume podcasts.
Imagine being able to transform any written article, blog post, or essay into a professional-sounding audio experience in just seconds. That's exactly what VibeVoice 0.5B brings to the table — a compact yet powerful text-to-speech model that delivers remarkably natural-sounding voices.
The technology behind modern TTS systems has evolved dramatically over the past few years. We've moved from robotic, stilted speech synthesis to voices that carry real emotional nuance and natural prosody. VibeVoice represents Microsoft's latest contribution to this rapidly advancing field.
Whether you're a content creator looking to repurpose written material, an educator who wants to make content more accessible, or a developer building the next generation of audio applications, VibePod provides the tools you need.
In today's episode, we'll explore the key features that make VibeVoice unique, discuss practical use cases across different industries, and look ahead to what the next generation of voice AI might bring. Let's get started.`;
/** Props for {@link TextInputPanel}. */
interface TextInputPanelProps {
  /** Current script text (controlled). */
  value: string;
  /** Called with the new text on every edit, sample load or clear. */
  onChange: (text: string) => void;
}

/**
 * Script editor card: a controlled textarea with "Load sample script" and
 * "Clear" buttons and live word / character counts.
 *
 * Hover and focus highlights are applied through `event.currentTarget`, i.e.
 * the element the handler is attached to, so they keep targeting the button
 * itself even if it later gains child elements.
 */
export default function TextInputPanel({
  value,
  onChange,
}: TextInputPanelProps) {
  const charCount = value.length;
  const trimmed = value.trim();
  const wordCount = trimmed === "" ? 0 : trimmed.split(/\s+/).length;

  // Sets text and border colour of a header button to the given CSS values.
  const tint = (button: HTMLButtonElement, color: string, borderColor: string) => {
    button.style.color = color;
    button.style.borderColor = borderColor;
  };

  return (
    <div
      className="rounded-xl border p-5 flex flex-col gap-4"
      style={{ background: "var(--card-bg)", borderColor: "var(--border)" }}
    >
      <div className="flex items-center justify-between">
        <h2
          className="text-sm font-semibold uppercase tracking-wider"
          style={{ color: "var(--accent-teal)" }}
        >
          Podcast Script
        </h2>
        <div className="flex items-center gap-2">
          <button
            type="button"
            onClick={() => onChange(SAMPLE_SCRIPT)}
            className="text-xs px-3 py-1.5 rounded-lg border transition-colors cursor-pointer"
            style={{
              borderColor: "var(--border)",
              color: "var(--muted)",
            }}
            onMouseEnter={(e) =>
              tint(e.currentTarget, "var(--accent-violet)", "var(--accent-violet)")
            }
            onMouseLeave={(e) =>
              tint(e.currentTarget, "var(--muted)", "var(--border)")
            }
          >
            Load sample script
          </button>
          <button
            type="button"
            onClick={() => onChange("")}
            className="text-xs px-3 py-1.5 rounded-lg border transition-colors cursor-pointer"
            style={{
              borderColor: "var(--border)",
              color: "var(--muted)",
            }}
            onMouseEnter={(e) =>
              tint(e.currentTarget, "var(--error)", "var(--error)")
            }
            onMouseLeave={(e) =>
              tint(e.currentTarget, "var(--muted)", "var(--border)")
            }
          >
            Clear
          </button>
        </div>
      </div>
      <textarea
        value={value}
        onChange={(e) => onChange(e.target.value)}
        placeholder="Paste or type your podcast script here..."
        aria-label="Podcast script"
        rows={12}
        className="w-full rounded-lg p-4 text-sm resize-y outline-none transition-colors font-sans leading-relaxed"
        style={{
          background: "var(--background)",
          border: "1px solid var(--border)",
          color: "var(--foreground)",
          minHeight: "200px",
        }}
        onFocus={(e) => {
          e.currentTarget.style.borderColor = "var(--accent-teal)";
        }}
        onBlur={(e) => {
          e.currentTarget.style.borderColor = "var(--border)";
        }}
      />
      <div
        className="flex items-center justify-between text-xs"
        style={{ color: "var(--muted)" }}
      >
        <span>
          {wordCount} word{wordCount !== 1 ? "s" : ""}
        </span>
        <span>{charCount.toLocaleString()} characters</span>
      </div>
    </div>
  );
}
+94
View File
@@ -0,0 +1,94 @@
"use client";
import { useCallback, useEffect, useRef, useState } from "react";
/** Playback state mirrored from the underlying audio element. */
interface AudioPlayerState {
  isPlaying: boolean;
  currentTime: number;
  duration: number;
  volume: number;
}

/**
 * Drives an `HTMLAudioElement` for the given URL and mirrors its playback
 * state into React state.
 *
 * A new element is created whenever `audioUrl` changes; the previous one is
 * paused and its listeners removed. Passing `null` stops playback and resets
 * the state, including the volume, to its defaults.
 *
 * @param audioUrl URL to play, or `null` for "no audio".
 * @returns Current `isPlaying` / `currentTime` / `duration` / `volume` plus:
 *   - `toggle()`: play when paused, pause when playing
 *   - `seek(time)`: jump to `time` seconds, clamped to `[0, duration]`;
 *     ignored for non-finite input or while the duration is still unknown
 *   - `setVolume(v)`: set the volume, clamped to `[0, 1]`; ignored for NaN
 */
export function useAudioPlayer(audioUrl: string | null) {
  const audioRef = useRef<HTMLAudioElement | null>(null);
  // Last volume chosen by the user, so a replacement element can start at
  // the level the UI is showing instead of the element default.
  const volumeRef = useRef(1);
  const [state, setState] = useState<AudioPlayerState>({
    isPlaying: false,
    currentTime: 0,
    duration: 0,
    volume: 1,
  });

  // Create/replace the Audio element whenever the URL changes
  useEffect(() => {
    if (!audioUrl) {
      if (audioRef.current) {
        audioRef.current.pause();
        audioRef.current = null;
      }
      volumeRef.current = 1;
      setState({ isPlaying: false, currentTime: 0, duration: 0, volume: 1 });
      return;
    }

    const audio = new Audio(audioUrl);
    audio.volume = volumeRef.current;
    audioRef.current = audio;

    // The previous element's listeners are removed during cleanup, so its
    // final "pause" never reaches state. Reset the playback fields for the
    // new source explicitly; bail out (same object) when already clean to
    // avoid a redundant render on first mount.
    setState((prev) =>
      prev.isPlaying || prev.currentTime !== 0 || prev.duration !== 0
        ? { ...prev, isPlaying: false, currentTime: 0, duration: 0 }
        : prev
    );

    const onTimeUpdate = () =>
      setState((prev) => ({ ...prev, currentTime: audio.currentTime }));
    const onDurationChange = () =>
      setState((prev) => ({ ...prev, duration: audio.duration }));
    const onEnded = () =>
      setState((prev) => ({ ...prev, isPlaying: false, currentTime: 0 }));
    const onPlay = () => setState((prev) => ({ ...prev, isPlaying: true }));
    const onPause = () => setState((prev) => ({ ...prev, isPlaying: false }));

    audio.addEventListener("timeupdate", onTimeUpdate);
    audio.addEventListener("durationchange", onDurationChange);
    audio.addEventListener("loadedmetadata", onDurationChange);
    audio.addEventListener("ended", onEnded);
    audio.addEventListener("play", onPlay);
    audio.addEventListener("pause", onPause);

    return () => {
      audio.pause();
      audio.removeEventListener("timeupdate", onTimeUpdate);
      audio.removeEventListener("durationchange", onDurationChange);
      audio.removeEventListener("loadedmetadata", onDurationChange);
      audio.removeEventListener("ended", onEnded);
      audio.removeEventListener("play", onPlay);
      audio.removeEventListener("pause", onPause);
      // Do not leave the controls pointing at a torn-down element.
      if (audioRef.current === audio) {
        audioRef.current = null;
      }
    };
  }, [audioUrl]);

  const toggle = useCallback(() => {
    const audio = audioRef.current;
    if (!audio) return;
    if (audio.paused) {
      // play() returns a promise that rejects when playback cannot start;
      // handle it so it is not an unhandled rejection, and re-sync the flag
      // with what the element reports.
      void audio.play().catch(() => {
        setState((prev) => ({ ...prev, isPlaying: !audio.paused }));
      });
    } else {
      audio.pause();
    }
  }, []);

  const seek = useCallback((time: number) => {
    const audio = audioRef.current;
    if (!audio) return;
    // Assigning a non-finite currentTime throws. That would happen for a
    // NaN/Infinity target, or while duration is still NaN (no metadata yet).
    if (!Number.isFinite(time) || Number.isNaN(audio.duration)) return;
    audio.currentTime = Math.max(0, Math.min(time, audio.duration));
  }, []);

  const setVolume = useCallback((v: number) => {
    const audio = audioRef.current;
    if (!audio) return;
    if (Number.isNaN(v)) return;
    // Store the same clamped value that is applied to the element so the UI
    // can never display a volume the element does not have.
    const clamped = Math.max(0, Math.min(1, v));
    audio.volume = clamped;
    volumeRef.current = clamped;
    setState((prev) => ({ ...prev, volume: clamped }));
  }, []);

  return {
    isPlaying: state.isPlaying,
    currentTime: state.currentTime,
    duration: state.duration,
    volume: state.volume,
    toggle,
    seek,
    setVolume,
  };
}
+7
View File
@@ -0,0 +1,7 @@
import type { NextConfig } from "next";
// Next.js configuration object. No options are set here, so the exported
// config is empty.
const nextConfig: NextConfig = {
/* config options here */
};
export default nextConfig;
+1651
View File
File diff suppressed because it is too large Load Diff
+23
View File
@@ -0,0 +1,23 @@
{
"name": "podcast-forge",
"version": "0.1.0",
"private": true,
"scripts": {
"dev": "next dev --turbopack",
"build": "next build --turbopack",
"start": "next start"
},
"dependencies": {
"react": "19.1.0",
"react-dom": "19.1.0",
"next": "15.5.15"
},
"devDependencies": {
"typescript": "^5",
"@types/node": "^20",
"@types/react": "^19",
"@types/react-dom": "^19",
"@tailwindcss/postcss": "^4",
"tailwindcss": "^4"
}
}
+5
View File
@@ -0,0 +1,5 @@
// PostCSS configuration: the Tailwind CSS PostCSS plugin is the only plugin.
const config = {
plugins: ["@tailwindcss/postcss"],
};
export default config;
+1
View File
@@ -0,0 +1 @@
<svg fill="none" viewBox="0 0 16 16" xmlns="http://www.w3.org/2000/svg"><path d="M14.5 13.5V5.41a1 1 0 0 0-.3-.7L9.8.29A1 1 0 0 0 9.08 0H1.5v13.5A2.5 2.5 0 0 0 4 16h8a2.5 2.5 0 0 0 2.5-2.5m-1.5 0v-7H8v-5H3v12a1 1 0 0 0 1 1h8a1 1 0 0 0 1-1M9.5 5V2.12L12.38 5zM5.13 5h-.62v1.25h2.12V5zm-.62 3h7.12v1.25H4.5zm.62 3h-.62v1.25h7.12V11z" clip-rule="evenodd" fill="#666" fill-rule="evenodd"/></svg>

After

Width:  |  Height:  |  Size: 391 B

+1
View File
@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><g clip-path="url(#a)"><path fill-rule="evenodd" clip-rule="evenodd" d="M10.27 14.1a6.5 6.5 0 0 0 3.67-3.45q-1.24.21-2.7.34-.31 1.83-.97 3.1M8 16A8 8 0 1 0 8 0a8 8 0 0 0 0 16m.48-1.52a7 7 0 0 1-.96 0H7.5a4 4 0 0 1-.84-1.32q-.38-.89-.63-2.08a40 40 0 0 0 3.92 0q-.25 1.2-.63 2.08a4 4 0 0 1-.84 1.31zm2.94-4.76q1.66-.15 2.95-.43a7 7 0 0 0 0-2.58q-1.3-.27-2.95-.43a18 18 0 0 1 0 3.44m-1.27-3.54a17 17 0 0 1 0 3.64 39 39 0 0 1-4.3 0 17 17 0 0 1 0-3.64 39 39 0 0 1 4.3 0m1.1-1.17q1.45.13 2.69.34a6.5 6.5 0 0 0-3.67-3.44q.65 1.26.98 3.1M8.48 1.5l.01.02q.41.37.84 1.31.38.89.63 2.08a40 40 0 0 0-3.92 0q.25-1.2.63-2.08a4 4 0 0 1 .85-1.32 7 7 0 0 1 .96 0m-2.75.4a6.5 6.5 0 0 0-3.67 3.44 29 29 0 0 1 2.7-.34q.31-1.83.97-3.1M4.58 6.28q-1.66.16-2.95.43a7 7 0 0 0 0 2.58q1.3.27 2.95.43a18 18 0 0 1 0-3.44m.17 4.71q-1.45-.12-2.69-.34a6.5 6.5 0 0 0 3.67 3.44q-.65-1.27-.98-3.1" fill="#666"/></g><defs><clipPath id="a"><path fill="#fff" d="M0 0h16v16H0z"/></clipPath></defs></svg>

After

Width:  |  Height:  |  Size: 1.0 KiB

+1
View File
@@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 394 80"><path fill="#000" d="M262 0h68.5v12.7h-27.2v66.6h-13.6V12.7H262V0ZM149 0v12.7H94v20.4h44.3v12.6H94v21h55v12.6H80.5V0h68.7zm34.3 0h-17.8l63.8 79.4h17.9l-32-39.7 32-39.6h-17.9l-23 28.6-23-28.6zm18.3 56.7-9-11-27.1 33.7h17.8l18.3-22.7z"/><path fill="#000" d="M81 79.3 17 0H0v79.3h13.6V17l50.2 62.3H81Zm252.6-.4c-1 0-1.8-.4-2.5-1s-1.1-1.6-1.1-2.6.3-1.8 1-2.5 1.6-1 2.6-1 1.8.3 2.5 1a3.4 3.4 0 0 1 .6 4.3 3.7 3.7 0 0 1-3 1.8zm23.2-33.5h6v23.3c0 2.1-.4 4-1.3 5.5a9.1 9.1 0 0 1-3.8 3.5c-1.6.8-3.5 1.3-5.7 1.3-2 0-3.7-.4-5.3-1s-2.8-1.8-3.7-3.2c-.9-1.3-1.4-3-1.4-5h6c.1.8.3 1.6.7 2.2s1 1.2 1.6 1.5c.7.4 1.5.5 2.4.5 1 0 1.8-.2 2.4-.6a4 4 0 0 0 1.6-1.8c.3-.8.5-1.8.5-3V45.5zm30.9 9.1a4.4 4.4 0 0 0-2-3.3 7.5 7.5 0 0 0-4.3-1.1c-1.3 0-2.4.2-3.3.5-.9.4-1.6 1-2 1.6a3.5 3.5 0 0 0-.3 4c.3.5.7.9 1.3 1.2l1.8 1 2 .5 3.2.8c1.3.3 2.5.7 3.7 1.2a13 13 0 0 1 3.2 1.8 8.1 8.1 0 0 1 3 6.5c0 2-.5 3.7-1.5 5.1a10 10 0 0 1-4.4 3.5c-1.8.8-4.1 1.2-6.8 1.2-2.6 0-4.9-.4-6.8-1.2-2-.8-3.4-2-4.5-3.5a10 10 0 0 1-1.7-5.6h6a5 5 0 0 0 3.5 4.6c1 .4 2.2.6 3.4.6 1.3 0 2.5-.2 3.5-.6 1-.4 1.8-1 2.4-1.7a4 4 0 0 0 .8-2.4c0-.9-.2-1.6-.7-2.2a11 11 0 0 0-2.1-1.4l-3.2-1-3.8-1c-2.8-.7-5-1.7-6.6-3.2a7.2 7.2 0 0 1-2.4-5.7 8 8 0 0 1 1.7-5 10 10 0 0 1 4.3-3.5c2-.8 4-1.2 6.4-1.2 2.3 0 4.4.4 6.2 1.2 1.8.8 3.2 2 4.3 3.4 1 1.4 1.5 3 1.5 5h-5.8z"/></svg>

After

Width:  |  Height:  |  Size: 1.3 KiB

+1
View File
@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 1155 1000"><path d="m577.3 0 577.4 1000H0z" fill="#fff"/></svg>

After

Width:  |  Height:  |  Size: 128 B

+1
View File
@@ -0,0 +1 @@
<svg fill="none" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 16 16"><path fill-rule="evenodd" clip-rule="evenodd" d="M1.5 2.5h13v10a1 1 0 0 1-1 1h-11a1 1 0 0 1-1-1zM0 1h16v11.5a2.5 2.5 0 0 1-2.5 2.5h-11A2.5 2.5 0 0 1 0 12.5zm3.75 4.5a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5M7 4.75a.75.75 0 1 1-1.5 0 .75.75 0 0 1 1.5 0m1.75.75a.75.75 0 1 0 0-1.5.75.75 0 0 0 0 1.5" fill="#666"/></svg>

After

Width:  |  Height:  |  Size: 385 B

+11
View File
@@ -0,0 +1,11 @@
# VibePod TTS Server dependencies
# Install with: pip install -r requirements.txt
fastapi>=0.111.0
uvicorn[standard]>=0.29.0
transformers>=4.40.0
torch>=2.2.0
soundfile>=0.12.1
scipy>=1.13.0
numpy>=1.26.0
pydantic>=2.7.0
+150
View File
@@ -0,0 +1,150 @@
"""
VibePod — VibeVoice FastAPI TTS Server
Loads microsoft/VibeVoice-Realtime-0.5B via HuggingFace transformers and
exposes two endpoints: POST /generate, which accepts
{ text, cfg_scale, inference_steps } and returns a WAV audio blob, and
GET /health, a liveness probe.
Start with:
uvicorn vibevoice_server:app --host 0.0.0.0 --port 8000
"""
import io
import logging
import threading
from contextlib import asynccontextmanager
from typing import AsyncGenerator, Optional

import numpy as np
import soundfile as sf
import torch
from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field, field_validator
from transformers import AutoProcessor, AutoModel
# Process-wide logging: INFO and above, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# HuggingFace identifier handed to ``from_pretrained`` by ``_load_model``.
MODEL_ID = "microsoft/VibeVoice-Realtime-0.5B"

# ─── Global model state ────────────────────────────────────────────────────────
# Written by ``_load_model``. The two handles are ``None`` until it runs, and
# ``_device`` is "cpu" unless ``_load_model`` detects CUDA.
_processor: Optional[object] = None
_model: Optional[object] = None
_device: str = "cpu"
def _load_model() -> None:
    """Load the processor and model for ``MODEL_ID`` into the module globals.

    Idempotent: returns immediately when ``_model`` is already set. Runs on
    CUDA with float16 weights when ``torch.cuda.is_available()``, otherwise on
    CPU with float32 weights.

    The globals are assigned only after every step has succeeded. A failure
    part-way through (for example while moving the weights to the device)
    therefore propagates to the caller without leaving a half-initialised
    model behind that would satisfy the "already loaded" check on a later call.

    Raises:
        Exception: Whatever ``from_pretrained``, ``.to()`` or ``.eval()``
            raises is propagated unchanged.
    """
    global _processor, _model, _device

    if _model is not None:
        return

    device = "cuda" if torch.cuda.is_available() else "cpu"
    logger.info("Loading %s on %s", MODEL_ID, device)

    processor = AutoProcessor.from_pretrained(MODEL_ID)
    model = AutoModel.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    )
    model = model.to(device)
    model.eval()

    # Publish only once the model is fully initialised.
    _device = device
    _processor = processor
    _model = model
    logger.info("Model loaded successfully.")
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Lifespan context for the FastAPI app.

    Calls ``_load_model()`` before yielding control; nothing runs after the
    ``yield``, so there is no teardown step.
    """
    _load_model()
    yield


app = FastAPI(
    title="VibePod TTS Server",
    version="0.1.0",
    lifespan=lifespan,
)
# ─── Request / response schemas ────────────────────────────────────────────────
class GenerateRequest(BaseModel):
    """JSON body accepted by ``POST /generate``."""

    # Text to synthesise; must be 1 to 10,000 characters and not blank.
    text: str = Field(..., min_length=1, max_length=10_000)
    # Forwarded to the model as ``guidance_scale``; allowed range 1.0 to 3.0.
    cfg_scale: float = Field(default=2.5, ge=1.0, le=3.0)
    # Forwarded to the model as ``num_inference_steps``; allowed range 10 to 30.
    inference_steps: int = Field(default=20, ge=10, le=30)

    @field_validator("text")
    @classmethod
    def text_not_blank(cls, v: str) -> str:
        """Reject whitespace-only text; otherwise return it stripped."""
        stripped = v.strip()
        if stripped:
            return stripped
        raise ValueError("text must not be blank")
# ─── Endpoints ─────────────────────────────────────────────────────────────────
@app.get("/health")
async def health() -> dict:
    """Liveness probe used by the Next.js /api/health route."""
    # Static payload: it names the configured model id and does not inspect
    # whether the model handles are populated.
    return dict(status="online", model=MODEL_ID)
# Inference runs on worker threads (see ``generate``); this lock makes the
# shared model handle one request at a time.
_inference_lock = threading.Lock()


def _synthesize_wav(req: GenerateRequest) -> io.BytesIO:
    """Blocking part of ``/generate``: run the model and encode the result as WAV.

    Args:
        req: Validated request. ``cfg_scale`` is forwarded to the model as
            ``guidance_scale`` and ``inference_steps`` as
            ``num_inference_steps``.

    Returns:
        An in-memory buffer, rewound to position 0, holding a 32-bit float WAV.

    Raises:
        RuntimeError: If the model returns no audio samples.
        Exception: Anything raised by the processor, the model or
            ``soundfile`` propagates unchanged.
    """
    with _inference_lock:
        inputs = _processor(text=req.text, return_tensors="pt").to(_device)
        with torch.no_grad():
            output = _model.generate(
                **inputs,
                guidance_scale=req.cfg_scale,
                num_inference_steps=req.inference_steps,
            )
        # NOTE(review): assumes ``generate`` returns a tensor shaped
        # (1, num_samples) or (num_samples,) — confirm against the model.
        audio_array = output.squeeze().cpu().numpy()

    # ``squeeze()`` collapses a single-sample result to a 0-d array, on which
    # the ``len()`` call below would raise; keep at least one dimension.
    audio_array = np.atleast_1d(audio_array)
    if audio_array.size == 0:
        # Fail with a readable message instead of numpy's zero-size reduction error.
        raise RuntimeError("model returned no audio samples")

    # Normalise to [-1, 1] float32 for WAV
    if audio_array.dtype != np.float32:
        audio_array = audio_array.astype(np.float32)
    peak = np.abs(audio_array).max()
    if peak > 0:
        audio_array = audio_array / peak

    # Determine sample rate — try common attribute names, else fall back to 24 kHz
    sample_rate: int = (
        getattr(_model.config, "sampling_rate", None)
        or getattr(_model.config, "sample_rate", None)
        or 24_000
    )

    buf = io.BytesIO()
    sf.write(buf, audio_array, sample_rate, format="WAV", subtype="FLOAT")
    buf.seek(0)

    logger.info(
        "Audio generated: %.2f s at %d Hz (%d bytes)",
        len(audio_array) / sample_rate,
        sample_rate,
        buf.getbuffer().nbytes,
    )
    return buf


@app.post("/generate")
async def generate(req: GenerateRequest) -> StreamingResponse:
    """
    Generate speech from text and return a WAV audio stream.
    """
    # Responds 503 while the model handles are unset, and 500 (carrying the
    # exception text) when synthesis fails.
    if _model is None or _processor is None:
        raise HTTPException(status_code=503, detail="Model not loaded yet — please retry in a moment.")

    logger.info(
        "Generating audio for %d chars (cfg=%.1f, steps=%d)",
        len(req.text),
        req.cfg_scale,
        req.inference_steps,
    )

    try:
        # Model inference is blocking; running it inline in this coroutine
        # would stall the event loop (and with it /health) until it finishes.
        buf = await run_in_threadpool(_synthesize_wav, req)
    except Exception as exc:
        logger.exception("Generation failed: %s", exc)
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    return StreamingResponse(
        buf,
        media_type="audio/wav",
        headers={"Content-Disposition": 'attachment; filename="vibepod-output.wav"'},
    )
+27
View File
@@ -0,0 +1,27 @@
{
"compilerOptions": {
"target": "ES2017",
"lib": ["dom", "dom.iterable", "esnext"],
"allowJs": true,
"skipLibCheck": true,
"strict": true,
"noEmit": true,
"esModuleInterop": true,
"module": "esnext",
"moduleResolution": "bundler",
"resolveJsonModule": true,
"isolatedModules": true,
"jsx": "preserve",
"incremental": true,
"plugins": [
{
"name": "next"
}
],
"paths": {
"@/*": ["./*"]
}
},
"include": ["next-env.d.ts", "**/*.ts", "**/*.tsx", ".next/types/**/*.ts"],
"exclude": ["node_modules"]
}