mirror of
https://github.com/JezzWTF/vibepod.git
synced 2026-06-01 15:22:14 +00:00
Merge pull request #12 from JezzWTF/codex/gen-optim
Improve streaming generation flow and CPU startup handling
This commit is contained in:
@@ -1,20 +0,0 @@
|
|||||||
{
|
|
||||||
"permissions": {
|
|
||||||
"allow": [
|
|
||||||
"Bash(mv podcast-forge/pnpm-lock.yaml /tmp/vibepod-pnpm-lock.yaml)",
|
|
||||||
"Bash(git mv *)",
|
|
||||||
"Bash(mv /tmp/vibepod-pnpm-lock.yaml web/pnpm-lock.yaml)",
|
|
||||||
"Bash(git rm *)",
|
|
||||||
"Bash(uv lock *)",
|
|
||||||
"Bash(pnpm install *)",
|
|
||||||
"Bash(git add *)",
|
|
||||||
"Bash(command -v uv)",
|
|
||||||
"Bash(uv --version)",
|
|
||||||
"Bash(uv sync *)",
|
|
||||||
"Bash(pnpm --filter vibepod-web exec tsc --noEmit)",
|
|
||||||
"Bash(xargs cat *)",
|
|
||||||
"Bash(.venv/Scripts/python.exe -c \"import torch; print\\('torch:', torch.__version__\\); print\\('CUDA available:', torch.cuda.is_available\\(\\)\\); print\\('CUDA version:', torch.version.cuda\\)\")",
|
|
||||||
"Bash(nvidia-smi)"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -0,0 +1,37 @@
|
|||||||
|
root = true
|
||||||
|
|
||||||
|
[*]
|
||||||
|
end_of_line = lf
|
||||||
|
charset = utf-8
|
||||||
|
trim_trailing_whitespace = true
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
[*.{ts,tsx,js,jsx,mjs,cjs,mts,cts}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.{json,jsonc}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.{css,html}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.{yaml,yml}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
[*.py]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
[*.{toml}]
|
||||||
|
indent_style = space
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
[*.md]
|
||||||
|
trim_trailing_whitespace = false
|
||||||
|
|
||||||
|
[Makefile]
|
||||||
|
indent_style = tab
|
||||||
@@ -8,3 +8,42 @@ HF_TOKEN=
|
|||||||
|
|
||||||
# Override the HuggingFace model cache directory (optional)
|
# Override the HuggingFace model cache directory (optional)
|
||||||
# HF_HOME=/path/to/hf-cache
|
# HF_HOME=/path/to/hf-cache
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Runtime tuning
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`.
|
||||||
|
# VIBEPOD_DEVICE=cuda
|
||||||
|
# VIBEPOD_DEVICE=cpu
|
||||||
|
|
||||||
|
# CPU mode: keep async decode enabled. This overlaps acoustic decoding with
|
||||||
|
# language-model work and measured ~20% faster on an 8-thread CPU run.
|
||||||
|
VIBEPOD_ASYNC_DECODE=1
|
||||||
|
|
||||||
|
# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system,
|
||||||
|
# 8 worker threads with 1 inter-op thread gave the best wall time, while 12
|
||||||
|
# over-subscribed and regressed.
|
||||||
|
# VIBEPOD_CPU_THREADS=8
|
||||||
|
# VIBEPOD_CPU_INTEROP_THREADS=1
|
||||||
|
|
||||||
|
# CPU mode: playback buffering. CPU generation is slower than realtime, so
|
||||||
|
# smooth streaming needs a larger initial buffer than CUDA. Lower these for
|
||||||
|
# faster startup if you are OK with occasional rebuffering.
|
||||||
|
# VIBEPOD_PREBUFFER_SECS=24
|
||||||
|
# VIBEPOD_REBUFFER_THRESHOLD_SECS=2
|
||||||
|
# VIBEPOD_RESUME_THRESHOLD_SECS=12
|
||||||
|
|
||||||
|
# CPU mode: dynamic INT8 quantization is enabled by default in start.sh.
|
||||||
|
# Set to 0 if you are comparing quality/performance or debugging.
|
||||||
|
# VIBEPOD_QUANTIZE=1
|
||||||
|
|
||||||
|
# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless
|
||||||
|
# optional FlashAttention is explicitly enabled and importable.
|
||||||
|
# VIBEPOD_CUDA_DTYPE=bf16
|
||||||
|
# VIBEPOD_ATTN_IMPL=sdpa
|
||||||
|
# VIBEPOD_ENABLE_FLASH_ATTN=0
|
||||||
|
|
||||||
|
# Debug/profiling. Keep disabled for benchmark timing; async CPU profiling
|
||||||
|
# double-counts overlapped decode work.
|
||||||
|
# VIBEPOD_PROFILE_GENERATION=0
|
||||||
|
|||||||
@@ -0,0 +1,24 @@
|
|||||||
|
* text=auto eol=lf
|
||||||
|
|
||||||
|
*.sh text eol=lf
|
||||||
|
*.py text eol=lf
|
||||||
|
*.ts text eol=lf
|
||||||
|
*.tsx text eol=lf
|
||||||
|
*.js text eol=lf
|
||||||
|
*.jsx text eol=lf
|
||||||
|
*.mjs text eol=lf
|
||||||
|
*.cjs text eol=lf
|
||||||
|
*.mts text eol=lf
|
||||||
|
*.cts text eol=lf
|
||||||
|
*.css text eol=lf
|
||||||
|
*.html text eol=lf
|
||||||
|
*.json text eol=lf
|
||||||
|
*.jsonc text eol=lf
|
||||||
|
*.yaml text eol=lf
|
||||||
|
*.yml text eol=lf
|
||||||
|
*.toml text eol=lf
|
||||||
|
*.md text eol=lf
|
||||||
|
*.mdx text eol=lf
|
||||||
|
*.lock text eol=lf
|
||||||
|
*.env text eol=lf
|
||||||
|
*.env.* text eol=lf
|
||||||
@@ -24,3 +24,4 @@ web/node_modules/
|
|||||||
.DS_Store
|
.DS_Store
|
||||||
Thumbs.db
|
Thumbs.db
|
||||||
.vscode/settings.json
|
.vscode/settings.json
|
||||||
|
.claude/settings.local.json
|
||||||
|
|||||||
@@ -0,0 +1,18 @@
|
|||||||
|
# Dependencies
|
||||||
|
node_modules/
|
||||||
|
web/node_modules/
|
||||||
|
|
||||||
|
# Build outputs
|
||||||
|
web/.next/
|
||||||
|
web/tsconfig.tsbuildinfo
|
||||||
|
web/next-env.d.ts
|
||||||
|
|
||||||
|
# Python / server
|
||||||
|
server/
|
||||||
|
|
||||||
|
# Lock files
|
||||||
|
pnpm-lock.yaml
|
||||||
|
web/pnpm-lock.yaml
|
||||||
|
|
||||||
|
# Generated
|
||||||
|
web/public/
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"semi": true,
|
||||||
|
"singleQuote": false,
|
||||||
|
"tabWidth": 2,
|
||||||
|
"trailingComma": "es5",
|
||||||
|
"printWidth": 100,
|
||||||
|
"endOfLine": "lf"
|
||||||
|
}
|
||||||
@@ -8,10 +8,10 @@ This file gives AI coding agents (Jules, Copilot, Claude Code, etc.) the context
|
|||||||
|
|
||||||
VibePod is a text-to-speech web app. It has two services that must both run for the app to work:
|
VibePod is a text-to-speech web app. It has two services that must both run for the app to work:
|
||||||
|
|
||||||
| Service | Language | Entry point | Port |
|
| Service | Language | Entry point | Port |
|
||||||
|---------|----------|-------------|------|
|
| ---------- | ---------------------------------- | ------------------------------- | ---- |
|
||||||
| **server** | Python 3.10+ (FastAPI + VibeVoice) | `server/start.sh` | 8000 |
|
| **server** | Python 3.10+ (FastAPI + VibeVoice) | `server/start.sh` | 8000 |
|
||||||
| **web** | TypeScript (Next.js 15, React 19) | `pnpm --filter vibepod-web dev` | 3000 |
|
| **web** | TypeScript (Next.js 15, React 19) | `pnpm --filter vibepod-web dev` | 3000 |
|
||||||
|
|
||||||
The Next.js frontend proxies all model requests through its own API routes to the FastAPI server — it never calls the Python server directly from the browser.
|
The Next.js frontend proxies all model requests through its own API routes to the FastAPI server — it never calls the Python server directly from the browser.
|
||||||
|
|
||||||
@@ -51,12 +51,12 @@ pnpm build
|
|||||||
|
|
||||||
The `--cpu` flag in `start.sh` sets `VIBEPOD_DEVICE=cpu` and uses a separate venv (`server/.venv-cpu`) so CUDA and CPU installs never conflict. `vibevoice_server.py` reads `VIBEPOD_DEVICE` at startup via `_resolve_device()` — do not remove or rename that function.
|
The `--cpu` flag in `start.sh` sets `VIBEPOD_DEVICE=cpu` and uses a separate venv (`server/.venv-cpu`) so CUDA and CPU installs never conflict. `vibevoice_server.py` reads `VIBEPOD_DEVICE` at startup via `_resolve_device()` — do not remove or rename that function.
|
||||||
|
|
||||||
| Env var | Values | Set by |
|
| Env var | Values | Set by |
|
||||||
|---------|--------|--------|
|
| ------------------------ | ----------------------- | --------------------------- |
|
||||||
| `VIBEPOD_DEVICE` | `cpu` \| `cuda` | `server/start.sh` |
|
| `VIBEPOD_DEVICE` | `cpu` \| `cuda` | `server/start.sh` |
|
||||||
| `UV_PROJECT_ENVIRONMENT` | `.venv-cpu` \| `.venv` | `server/start.sh` |
|
| `UV_PROJECT_ENVIRONMENT` | `.venv-cpu` \| `.venv` | `server/start.sh` |
|
||||||
| `HF_TOKEN` | HuggingFace token | Jules secret / `.env.local` |
|
| `HF_TOKEN` | HuggingFace token | Jules secret / `.env.local` |
|
||||||
| `VIBEVOICE_SERVER_URL` | `http://localhost:8000` | `.env.local` |
|
| `VIBEVOICE_SERVER_URL` | `http://localhost:8000` | `.env.local` |
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -94,7 +94,9 @@ dev.sh Concurrent launcher (forwards flags to start.sh)
|
|||||||
## API reference
|
## API reference
|
||||||
|
|
||||||
### `GET /health`
|
### `GET /health`
|
||||||
|
|
||||||
Returns server status. Safe to poll.
|
Returns server status. Safe to poll.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{
|
{
|
||||||
"status": "online",
|
"status": "online",
|
||||||
@@ -103,13 +105,17 @@ Returns server status. Safe to poll.
|
|||||||
"voices": ["carter", "davis", "emma", "frank", "grace", "mike"]
|
"voices": ["carter", "davis", "emma", "frank", "grace", "mike"]
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
`status` values: `downloading` | `loading` | `online` | `error`
|
`status` values: `downloading` | `loading` | `online` | `error`
|
||||||
|
|
||||||
### `POST /generate`
|
### `POST /generate`
|
||||||
|
|
||||||
Streams audio as SSE events.
|
Streams audio as SSE events.
|
||||||
|
|
||||||
```json
|
```json
|
||||||
{ "text": "Hello world", "speaker": "carter", "cfg_scale": 1.5, "inference_steps": 10 }
|
{ "text": "Hello world", "speaker": "carter", "cfg_scale": 1.5, "inference_steps": 10 }
|
||||||
```
|
```
|
||||||
|
|
||||||
Event types: `audio_chunk` (base64 float32 PCM) | `complete` | `error` | `cancelled`
|
Event types: `audio_chunk` (base64 float32 PCM) | `complete` | `error` | `cancelled`
|
||||||
|
|
||||||
---
|
---
|
||||||
@@ -117,12 +123,14 @@ Event types: `audio_chunk` (base64 float32 PCM) | `complete` | `error` | `cancel
|
|||||||
## Do / Don't
|
## Do / Don't
|
||||||
|
|
||||||
**Do:**
|
**Do:**
|
||||||
|
|
||||||
- Use `pnpm dev:cpu` in Jules — never plain `pnpm dev`
|
- Use `pnpm dev:cpu` in Jules — never plain `pnpm dev`
|
||||||
- Run `git checkout server/uv.lock` if uv rewrites it during setup
|
- Run `git checkout server/uv.lock` if uv rewrites it during setup
|
||||||
- Keep `_resolve_device()` in `vibevoice_server.py` — it's the CPU/CUDA switching logic
|
- Keep `_resolve_device()` in `vibevoice_server.py` — it's the CPU/CUDA switching logic
|
||||||
- Test server changes against `GET /health` and `POST /generate`
|
- Test server changes against `GET /health` and `POST /generate`
|
||||||
|
|
||||||
**Don't:**
|
**Don't:**
|
||||||
|
|
||||||
- Run `uv sync` without `UV_PROJECT_ENVIRONMENT=.venv-cpu` in the Jules sandbox
|
- Run `uv sync` without `UV_PROJECT_ENVIRONMENT=.venv-cpu` in the Jules sandbox
|
||||||
- Install Python packages with pip
|
- Install Python packages with pip
|
||||||
- Modify `server/uv.lock` manually
|
- Modify `server/uv.lock` manually
|
||||||
|
|||||||
@@ -173,16 +173,21 @@ The shape language is a hybrid of structural precision and tactile softness.
|
|||||||
## Components
|
## Components
|
||||||
|
|
||||||
### Card Containers
|
### Card Containers
|
||||||
|
|
||||||
The fundamental building block of the UI. Every distinct section (Script, Player, Controls, Logs) is housed in a card featuring the `card-bg`, a 1px `border`, and `rounded-xl` corners. The internal layout always features an uppercase teal header for immediate section identification.
|
The fundamental building block of the UI. Every distinct section (Script, Player, Controls, Logs) is housed in a card featuring the `card-bg`, a 1px `border`, and `rounded-xl` corners. The internal layout always features an uppercase teal header for immediate section identification.
|
||||||
|
|
||||||
### Primary Action Buttons
|
### Primary Action Buttons
|
||||||
|
|
||||||
Used for high-leverage actions like "Generate Audio" and "Play/Pause." These buttons utilize the `gradient-primary-dim` background, bold white text, and emit a soft teal glow to draw the eye and signify their importance.
|
Used for high-leverage actions like "Generate Audio" and "Play/Pause." These buttons utilize the `gradient-primary-dim` background, bold white text, and emit a soft teal glow to draw the eye and signify their importance.
|
||||||
|
|
||||||
### Range Sliders
|
### Range Sliders
|
||||||
|
|
||||||
Custom-styled input ranges replace default browser styles. The tracks are muted and slim, while the thumbs are bright teal, fully rounded, and emit a glow that intensifies on hover, providing a premium, tactile scrubbing experience.
|
Custom-styled input ranges replace default browser styles. The tracks are muted and slim, while the thumbs are bright teal, fully rounded, and emit a glow that intensifies on hover, providing a premium, tactile scrubbing experience.
|
||||||
|
|
||||||
### Status Indicators & Logs
|
### Status Indicators & Logs
|
||||||
|
|
||||||
A critical component of the application. Status badges utilize a minimalist pill shape with a pulsing ring animation to indicate active server processing. The log panel explicitly uses monospace typography and color-codes messages (green for success, red for error, white for neutral) to provide a terminal-like readout of the backend systems.
|
A critical component of the application. Status badges utilize a minimalist pill shape with a pulsing ring animation to indicate active server processing. The log panel explicitly uses monospace typography and color-codes messages (green for success, red for error, white for neutral) to provide a terminal-like readout of the backend systems.
|
||||||
|
|
||||||
### Gradients
|
### Gradients
|
||||||
|
|
||||||
Gradients are used purposefully to indicate progress, activity, or brand presence. The primary gradient (`135deg` from teal to violet) is used for branding (the logo icon and text) and primary buttons. Horizontal gradients (`90deg`) are used dynamically in progress bars to represent the flow of data over time (e.g., loading, downloading, and audio generation).
|
Gradients are used purposefully to indicate progress, activity, or brand presence. The primary gradient (`135deg` from teal to violet) is used for branding (the logo icon and text) and primary buttons. Horizontal gradients (`90deg`) are used dynamically in progress bars to represent the flow of data over time (e.g., loading, downloading, and audio generation).
|
||||||
|
|||||||
@@ -14,12 +14,12 @@ The Next.js app proxies audio generation requests to the FastAPI server, keeping
|
|||||||
|
|
||||||
## Prerequisites
|
## Prerequisites
|
||||||
|
|
||||||
| Tool | Install |
|
| Tool | Install |
|
||||||
|------|---------|
|
| ---------------------------------- | ----------------------------------- |
|
||||||
| [Node.js 20+](https://nodejs.org) | `winget install OpenJS.NodeJS.LTS` |
|
| [Node.js 20+](https://nodejs.org) | `winget install OpenJS.NodeJS.LTS` |
|
||||||
| [pnpm](https://pnpm.io) | `npm i -g pnpm` |
|
| [pnpm](https://pnpm.io) | `npm i -g pnpm` |
|
||||||
| [Python 3.10+](https://python.org) | `winget install Python.Python.3.13` |
|
| [Python 3.10+](https://python.org) | `winget install Python.Python.3.13` |
|
||||||
| [uv](https://docs.astral.sh/uv/) | `winget install astral-sh.uv` |
|
| [uv](https://docs.astral.sh/uv/) | `winget install astral-sh.uv` |
|
||||||
|
|
||||||
## Getting started
|
## Getting started
|
||||||
|
|
||||||
@@ -50,10 +50,10 @@ The frontend shows a loading indicator while the model downloads. Once the serve
|
|||||||
|
|
||||||
VibePod maintains two completely separate Python virtual environments so CUDA and CPU torch installs never conflict:
|
VibePod maintains two completely separate Python virtual environments so CUDA and CPU torch installs never conflict:
|
||||||
|
|
||||||
| Mode | Command | venv | torch source |
|
| Mode | Command | venv | torch source |
|
||||||
|------|---------|------|--------------|
|
| -------------- | -------------- | ------------------ | ----------------------- |
|
||||||
| CUDA (default) | `pnpm dev` | `server/.venv` | PyTorch CUDA 12.4 index |
|
| CUDA (default) | `pnpm dev` | `server/.venv` | PyTorch CUDA 12.4 index |
|
||||||
| CPU-only | `pnpm dev:cpu` | `server/.venv-cpu` | PyPI (CPU wheel) |
|
| CPU-only | `pnpm dev:cpu` | `server/.venv-cpu` | PyPI (CPU wheel) |
|
||||||
|
|
||||||
On first run, each mode creates its own venv automatically. You can switch between them freely — they are fully independent. The active device is reported by the `/health` endpoint as `"device": "cpu"` or `"device": "cuda"`.
|
On first run, each mode creates its own venv automatically. You can switch between them freely — they are fully independent. The active device is reported by the `/health` endpoint as `"device": "cpu"` or `"device": "cuda"`.
|
||||||
|
|
||||||
@@ -74,11 +74,11 @@ pnpm build # Production build of the frontend
|
|||||||
|
|
||||||
Copy `.env.example` to `.env.local` and set:
|
Copy `.env.example` to `.env.local` and set:
|
||||||
|
|
||||||
| Variable | Default | Description |
|
| Variable | Default | Description |
|
||||||
|----------|---------|-------------|
|
| ---------------------- | ----------------------- | --------------------------------------------------------- |
|
||||||
| `VIBEVOICE_SERVER_URL` | `http://localhost:8000` | URL the Next.js API routes use to reach the Python server |
|
| `VIBEVOICE_SERVER_URL` | `http://localhost:8000` | URL the Next.js API routes use to reach the Python server |
|
||||||
| `HF_TOKEN` | — | HuggingFace token (required if the model repo is gated) |
|
| `HF_TOKEN` | — | HuggingFace token (required if the model repo is gated) |
|
||||||
| `HF_HOME` | — | Override the HuggingFace model cache directory |
|
| `HF_HOME` | — | Override the HuggingFace model cache directory |
|
||||||
|
|
||||||
## Project structure
|
## Project structure
|
||||||
|
|
||||||
@@ -107,11 +107,11 @@ server/
|
|||||||
|
|
||||||
## Generation parameters
|
## Generation parameters
|
||||||
|
|
||||||
| Parameter | Range | Default | Effect |
|
| Parameter | Range | Default | Effect |
|
||||||
|-----------|-------|---------|--------|
|
| ----------------- | --------------------------------------------------- | -------- | ---------------------------------------------- |
|
||||||
| `speaker` | `carter`, `davis`, `emma`, `frank`, `grace`, `mike` | `carter` | Voice preset used for the generated audio |
|
| `speaker` | `carter`, `davis`, `emma`, `frank`, `grace`, `mike` | `carter` | Voice preset used for the generated audio |
|
||||||
| `cfg_scale` | 0.5 – 4.0 | 1.5 | Higher = more expressive guidance |
|
| `cfg_scale` | 0.5 – 4.0 | 1.5 | Higher = more expressive guidance |
|
||||||
| `inference_steps` | 5 – 20 | 10 | More steps = higher quality, slower generation |
|
| `inference_steps` | 5 – 20 | 10 | More steps = higher quality, slower generation |
|
||||||
|
|
||||||
## How it works
|
## How it works
|
||||||
|
|
||||||
|
|||||||
+8
-1
@@ -8,7 +8,14 @@
|
|||||||
"dev:cpu": "bash dev.sh --cpu",
|
"dev:cpu": "bash dev.sh --cpu",
|
||||||
"dev:server": "bash server/start.sh",
|
"dev:server": "bash server/start.sh",
|
||||||
"dev:server:cpu": "bash server/start.sh --cpu",
|
"dev:server:cpu": "bash server/start.sh --cpu",
|
||||||
"dev:web": "pnpm --filter vibepod-web dev"
|
"dev:web": "pnpm --filter vibepod-web dev",
|
||||||
|
"format": "prettier --write . && cd server && uv run ruff format .",
|
||||||
|
"format:check": "prettier --check . && cd server && uv run ruff format --check .",
|
||||||
|
"lint:server": "cd server && uv run ruff check .",
|
||||||
|
"lint:server:fix": "cd server && uv run ruff check --fix ."
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"prettier": "^3.5.3"
|
||||||
},
|
},
|
||||||
"packageManager": "pnpm@10.33.2+sha512.a90faf6feeab71ad6c6e57f94e0fe1a12f5dcc22cd754db40ae9593eb6a3e0b6b12e3540218bb37ae083404b1f2ce6db2a4121e979829b4aff94b99f49da1cf8"
|
"packageManager": "pnpm@10.33.2+sha512.a90faf6feeab71ad6c6e57f94e0fe1a12f5dcc22cd754db40ae9593eb6a3e0b6b12e3540218bb37ae083404b1f2ce6db2a4121e979829b4aff94b99f49da1cf8"
|
||||||
}
|
}
|
||||||
|
|||||||
Generated
+12
-1
@@ -6,7 +6,11 @@ settings:
|
|||||||
|
|
||||||
importers:
|
importers:
|
||||||
|
|
||||||
.: {}
|
.:
|
||||||
|
devDependencies:
|
||||||
|
prettier:
|
||||||
|
specifier: ^3.5.3
|
||||||
|
version: 3.8.3
|
||||||
|
|
||||||
web:
|
web:
|
||||||
dependencies:
|
dependencies:
|
||||||
@@ -516,6 +520,11 @@ packages:
|
|||||||
resolution: {integrity: sha512-W62t/Se6rA0Az3DfCL0AqJwXuKwBeYg6nOaIgzP+xZ7N5BFCI7DYi1qs6ygUYT6rvfi6t9k65UMLJC+PHZpDAA==}
|
resolution: {integrity: sha512-W62t/Se6rA0Az3DfCL0AqJwXuKwBeYg6nOaIgzP+xZ7N5BFCI7DYi1qs6ygUYT6rvfi6t9k65UMLJC+PHZpDAA==}
|
||||||
engines: {node: ^10 || ^12 || >=14}
|
engines: {node: ^10 || ^12 || >=14}
|
||||||
|
|
||||||
|
prettier@3.8.3:
|
||||||
|
resolution: {integrity: sha512-7igPTM53cGHMW8xWuVTydi2KO233VFiTNyF5hLJqpilHfmn8C8gPf+PS7dUT64YcXFbiMGZxS9pCSxL/Dxm/Jw==}
|
||||||
|
engines: {node: '>=14'}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
react-dom@19.1.0:
|
react-dom@19.1.0:
|
||||||
resolution: {integrity: sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==}
|
resolution: {integrity: sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==}
|
||||||
peerDependencies:
|
peerDependencies:
|
||||||
@@ -917,6 +926,8 @@ snapshots:
|
|||||||
picocolors: 1.1.1
|
picocolors: 1.1.1
|
||||||
source-map-js: 1.2.1
|
source-map-js: 1.2.1
|
||||||
|
|
||||||
|
prettier@3.8.3: {}
|
||||||
|
|
||||||
react-dom@19.1.0(react@19.1.0):
|
react-dom@19.1.0(react@19.1.0):
|
||||||
dependencies:
|
dependencies:
|
||||||
react: 19.1.0
|
react: 19.1.0
|
||||||
|
|||||||
+1
-1
@@ -1,2 +1,2 @@
|
|||||||
packages:
|
packages:
|
||||||
- 'web'
|
- "web"
|
||||||
|
|||||||
@@ -39,6 +39,13 @@ VibePod Studio will turn generated audio from a one-shot download into a reusabl
|
|||||||
- Add project save/load, autosave, and recoverable render jobs.
|
- Add project save/load, autosave, and recoverable render jobs.
|
||||||
- Prepare the audio pipeline for queueing longer renders outside the request lifecycle.
|
- Prepare the audio pipeline for queueing longer renders outside the request lifecycle.
|
||||||
|
|
||||||
|
## Later: VibeVoice Performance Research
|
||||||
|
|
||||||
|
- Move the current VibePod hot-path monkey patches into the `JezzWTF/VibeVoice` fork once the feature direction has settled.
|
||||||
|
- Add clearer generation profiling for overlapped CPU work, especially decode wait time versus total acoustic decode time.
|
||||||
|
- Prototype batched positive/negative CFG TTS LM inference behind an opt-in flag and benchmark it against the current sequential path on CPU and CUDA.
|
||||||
|
- Keep experimental performance work isolated from user-facing feature work unless it shows a clear speedup without audio quality regressions.
|
||||||
|
|
||||||
## Foundation Work Needed First
|
## Foundation Work Needed First
|
||||||
|
|
||||||
- Persist generated outputs with stable IDs.
|
- Persist generated outputs with stable IDs.
|
||||||
|
|||||||
@@ -0,0 +1 @@
|
|||||||
|
3.12
|
||||||
@@ -30,15 +30,12 @@ def download() -> str:
|
|||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
except ImportError:
|
except ImportError:
|
||||||
print(
|
print(
|
||||||
"ERROR: huggingface_hub is not installed.\n"
|
"ERROR: huggingface_hub is not installed.\nRun: pip install huggingface_hub",
|
||||||
"Run: pip install huggingface_hub",
|
|
||||||
file=sys.stderr,
|
file=sys.stderr,
|
||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
token: str | None = os.environ.get("HF_TOKEN") or os.environ.get(
|
token: str | None = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
|
||||||
"HUGGINGFACE_TOKEN"
|
|
||||||
)
|
|
||||||
|
|
||||||
print(f"Checking / downloading model: {MODEL_ID}")
|
print(f"Checking / downloading model: {MODEL_ID}")
|
||||||
print("(This may take several minutes on first run — the model is ~1 GB)")
|
print("(This may take several minutes on first run — the model is ~1 GB)")
|
||||||
|
|||||||
+28
-1
@@ -8,7 +8,8 @@ dependencies = [
|
|||||||
# To switch back to CPU-only, remove the [tool.uv.sources] torch entry below.
|
# To switch back to CPU-only, remove the [tool.uv.sources] torch entry below.
|
||||||
"torch>=2.0.0",
|
"torch>=2.0.0",
|
||||||
# VibeVoice custom model + processor classes (not yet in upstream transformers)
|
# VibeVoice custom model + processor classes (not yet in upstream transformers)
|
||||||
"vibevoice @ git+https://github.com/microsoft/VibeVoice.git",
|
# Uses JezzWTF/VibeVoice fork so VibePod-specific optimisations land here.
|
||||||
|
"vibevoice @ git+https://github.com/JezzWTF/VibeVoice.git",
|
||||||
# Exact version required by vibevoice's streaming TTS module
|
# Exact version required by vibevoice's streaming TTS module
|
||||||
"transformers==4.51.3",
|
"transformers==4.51.3",
|
||||||
"fastapi>=0.111.0",
|
"fastapi>=0.111.0",
|
||||||
@@ -18,10 +19,36 @@ dependencies = [
|
|||||||
"huggingface_hub>=0.23.0",
|
"huggingface_hub>=0.23.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[dependency-groups]
|
||||||
|
dev = [
|
||||||
|
"ruff>=0.11.0",
|
||||||
|
]
|
||||||
|
|
||||||
# No build-system — this is a scripts project, not an installable package.
|
# No build-system — this is a scripts project, not an installable package.
|
||||||
# Lock file is committed so installs are reproducible.
|
# Lock file is committed so installs are reproducible.
|
||||||
# Run `uv lock --upgrade` to bump dependencies.
|
# Run `uv lock --upgrade` to bump dependencies.
|
||||||
|
|
||||||
|
[tool.ruff]
|
||||||
|
line-length = 100
|
||||||
|
indent-width = 4
|
||||||
|
target-version = "py310"
|
||||||
|
exclude = [".git", ".venv", ".venv-cpu", "__pycache__"]
|
||||||
|
|
||||||
|
[tool.ruff.lint]
|
||||||
|
select = ["E", "F", "UP", "B", "SIM", "I"]
|
||||||
|
ignore = [
|
||||||
|
"E501", # line-too-long — handled by formatter
|
||||||
|
"B905", # zip() without strict= — existing code, lengths are known-correct
|
||||||
|
]
|
||||||
|
|
||||||
|
[tool.ruff.lint.per-file-ignores]
|
||||||
|
"download_model.py" = ["T201"] # allow print statements in CLI scripts
|
||||||
|
|
||||||
|
[tool.ruff.format]
|
||||||
|
quote-style = "double"
|
||||||
|
indent-style = "space"
|
||||||
|
line-ending = "lf"
|
||||||
|
|
||||||
[[tool.uv.index]]
|
[[tool.uv.index]]
|
||||||
name = "pytorch-cu124"
|
name = "pytorch-cu124"
|
||||||
url = "https://download.pytorch.org/whl/cu124"
|
url = "https://download.pytorch.org/whl/cu124"
|
||||||
|
|||||||
+90
-1
@@ -7,6 +7,11 @@
|
|||||||
# ./start.sh — CUDA mode (default, uses PyTorch CUDA 12.4 wheel, venv: .venv)
|
# ./start.sh — CUDA mode (default, uses PyTorch CUDA 12.4 wheel, venv: .venv)
|
||||||
# ./start.sh --cpu — CPU-only mode (uses PyPI CPU torch wheel, venv: .venv-cpu)
|
# ./start.sh --cpu — CPU-only mode (uses PyPI CPU torch wheel, venv: .venv-cpu)
|
||||||
#
|
#
|
||||||
|
# Optional CUDA acceleration:
|
||||||
|
# VIBEPOD_ENABLE_FLASH_ATTN=1 ./start.sh
|
||||||
|
# Installs a pre-built flash-attn wheel when the CUDA venv uses Python 3.12,
|
||||||
|
# torch 2.6.0, and CUDA 12.4 on Windows. Other platforms fall back to SDPA.
|
||||||
|
#
|
||||||
# The two modes maintain completely separate virtual environments so their torch
|
# The two modes maintain completely separate virtual environments so their torch
|
||||||
# installations never conflict. UV_PROJECT_ENVIRONMENT tells uv which venv to use;
|
# installations never conflict. UV_PROJECT_ENVIRONMENT tells uv which venv to use;
|
||||||
# --no-sources skips [tool.uv.sources] so the CPU run pulls the default PyPI torch wheel.
|
# --no-sources skips [tool.uv.sources] so the CPU run pulls the default PyPI torch wheel.
|
||||||
@@ -51,6 +56,19 @@ if ! command -v uv &>/dev/null; then
|
|||||||
exit 1
|
exit 1
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
validate_flash_attn() {
|
||||||
|
uv run python -c "import flash_attn; import triton; import transformers.modeling_utils" &>/dev/null
|
||||||
|
}
|
||||||
|
|
||||||
|
remove_broken_flash_attn() {
|
||||||
|
if uv run python -c "import importlib.util; raise SystemExit(0 if importlib.util.find_spec('flash_attn') else 1)" &>/dev/null; then
|
||||||
|
if ! validate_flash_attn; then
|
||||||
|
echo " Installed flash-attn is not usable in this environment; removing it."
|
||||||
|
uv pip uninstall flash-attn
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# 2. Sync Python environment
|
# 2. Sync Python environment
|
||||||
# CPU mode: use .venv-cpu and skip [tool.uv.sources] so uv pulls the
|
# CPU mode: use .venv-cpu and skip [tool.uv.sources] so uv pulls the
|
||||||
@@ -61,10 +79,64 @@ echo ""
|
|||||||
if $CPU_MODE; then
|
if $CPU_MODE; then
|
||||||
echo "--> Syncing CPU Python environment (.venv-cpu)..."
|
echo "--> Syncing CPU Python environment (.venv-cpu)..."
|
||||||
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
||||||
|
LOCK_BACKUP=""
|
||||||
|
if [[ -f uv.lock ]]; then
|
||||||
|
LOCK_BACKUP="$(mktemp)"
|
||||||
|
cp uv.lock "$LOCK_BACKUP"
|
||||||
|
fi
|
||||||
uv sync --no-sources
|
uv sync --no-sources
|
||||||
|
if [[ -n "$LOCK_BACKUP" ]]; then
|
||||||
|
cp "$LOCK_BACKUP" uv.lock
|
||||||
|
rm -f "$LOCK_BACKUP"
|
||||||
|
fi
|
||||||
else
|
else
|
||||||
echo "--> Syncing CUDA Python environment (.venv)..."
|
echo "--> Syncing CUDA Python environment (.venv)..."
|
||||||
uv sync
|
uv sync
|
||||||
|
|
||||||
|
remove_broken_flash_attn
|
||||||
|
|
||||||
|
if [[ "${VIBEPOD_ENABLE_FLASH_ATTN:-0}" == "1" ]]; then
|
||||||
|
echo ""
|
||||||
|
echo "--> Checking optional FlashAttention wheel..."
|
||||||
|
|
||||||
|
if validate_flash_attn; then
|
||||||
|
echo " flash-attn already installed and importable."
|
||||||
|
else
|
||||||
|
PY_TAG="$(uv run python -c "import sys; print(f'cp{sys.version_info.major}{sys.version_info.minor}')")"
|
||||||
|
TORCH_TAG="$(uv run python -c "import torch; print(torch.__version__.split('+', 1)[0])")"
|
||||||
|
CUDA_TAG="$(uv run python -c "import torch; print('cu' + torch.version.cuda.replace('.', ''))")"
|
||||||
|
|
||||||
|
FLASH_ATTN_WHEEL_URL=""
|
||||||
|
if [[ "$PY_TAG" == "cp312" && "$TORCH_TAG" == "2.6.0" && "$CUDA_TAG" == "cu124" ]]; then
|
||||||
|
case "$(uname -s)" in
|
||||||
|
MINGW*|CYGWIN*|MSYS*)
|
||||||
|
FLASH_ATTN_WHEEL_URL="https://huggingface.co/lldacing/flash-attention-windows-wheel/resolve/main/flash_attn-2.7.4%2Bcu124torch2.6.0cxx11abiFALSE-cp312-cp312-win_amd64.whl"
|
||||||
|
echo " Installing flash-attn for Python 3.12, torch 2.6.0, CUDA 12.4 (Windows)..."
|
||||||
|
;;
|
||||||
|
*)
|
||||||
|
echo " No pre-built flash-attn wheel available for this platform ($(uname -s))."
|
||||||
|
echo " Continuing with PyTorch SDPA attention."
|
||||||
|
;;
|
||||||
|
esac
|
||||||
|
else
|
||||||
|
echo " No known wheel for Python tag $PY_TAG, torch $TORCH_TAG, CUDA $CUDA_TAG."
|
||||||
|
echo " Continuing with PyTorch SDPA attention."
|
||||||
|
fi
|
||||||
|
|
||||||
|
if [[ -n "$FLASH_ATTN_WHEEL_URL" ]]; then
|
||||||
|
if uv pip install "$FLASH_ATTN_WHEEL_URL"; then
|
||||||
|
if validate_flash_attn; then
|
||||||
|
echo " flash-attn import check passed."
|
||||||
|
else
|
||||||
|
echo " flash-attn import check failed; removing it and continuing with SDPA."
|
||||||
|
uv pip uninstall flash-attn
|
||||||
|
fi
|
||||||
|
else
|
||||||
|
echo " flash-attn wheel install failed; continuing with SDPA."
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
|
fi
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
@@ -78,11 +150,28 @@ export PYTHONUTF8=1
|
|||||||
if $CPU_MODE; then
|
if $CPU_MODE; then
|
||||||
export VIBEPOD_DEVICE="cpu"
|
export VIBEPOD_DEVICE="cpu"
|
||||||
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
||||||
|
if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then
|
||||||
|
VIBEPOD_CPU_THREADS="$(uv run --no-sync --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
|
||||||
|
export VIBEPOD_CPU_THREADS
|
||||||
|
fi
|
||||||
|
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
||||||
|
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
||||||
|
# Dynamic INT8 quantization — on by default for CPU (~22% faster, prediction_head
|
||||||
|
# excluded automatically to avoid regression on small fixed-size tensors).
|
||||||
|
# Set VIBEPOD_QUANTIZE=0 to disable if you notice audio quality differences.
|
||||||
|
export VIBEPOD_QUANTIZE="${VIBEPOD_QUANTIZE:-1}"
|
||||||
|
# Optional CPU flags:
|
||||||
|
# VIBEPOD_ASYNC_DECODE=0 Disable async decode/tts_lm overlap (on by default)
|
||||||
|
# VIBEPOD_CPU_BF16=1 Force bfloat16 weights (auto-detected via AVX512_BF16)
|
||||||
|
# VIBEPOD_COMPILE=1 torch.compile hot paths (ineffective for autoregressive
|
||||||
|
# models on CPU — not recommended, kept for experimentation)
|
||||||
|
UV_RUN_ARGS=(--no-sync --no-sources)
|
||||||
else
|
else
|
||||||
export VIBEPOD_DEVICE="cuda"
|
export VIBEPOD_DEVICE="cuda"
|
||||||
|
UV_RUN_ARGS=()
|
||||||
fi
|
fi
|
||||||
|
|
||||||
exec uv run uvicorn vibevoice_server:app \
|
exec uv run "${UV_RUN_ARGS[@]}" uvicorn vibevoice_server:app \
|
||||||
--host 127.0.0.1 \
|
--host 127.0.0.1 \
|
||||||
--port 8000 \
|
--port 8000 \
|
||||||
--log-level info \
|
--log-level info \
|
||||||
|
|||||||
Generated
+41
-8
@@ -1479,7 +1479,7 @@ name = "nvidia-cudnn-cu12"
|
|||||||
version = "9.1.0.70"
|
version = "9.1.0.70"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "nvidia-cublas-cu12" },
|
{ name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
]
|
]
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 },
|
{ url = "https://files.pythonhosted.org/packages/9f/fd/713452cd72343f682b1c7b9321e23829f00b842ceaedcda96e742ea0b0b3/nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl", hash = "sha256:165764f44ef8c61fcdfdfdbe769d687e06374059fbb388b6c89ecb0e28793a6f", size = 664752741 },
|
||||||
@@ -1490,7 +1490,7 @@ name = "nvidia-cufft-cu12"
|
|||||||
version = "11.2.1.3"
|
version = "11.2.1.3"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "nvidia-nvjitlink-cu12" },
|
{ name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
]
|
]
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
|
{ url = "https://files.pythonhosted.org/packages/27/94/3266821f65b92b3138631e9c8e7fe1fb513804ac934485a8d05776e1dd43/nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:f083fc24912aa410be21fa16d157fed2055dab1cc4b6934a0e03cba69eb242b9", size = 211459117 },
|
||||||
@@ -1509,9 +1509,9 @@ name = "nvidia-cusolver-cu12"
|
|||||||
version = "11.6.1.9"
|
version = "11.6.1.9"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "nvidia-cublas-cu12" },
|
{ name = "nvidia-cublas-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
{ name = "nvidia-cusparse-cu12" },
|
{ name = "nvidia-cusparse-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
{ name = "nvidia-nvjitlink-cu12" },
|
{ name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
]
|
]
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
|
{ url = "https://files.pythonhosted.org/packages/3a/e1/5b9089a4b2a4790dfdea8b3a006052cfecff58139d5a4e34cb1a51df8d6f/nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl", hash = "sha256:19e33fa442bcfd085b3086c4ebf7e8debc07cfe01e11513cc6d332fd918ac260", size = 127936057 },
|
||||||
@@ -1522,7 +1522,7 @@ name = "nvidia-cusparse-cu12"
|
|||||||
version = "12.3.1.170"
|
version = "12.3.1.170"
|
||||||
source = { registry = "https://pypi.org/simple" }
|
source = { registry = "https://pypi.org/simple" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "nvidia-nvjitlink-cu12" },
|
{ name = "nvidia-nvjitlink-cu12", marker = "(python_full_version < '3.11' and sys_platform == 'emscripten') or (python_full_version < '3.11' and sys_platform == 'win32') or (sys_platform != 'emscripten' and sys_platform != 'win32')" },
|
||||||
]
|
]
|
||||||
wheels = [
|
wheels = [
|
||||||
{ url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
|
{ url = "https://files.pythonhosted.org/packages/db/f7/97a9ea26ed4bbbfc2d470994b8b4f338ef663be97b8f677519ac195e113d/nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl", hash = "sha256:ea4f11a2904e2a8dc4b1833cc1b5181cde564edd0d5cd33e3c168eff2d1863f1", size = 207454763 },
|
||||||
@@ -2394,6 +2394,31 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654 },
|
{ url = "https://files.pythonhosted.org/packages/82/3b/64d4899d73f91ba49a8c18a8ff3f0ea8f1c1d75481760df8c68ef5235bf5/rich-15.0.0-py3-none-any.whl", hash = "sha256:33bd4ef74232fb73fe9279a257718407f169c09b78a87ad3d296f548e27de0bb", size = 310654 },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "ruff"
|
||||||
|
version = "0.15.12"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/99/43/3291f1cc9106f4c63bdce7a8d0df5047fe8422a75b091c16b5e9355e0b11/ruff-0.15.12.tar.gz", hash = "sha256:ecea26adb26b4232c0c2ca19ccbc0083a68344180bba2a600605538ce51a40a6", size = 4643852 }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c3/6e/e78ffb61d4686f3d96ba3df2c801161843746dcbcbb17a1e927d4829312b/ruff-0.15.12-py3-none-linux_armv6l.whl", hash = "sha256:f86f176e188e94d6bdbc09f09bfd9dc729059ad93d0e7390b5a73efe19f8861c", size = 10640713 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ae/08/a317bc231fb9e7b93e4ef3089501e51922ff88d6936ce5cf870c4fe55419/ruff-0.15.12-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:e3bcd123364c3770b8e1b7baaf343cc99a35f197c5c6e8af79015c666c423a6c", size = 11069267 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/aa/a4/f828e9718d3dce1f5f11c39c4f65afd32783c8b2aebb2e3d259e492c47bd/ruff-0.15.12-py3-none-macosx_11_0_arm64.whl", hash = "sha256:fe87510d000220aa1ed530d4448a7c696a0cae1213e5ec30e5874287b66557b5", size = 10397182 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/71/e0/3310fc6d1b5e1fdea22bf3b1b807c7e187b581021b0d7d4514cccdb5fb71/ruff-0.15.12-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84a1630093121375a3e2a95b4a6dc7b59e2b4ee76216e32d81aae550a832d002", size = 10758012 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/11/c1/a606911aee04c324ddaa883ae418f3569792fd3c4a10c50e0dd0a2311e1e/ruff-0.15.12-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fb129f40f114f089ebe0ca56c0d251cf2061b17651d464bb6478dc01e69f11f5", size = 10447479 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/9d/68/4201e8444f0894f21ab4aeeaee68aa4f10b51613514a20d80bd628d57e88/ruff-0.15.12-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b0c862b172d695db7598426b8af465e7e9ac00a3ea2a3630ee67eb82e366aaa6", size = 11234040 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/34/ff/8a6d6cf4ccc23fd67060874e832c18919d1557a0611ebef03fdb01fff11e/ruff-0.15.12-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2849ea9f3484c3aca43a82f484210370319e7170df4dfe4843395ddf6c57bc33", size = 12087377 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/85/f6/c669cf73f5152f623d34e69866a46d5e6185816b19fcd5b6dd8a2d299922/ruff-0.15.12-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9e77c7e51c07fe396826d5969a5b846d9cd4c402535835fb6e21ce8b28fef847", size = 11367784 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/e8/39/c61d193b8a1daaa8977f7dea9e8d8ba866e02ea7b65d32f6861693aa4c12/ruff-0.15.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:83b2f4f2f3b1026b5fb449b467d9264bf22067b600f7b6f41fc5958909f449d0", size = 11344088 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c2/8d/49afab3645e31e12c590acb6d3b5b69d7aab5b81926dbaf7461f9441f37a/ruff-0.15.12-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9ba3b8f1afd7e2e43d8943e55f249e13f9682fde09711644a6e7290eb4f3e339", size = 11271770 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/46/06/33f41fe94403e2b755481cdfb9b7ef3e4e0ed031c4581124658d935d52b4/ruff-0.15.12-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:e852ba9fdc890655e1d78f2df1499efbe0e54126bd405362154a75e2bde159c5", size = 10719355 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/0d/59/18aa4e014debbf559670e4048e39260a85c7fcee84acfd761ac01e7b8d35/ruff-0.15.12-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:dd8aed930da53780d22fc70bdf84452c843cf64f8cb4eb38984319c24c5cd5fd", size = 10462758 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/25/e7/cc9f16fd0f3b5fddcbd7ec3d6ae30c8f3fde1047f32a4093a98d633c6570/ruff-0.15.12-py3-none-musllinux_1_2_i686.whl", hash = "sha256:01da3988d225628b709493d7dc67c3b9b12c0210016b08690ef9bd27970b262b", size = 10953498 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/72/7a/a9ba7f98c7a575978698f4230c5e8cc54bbc761af34f560818f933dafa0c/ruff-0.15.12-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:9cae0f92bd5700d1213188b31cd3bdd2b315361296d10b96b8e2337d3d11f53e", size = 11447765 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/ea/f9/0ae446942c846b8266059ad8a30702a35afae55f5cdc54c5adf8d7afdc27/ruff-0.15.12-py3-none-win32.whl", hash = "sha256:d0185894e038d7043ba8fd6aee7499ece6462dc0ea9f1e260c7451807c714c20", size = 10657277 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/33/f1/9614e03e1cdcbf9437570b5400ced8a720b5db22b28d8e0f1bda429f660d/ruff-0.15.12-py3-none-win_amd64.whl", hash = "sha256:c87a162d61ab3adca47c03f7f717c68672edec7d1b5499e652331780fe74950d", size = 11837758 },
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/c0/98/6beb4b351e472e5f4c4613f7c35a5290b8be2497e183825310c4c3a3984b/ruff-0.15.12-py3-none-win_arm64.whl", hash = "sha256:a538f7a82d061cee7be55542aca1d86d1393d55d81d4fcc314370f4340930d4f", size = 11120821 },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "safehttpx"
|
name = "safehttpx"
|
||||||
version = "0.1.7"
|
version = "0.1.7"
|
||||||
@@ -3100,6 +3125,11 @@ dependencies = [
|
|||||||
{ name = "vibevoice" },
|
{ name = "vibevoice" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[package.dev-dependencies]
|
||||||
|
dev = [
|
||||||
|
{ name = "ruff" },
|
||||||
|
]
|
||||||
|
|
||||||
[package.metadata]
|
[package.metadata]
|
||||||
requires-dist = [
|
requires-dist = [
|
||||||
{ name = "fastapi", specifier = ">=0.111.0" },
|
{ name = "fastapi", specifier = ">=0.111.0" },
|
||||||
@@ -3109,13 +3139,16 @@ requires-dist = [
|
|||||||
{ name = "torch", specifier = ">=2.0.0", index = "https://download.pytorch.org/whl/cu124" },
|
{ name = "torch", specifier = ">=2.0.0", index = "https://download.pytorch.org/whl/cu124" },
|
||||||
{ name = "transformers", specifier = "==4.51.3" },
|
{ name = "transformers", specifier = "==4.51.3" },
|
||||||
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0" },
|
{ name = "uvicorn", extras = ["standard"], specifier = ">=0.29.0" },
|
||||||
{ name = "vibevoice", git = "https://github.com/microsoft/VibeVoice.git" },
|
{ name = "vibevoice", git = "https://github.com/JezzWTF/VibeVoice.git" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[package.metadata.requires-dev]
|
||||||
|
dev = [{ name = "ruff", specifier = ">=0.11.0" }]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "vibevoice"
|
name = "vibevoice"
|
||||||
version = "1.0.0"
|
version = "1.0.0"
|
||||||
source = { git = "https://github.com/microsoft/VibeVoice.git#e73d1e17c3754f046352014856a922f8208fb5d3" }
|
source = { git = "https://github.com/JezzWTF/VibeVoice.git#fe832f20e3d1638594f551a08f02253f14408dbd" }
|
||||||
dependencies = [
|
dependencies = [
|
||||||
{ name = "absl-py" },
|
{ name = "absl-py" },
|
||||||
{ name = "accelerate" },
|
{ name = "accelerate" },
|
||||||
|
|||||||
+527
-51
@@ -20,17 +20,22 @@ Device selection:
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import base64
|
import base64
|
||||||
|
import concurrent.futures
|
||||||
import copy
|
import copy
|
||||||
import functools
|
import functools
|
||||||
|
import importlib.util
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
|
import platform
|
||||||
import threading
|
import threading
|
||||||
import time
|
import time
|
||||||
|
import types
|
||||||
import urllib.request
|
import urllib.request
|
||||||
|
from collections.abc import AsyncGenerator
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import AsyncGenerator, Literal, Optional
|
from typing import Literal
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from fastapi import FastAPI, HTTPException, Request
|
from fastapi import FastAPI, HTTPException, Request
|
||||||
@@ -46,8 +51,7 @@ SAMPLE_RATE = 24_000
|
|||||||
|
|
||||||
VOICES_DIR = Path(__file__).parent / "voices" / "streaming_model"
|
VOICES_DIR = Path(__file__).parent / "voices" / "streaming_model"
|
||||||
VOICE_BASE_URL = (
|
VOICE_BASE_URL = (
|
||||||
"https://raw.githubusercontent.com/microsoft/VibeVoice/main"
|
"https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model"
|
||||||
"/demo/voices/streaming_model"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
EN_VOICES: dict[str, str] = {
|
EN_VOICES: dict[str, str] = {
|
||||||
@@ -62,6 +66,11 @@ DEFAULT_SPEAKER = "carter"
|
|||||||
|
|
||||||
_IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
|
_IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
|
||||||
|
|
||||||
|
# ── Pipeline executor ──────────────────────────────────────────────────────────
|
||||||
|
# Overlaps acoustic_decode with forward_tts_lm on a background thread (1 worker).
|
||||||
|
|
||||||
|
_decode_executor: concurrent.futures.ThreadPoolExecutor | None = None
|
||||||
|
|
||||||
# ── Device selection ────────────────────────────────────────────────────────────
|
# ── Device selection ────────────────────────────────────────────────────────────
|
||||||
# VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
|
# VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
|
||||||
# Falls back to auto-detection if not set.
|
# Falls back to auto-detection if not set.
|
||||||
@@ -106,6 +115,38 @@ def _env_float(name: str, default: float) -> float:
|
|||||||
return default
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def _cpu_supports_bf16() -> bool:
|
||||||
|
"""Return True if the CPU has AVX512_BF16 hardware support."""
|
||||||
|
return (
|
||||||
|
hasattr(torch, "cpu")
|
||||||
|
and hasattr(torch.cpu, "is_avx512_bf16_supported")
|
||||||
|
and torch.cpu.is_avx512_bf16_supported()
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _configure_cpu_runtime() -> dict[str, object]:
|
||||||
|
logical_cpus = os.cpu_count() or 1
|
||||||
|
default_threads = max(1, logical_cpus // 2) if platform.system() == "Windows" else logical_cpus
|
||||||
|
intra_threads = _env_int("VIBEPOD_CPU_THREADS", default_threads)
|
||||||
|
interop_threads = _env_int("VIBEPOD_CPU_INTEROP_THREADS", 1)
|
||||||
|
mkldnn_enabled = os.environ.get("VIBEPOD_CPU_MKLDNN", "1").strip() != "0"
|
||||||
|
|
||||||
|
torch.set_num_threads(max(1, intra_threads))
|
||||||
|
try:
|
||||||
|
torch.set_num_interop_threads(max(1, interop_threads))
|
||||||
|
except RuntimeError as exc:
|
||||||
|
logger.warning("Could not set CPU inter-op threads: %s", exc)
|
||||||
|
|
||||||
|
torch.backends.mkldnn.enabled = mkldnn_enabled
|
||||||
|
return {
|
||||||
|
"logical_cpus": logical_cpus,
|
||||||
|
"threads": torch.get_num_threads(),
|
||||||
|
"interop_threads": torch.get_num_interop_threads(),
|
||||||
|
"mkldnn_available": torch.backends.mkldnn.is_available(),
|
||||||
|
"mkldnn_enabled": torch.backends.mkldnn.enabled,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# ── Global state ────────────────────────────────────────────────────────────────
|
# ── Global state ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
ModelStatus = Literal["downloading", "loading", "online", "error"]
|
ModelStatus = Literal["downloading", "loading", "online", "error"]
|
||||||
@@ -114,7 +155,7 @@ _processor = None
|
|||||||
_model = None
|
_model = None
|
||||||
_device: str = "cpu"
|
_device: str = "cpu"
|
||||||
_model_status: ModelStatus = "loading"
|
_model_status: ModelStatus = "loading"
|
||||||
_model_error: Optional[str] = None
|
_model_error: str | None = None
|
||||||
_voice_presets: dict[str, object] = {}
|
_voice_presets: dict[str, object] = {}
|
||||||
_load_lock = threading.Lock()
|
_load_lock = threading.Lock()
|
||||||
_generation_lock = asyncio.Lock()
|
_generation_lock = asyncio.Lock()
|
||||||
@@ -161,9 +202,7 @@ def _is_model_cached() -> bool:
|
|||||||
try:
|
try:
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
snapshot_download(
|
snapshot_download(MODEL_ID, local_files_only=True, ignore_patterns=_IGNORE_PATTERNS)
|
||||||
MODEL_ID, local_files_only=True, ignore_patterns=_IGNORE_PATTERNS
|
|
||||||
)
|
|
||||||
return True
|
return True
|
||||||
except Exception:
|
except Exception:
|
||||||
return False
|
return False
|
||||||
@@ -172,9 +211,7 @@ def _is_model_cached() -> bool:
|
|||||||
def _download_model() -> None:
|
def _download_model() -> None:
|
||||||
from huggingface_hub import snapshot_download
|
from huggingface_hub import snapshot_download
|
||||||
|
|
||||||
token: Optional[str] = os.environ.get("HF_TOKEN") or os.environ.get(
|
token: str | None = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
|
||||||
"HUGGINGFACE_TOKEN"
|
|
||||||
)
|
|
||||||
DlTqdm = _make_dl_tqdm()
|
DlTqdm = _make_dl_tqdm()
|
||||||
logger.info("Model not cached — downloading %s...", MODEL_ID)
|
logger.info("Model not cached — downloading %s...", MODEL_ID)
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
@@ -188,7 +225,7 @@ def _download_model() -> None:
|
|||||||
|
|
||||||
def _download_voices() -> None:
|
def _download_voices() -> None:
|
||||||
VOICES_DIR.mkdir(parents=True, exist_ok=True)
|
VOICES_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
for name, filename in EN_VOICES.items():
|
for _name, filename in EN_VOICES.items():
|
||||||
dest = VOICES_DIR / filename
|
dest = VOICES_DIR / filename
|
||||||
if not dest.exists():
|
if not dest.exists():
|
||||||
url = f"{VOICE_BASE_URL}/{filename}"
|
url = f"{VOICE_BASE_URL}/{filename}"
|
||||||
@@ -211,8 +248,56 @@ def _init_processor():
|
|||||||
|
|
||||||
def _init_model(device: str):
|
def _init_model(device: str):
|
||||||
logger.info("Loading model on %s...", device)
|
logger.info("Loading model on %s...", device)
|
||||||
load_dtype = torch.bfloat16 if device == "cuda" else torch.float32
|
if device == "cuda":
|
||||||
attn_impl = "flash_attention_2" if device == "cuda" else "sdpa"
|
torch.set_float32_matmul_precision("high")
|
||||||
|
torch.backends.cuda.matmul.allow_tf32 = True
|
||||||
|
torch.backends.cudnn.allow_tf32 = True
|
||||||
|
torch.backends.cudnn.benchmark = True
|
||||||
|
torch.backends.cuda.enable_flash_sdp(True)
|
||||||
|
torch.backends.cuda.enable_mem_efficient_sdp(True)
|
||||||
|
torch.backends.cuda.enable_math_sdp(True)
|
||||||
|
torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
|
||||||
|
logger.info(
|
||||||
|
"PyTorch SDPA backends: flash=%s, mem_efficient=%s, math=%s",
|
||||||
|
torch.backends.cuda.flash_sdp_enabled(),
|
||||||
|
torch.backends.cuda.mem_efficient_sdp_enabled(),
|
||||||
|
torch.backends.cuda.math_sdp_enabled(),
|
||||||
|
)
|
||||||
|
elif device == "cpu":
|
||||||
|
torch.set_float32_matmul_precision("medium")
|
||||||
|
logger.info("CPU runtime configuration: %s", _configure_cpu_runtime())
|
||||||
|
|
||||||
|
cuda_dtype = os.environ.get("VIBEPOD_CUDA_DTYPE", "bf16").lower()
|
||||||
|
if device == "cuda" and cuda_dtype == "fp16":
|
||||||
|
load_dtype = torch.float16
|
||||||
|
elif device == "cuda":
|
||||||
|
load_dtype = torch.bfloat16
|
||||||
|
else:
|
||||||
|
cpu_bf16_env = os.environ.get("VIBEPOD_CPU_BF16", "auto").lower()
|
||||||
|
if cpu_bf16_env == "1":
|
||||||
|
load_dtype = torch.bfloat16
|
||||||
|
logger.info("CPU BF16 forced via VIBEPOD_CPU_BF16=1")
|
||||||
|
elif cpu_bf16_env == "0":
|
||||||
|
load_dtype = torch.float32
|
||||||
|
logger.info("CPU float32 forced via VIBEPOD_CPU_BF16=0")
|
||||||
|
elif _cpu_supports_bf16():
|
||||||
|
load_dtype = torch.bfloat16
|
||||||
|
logger.info("AVX512_BF16 detected — loading model in bfloat16")
|
||||||
|
else:
|
||||||
|
load_dtype = torch.float32
|
||||||
|
logger.info("No AVX512_BF16 — using float32 (set VIBEPOD_CPU_BF16=1 to override)")
|
||||||
|
logger.info("Loading model weights with dtype %s", load_dtype)
|
||||||
|
requested_attn_impl = os.environ.get("VIBEPOD_ATTN_IMPL", "auto").lower()
|
||||||
|
has_flash_attn = importlib.util.find_spec("flash_attn") is not None
|
||||||
|
if requested_attn_impl in {"eager", "sdpa"}:
|
||||||
|
attn_impl = requested_attn_impl
|
||||||
|
elif requested_attn_impl == "flash_attention_2":
|
||||||
|
attn_impl = "flash_attention_2" if has_flash_attn else "sdpa"
|
||||||
|
else:
|
||||||
|
attn_impl = "flash_attention_2" if device == "cuda" and has_flash_attn else "sdpa"
|
||||||
|
logger.info("Using Transformers attention implementation: %s", attn_impl)
|
||||||
|
if device == "cuda" and not has_flash_attn:
|
||||||
|
logger.info("flash_attn is not installed; using PyTorch SDPA attention.")
|
||||||
|
|
||||||
from vibevoice.modular.modeling_vibevoice_streaming_inference import (
|
from vibevoice.modular.modeling_vibevoice_streaming_inference import (
|
||||||
VibeVoiceStreamingForConditionalGenerationInference,
|
VibeVoiceStreamingForConditionalGenerationInference,
|
||||||
@@ -225,9 +310,13 @@ def _init_model(device: str):
|
|||||||
device_map=device,
|
device_map=device,
|
||||||
attn_implementation=attn_impl,
|
attn_implementation=attn_impl,
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
|
if attn_impl == "sdpa":
|
||||||
|
raise
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Model load with %s failed; falling back to sdpa", attn_impl, exc_info=True
|
"Model load with %s failed (%s); falling back to sdpa",
|
||||||
|
attn_impl,
|
||||||
|
exc,
|
||||||
)
|
)
|
||||||
model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
|
model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
|
||||||
MODEL_ID,
|
MODEL_ID,
|
||||||
@@ -237,10 +326,272 @@ def _init_model(device: str):
|
|||||||
)
|
)
|
||||||
|
|
||||||
model.eval()
|
model.eval()
|
||||||
|
if device == "cpu":
|
||||||
|
model = _apply_cpu_optimizations(model)
|
||||||
model.set_ddpm_inference_steps(num_steps=_config["default_inference_steps"])
|
model.set_ddpm_inference_steps(num_steps=_config["default_inference_steps"])
|
||||||
|
_install_generation_optimizations(model)
|
||||||
|
if device == "cpu":
|
||||||
|
# Must run after _install_generation_optimizations so the async wrapper
|
||||||
|
# sits outside the profiling wrapper (VibeVoice calls async → profiling → real decode).
|
||||||
|
_install_cpu_pipeline_optimizations(model)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def _apply_cpu_optimizations(model: object) -> object:
|
||||||
|
"""Apply optional post-load CPU optimizations. Returns (possibly new) model object."""
|
||||||
|
|
||||||
|
do_quantize = os.environ.get("VIBEPOD_QUANTIZE", "0") == "1"
|
||||||
|
do_compile = os.environ.get("VIBEPOD_COMPILE", "0") == "1"
|
||||||
|
|
||||||
|
if do_quantize:
|
||||||
|
logger.info("Applying dynamic INT8 quantization to Linear layers...")
|
||||||
|
try:
|
||||||
|
import torch.ao.quantization
|
||||||
|
|
||||||
|
# The diffusion prediction_head operates on small fixed-size tensors where
|
||||||
|
# INT8 pack/unpack overhead exceeds the matmul savings (~+20% regression in
|
||||||
|
# testing). Save and restore it so it stays in float32.
|
||||||
|
saved_prediction_head = None
|
||||||
|
if hasattr(model, "model") and hasattr(model.model, "prediction_head"):
|
||||||
|
saved_prediction_head = model.model.prediction_head
|
||||||
|
del model.model.prediction_head
|
||||||
|
|
||||||
|
model = torch.ao.quantization.quantize_dynamic(
|
||||||
|
model, {torch.nn.Linear}, dtype=torch.qint8
|
||||||
|
)
|
||||||
|
|
||||||
|
if saved_prediction_head is not None:
|
||||||
|
model.model.prediction_head = saved_prediction_head
|
||||||
|
logger.info(
|
||||||
|
"Dynamic INT8 quantization applied (prediction_head excluded — stays float32)."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.info("Dynamic INT8 quantization applied.")
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("Dynamic quantization failed: %s — skipping", exc)
|
||||||
|
|
||||||
|
if do_compile:
|
||||||
|
# torch.compile with inductor on CPU is ineffective for autoregressive TTS:
|
||||||
|
# each token step produces a unique input shape, so every step triggers a new
|
||||||
|
# kernel compile event rather than reusing compiled code. Kept as an escape
|
||||||
|
# hatch but not recommended.
|
||||||
|
compile_mode = os.environ.get("VIBEPOD_COMPILE_MODE", "reduce-overhead")
|
||||||
|
logger.info(
|
||||||
|
"torch.compile enabled (mode=%s) — NOTE: limited benefit for autoregressive"
|
||||||
|
" models on CPU due to dynamic sequence lengths.",
|
||||||
|
compile_mode,
|
||||||
|
)
|
||||||
|
_compile_targets: list[tuple[str, object, str, bool]] = [
|
||||||
|
("forward_tts_lm", model, "forward_tts_lm", True),
|
||||||
|
]
|
||||||
|
if hasattr(model, "model"):
|
||||||
|
inner = model.model
|
||||||
|
if hasattr(inner, "prediction_head"):
|
||||||
|
_compile_targets.append(("prediction_head", inner, "prediction_head", False))
|
||||||
|
if hasattr(inner, "acoustic_tokenizer") and hasattr(inner.acoustic_tokenizer, "decode"):
|
||||||
|
_compile_targets.append(
|
||||||
|
("acoustic_tokenizer.decode", inner.acoustic_tokenizer, "decode", False)
|
||||||
|
)
|
||||||
|
|
||||||
|
for label, obj, attr, dynamic in _compile_targets:
|
||||||
|
try:
|
||||||
|
compiled = torch.compile(
|
||||||
|
getattr(obj, attr),
|
||||||
|
backend="inductor",
|
||||||
|
mode=compile_mode,
|
||||||
|
dynamic=dynamic,
|
||||||
|
)
|
||||||
|
setattr(obj, attr, compiled)
|
||||||
|
logger.info(" compiled: %s", label)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning(" torch.compile failed for %s: %s — skipping", label, exc)
|
||||||
|
|
||||||
|
return model
|
||||||
|
|
||||||
|
|
||||||
|
def _install_generation_optimizations(model: object) -> None:
|
||||||
|
"""Patch VibeVoice hot paths without changing model quality settings."""
|
||||||
|
|
||||||
|
def profile_enabled() -> bool:
|
||||||
|
return os.environ.get("VIBEPOD_PROFILE_GENERATION", "0") == "1"
|
||||||
|
|
||||||
|
def profile_sync() -> None:
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
torch.cuda.synchronize()
|
||||||
|
|
||||||
|
def profile_record(self, key: str, elapsed: float) -> None:
|
||||||
|
stats = getattr(self, "_vibepod_profile", None)
|
||||||
|
if stats is None:
|
||||||
|
stats = {}
|
||||||
|
self._vibepod_profile = stats
|
||||||
|
bucket = stats.setdefault(key, {"count": 0, "seconds": 0.0})
|
||||||
|
bucket["count"] += 1
|
||||||
|
bucket["seconds"] += elapsed
|
||||||
|
|
||||||
|
def timed_method(self, key: str, fn, *args, **kwargs):
|
||||||
|
if not profile_enabled():
|
||||||
|
return fn(*args, **kwargs)
|
||||||
|
profile_sync()
|
||||||
|
started = time.perf_counter()
|
||||||
|
result = fn(*args, **kwargs)
|
||||||
|
profile_sync()
|
||||||
|
profile_record(self, key, time.perf_counter() - started)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def prepare_noise_scheduler(self):
|
||||||
|
scheduler = self.model.noise_scheduler
|
||||||
|
cache_key = self.ddpm_inference_steps
|
||||||
|
cache = getattr(self, "_vibepod_scheduler_cache", {})
|
||||||
|
cached = cache.get(cache_key)
|
||||||
|
|
||||||
|
if cached is None:
|
||||||
|
scheduler.set_timesteps(self.ddpm_inference_steps)
|
||||||
|
cached = {
|
||||||
|
"num_inference_steps": scheduler.num_inference_steps,
|
||||||
|
"timesteps": scheduler.timesteps,
|
||||||
|
"sigmas": scheduler.sigmas,
|
||||||
|
}
|
||||||
|
cache[cache_key] = cached
|
||||||
|
self._vibepod_scheduler_cache = cache
|
||||||
|
else:
|
||||||
|
scheduler.num_inference_steps = cached["num_inference_steps"]
|
||||||
|
scheduler.timesteps = cached["timesteps"]
|
||||||
|
scheduler.sigmas = cached["sigmas"]
|
||||||
|
scheduler.model_outputs = [None] * scheduler.config.solver_order
|
||||||
|
scheduler.lower_order_nums = 0
|
||||||
|
scheduler._step_index = None
|
||||||
|
scheduler._begin_index = None
|
||||||
|
|
||||||
|
return scheduler
|
||||||
|
|
||||||
|
def sample_speech_tokens_optimized(self, condition, neg_condition, cfg_scale=3.0):
|
||||||
|
scheduler = prepare_noise_scheduler(self)
|
||||||
|
|
||||||
|
condition = torch.cat([condition, neg_condition], dim=0).to(
|
||||||
|
self.model.prediction_head.device
|
||||||
|
)
|
||||||
|
batch_size = condition.shape[0] // 2
|
||||||
|
speech = torch.randn(batch_size, self.config.acoustic_vae_dim).to(condition)
|
||||||
|
t_batch_cache_key = (
|
||||||
|
self.ddpm_inference_steps,
|
||||||
|
condition.device.type,
|
||||||
|
condition.device.index,
|
||||||
|
condition.dtype,
|
||||||
|
batch_size,
|
||||||
|
)
|
||||||
|
t_batch_cache = getattr(self, "_vibepod_t_batch_cache", {})
|
||||||
|
t_batches = t_batch_cache.get(t_batch_cache_key)
|
||||||
|
if t_batches is None or len(t_batches) != len(scheduler.timesteps):
|
||||||
|
t_batches = [
|
||||||
|
t.repeat(condition.shape[0]).to(device=condition.device, dtype=condition.dtype)
|
||||||
|
for t in scheduler.timesteps
|
||||||
|
]
|
||||||
|
t_batch_cache[t_batch_cache_key] = t_batches
|
||||||
|
self._vibepod_t_batch_cache = t_batch_cache
|
||||||
|
|
||||||
|
for t, t_batch in zip(scheduler.timesteps, t_batches):
|
||||||
|
if batch_size == 1:
|
||||||
|
combined = speech.expand(condition.shape[0], -1)
|
||||||
|
else:
|
||||||
|
combined = torch.cat([speech, speech], dim=0)
|
||||||
|
if profile_enabled():
|
||||||
|
profile_sync()
|
||||||
|
started = time.perf_counter()
|
||||||
|
eps = self.model.prediction_head(combined, t_batch, condition=condition)
|
||||||
|
if profile_enabled():
|
||||||
|
profile_sync()
|
||||||
|
profile_record(self, "diffusion_prediction_head", time.perf_counter() - started)
|
||||||
|
cond_eps, uncond_eps = torch.split(eps, batch_size, dim=0)
|
||||||
|
guided_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
|
||||||
|
if profile_enabled():
|
||||||
|
started = time.perf_counter()
|
||||||
|
speech = scheduler.step(guided_eps, t, speech).prev_sample
|
||||||
|
if profile_enabled():
|
||||||
|
profile_record(self, "diffusion_scheduler_step", time.perf_counter() - started)
|
||||||
|
|
||||||
|
return speech
|
||||||
|
|
||||||
|
forward_lm = model.forward_lm
|
||||||
|
forward_tts_lm = model.forward_tts_lm
|
||||||
|
acoustic_decode = model.model.acoustic_tokenizer.decode
|
||||||
|
|
||||||
|
def forward_lm_profiled(*args, **kwargs):
|
||||||
|
return timed_method(model, "forward_lm", forward_lm, *args, **kwargs)
|
||||||
|
|
||||||
|
def forward_tts_lm_profiled(*args, **kwargs):
|
||||||
|
return timed_method(model, "forward_tts_lm", forward_tts_lm, *args, **kwargs)
|
||||||
|
|
||||||
|
def acoustic_decode_profiled(*args, **kwargs):
|
||||||
|
return timed_method(model, "acoustic_decode", acoustic_decode, *args, **kwargs)
|
||||||
|
|
||||||
|
model.forward_lm = forward_lm_profiled
|
||||||
|
model.forward_tts_lm = forward_tts_lm_profiled
|
||||||
|
model.model.acoustic_tokenizer.decode = acoustic_decode_profiled
|
||||||
|
model.sample_speech_tokens = types.MethodType(sample_speech_tokens_optimized, model)
|
||||||
|
logger.info("Installed VibeVoice generation hot-path optimizations.")
|
||||||
|
|
||||||
|
|
||||||
|
def _install_cpu_pipeline_optimizations(model: object) -> None:
|
||||||
|
"""Attach the decode executor to the model for the optimised generate() loop.
|
||||||
|
|
||||||
|
The JezzWTF/VibeVoice fork's generate() checks for two optional attributes:
|
||||||
|
|
||||||
|
model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) that
|
||||||
|
overlaps acoustic_decode with acoustic_connector + forward_tts_lm.
|
||||||
|
Profiling showed this hides ~72s of decode cost behind tts_lm work,
|
||||||
|
capturing ~96% of the theoretical overlap savings.
|
||||||
|
|
||||||
|
model._vibepod_cfg_executor — intentionally NOT set. Parallel pos/neg
|
||||||
|
forward_tts_lm via a second thread causes MKL OpenMP thread-pool
|
||||||
|
contention on CPU: both threads compete for the same OMP worker pool,
|
||||||
|
making each call slower rather than faster. Net effect: ~6% regression.
|
||||||
|
The hook remains in the fork for potential GPU or future use.
|
||||||
|
|
||||||
|
Attributes default to None, so the fork's generate() falls back to the
|
||||||
|
original sequential behaviour on CUDA or any non-VibePod install.
|
||||||
|
"""
|
||||||
|
global _decode_executor
|
||||||
|
|
||||||
|
if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1":
|
||||||
|
logger.info("CPU async decode disabled via VIBEPOD_ASYNC_DECODE=0.")
|
||||||
|
return
|
||||||
|
|
||||||
|
_decode_executor = concurrent.futures.ThreadPoolExecutor(
|
||||||
|
max_workers=1, thread_name_prefix="vibepod-decode"
|
||||||
|
)
|
||||||
|
model._vibepod_decode_executor = _decode_executor
|
||||||
|
logger.info(
|
||||||
|
"CPU pipeline: decode executor attached — acoustic_decode overlaps "
|
||||||
|
"tts_lm. Disable with VIBEPOD_ASYNC_DECODE=0."
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _model_float_dtype() -> torch.dtype:
|
||||||
|
try:
|
||||||
|
return next(_model.parameters()).dtype
|
||||||
|
except StopIteration:
|
||||||
|
return torch.float32
|
||||||
|
|
||||||
|
|
||||||
|
def _move_cached_prompt(value: object, device: str, dtype: torch.dtype) -> object:
|
||||||
|
if torch.is_tensor(value):
|
||||||
|
if torch.is_floating_point(value):
|
||||||
|
return value.to(device=device, dtype=dtype)
|
||||||
|
return value.to(device=device)
|
||||||
|
if isinstance(value, dict):
|
||||||
|
for k in list(value.keys()):
|
||||||
|
value[k] = _move_cached_prompt(value[k], device, dtype)
|
||||||
|
return value
|
||||||
|
if isinstance(value, list):
|
||||||
|
return [_move_cached_prompt(v, device, dtype) for v in value]
|
||||||
|
if isinstance(value, tuple):
|
||||||
|
return tuple(_move_cached_prompt(v, device, dtype) for v in value)
|
||||||
|
if hasattr(value, "key_cache") and hasattr(value, "value_cache"):
|
||||||
|
value.key_cache = [_move_cached_prompt(t, device, dtype) for t in value.key_cache]
|
||||||
|
value.value_cache = [_move_cached_prompt(t, device, dtype) for t in value.value_cache]
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
def _load_voice_presets(device: str) -> dict[str, object]:
|
def _load_voice_presets(device: str) -> dict[str, object]:
|
||||||
presets = {}
|
presets = {}
|
||||||
for name, filename in EN_VOICES.items():
|
for name, filename in EN_VOICES.items():
|
||||||
@@ -273,19 +624,33 @@ def _load_model_sync() -> None:
|
|||||||
is_cpu = _device == "cpu"
|
is_cpu = _device == "cpu"
|
||||||
_config["device"] = _device
|
_config["device"] = _device
|
||||||
_config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1)
|
_config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1)
|
||||||
_config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 5.0 if is_cpu else 2.0)
|
_config["prebuffer_secs"] = _env_float(
|
||||||
_config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 1.0 if is_cpu else 0.4)
|
"VIBEPOD_PREBUFFER_SECS", 24.0 if is_cpu else 5.0
|
||||||
_config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 2.5 if is_cpu else 1.5)
|
)
|
||||||
_config["default_inference_steps"] = _env_int("VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10)
|
_config["rebuffer_threshold_secs"] = _env_float(
|
||||||
|
"VIBEPOD_REBUFFER_THRESHOLD_SECS", 2.0 if is_cpu else 1.0
|
||||||
|
)
|
||||||
|
_config["resume_threshold_secs"] = _env_float(
|
||||||
|
"VIBEPOD_RESUME_THRESHOLD_SECS", 12.0 if is_cpu else 3.0
|
||||||
|
)
|
||||||
|
_config["default_inference_steps"] = _env_int(
|
||||||
|
"VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10
|
||||||
|
)
|
||||||
|
if is_cpu:
|
||||||
|
logical_cpus = os.cpu_count() or 1
|
||||||
|
_config["cpu_threads"] = _env_int(
|
||||||
|
"VIBEPOD_CPU_THREADS",
|
||||||
|
max(1, logical_cpus // 2) if platform.system() == "Windows" else logical_cpus,
|
||||||
|
)
|
||||||
|
_config["cpu_interop_threads"] = _env_int("VIBEPOD_CPU_INTEROP_THREADS", 1)
|
||||||
|
_config["cpu_mkldnn"] = os.environ.get("VIBEPOD_CPU_MKLDNN", "1").strip() != "0"
|
||||||
|
|
||||||
_processor = _init_processor()
|
_processor = _init_processor()
|
||||||
_model = _init_model(_device)
|
_model = _init_model(_device)
|
||||||
_voice_presets = _load_voice_presets(_device)
|
_voice_presets = _load_voice_presets(_device)
|
||||||
|
|
||||||
_model_status = "online"
|
_model_status = "online"
|
||||||
logger.info(
|
logger.info("Model ready on %s. Voices: %s", _device, list(_voice_presets.keys()))
|
||||||
"Model ready on %s. Voices: %s", _device, list(_voice_presets.keys())
|
|
||||||
)
|
|
||||||
logger.info("Configuration: %s", _config)
|
logger.info("Configuration: %s", _config)
|
||||||
|
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
@@ -302,6 +667,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|||||||
thread = threading.Thread(target=_load_model_sync, daemon=True, name="model-loader")
|
thread = threading.Thread(target=_load_model_sync, daemon=True, name="model-loader")
|
||||||
thread.start()
|
thread.start()
|
||||||
yield
|
yield
|
||||||
|
if _decode_executor is not None:
|
||||||
|
_decode_executor.shutdown(wait=False)
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)
|
app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)
|
||||||
@@ -314,7 +681,7 @@ class GenerateRequest(BaseModel):
|
|||||||
text: str = Field(..., min_length=1, max_length=10_000)
|
text: str = Field(..., min_length=1, max_length=10_000)
|
||||||
speaker: str = Field(default=DEFAULT_SPEAKER)
|
speaker: str = Field(default=DEFAULT_SPEAKER)
|
||||||
cfg_scale: float = Field(default=1.5, ge=0.5, le=4.0)
|
cfg_scale: float = Field(default=1.5, ge=0.5, le=4.0)
|
||||||
inference_steps: Optional[int] = Field(default=None, ge=5, le=20)
|
inference_steps: int | None = Field(default=None, ge=5, le=20)
|
||||||
|
|
||||||
@field_validator("text")
|
@field_validator("text")
|
||||||
@classmethod
|
@classmethod
|
||||||
@@ -353,8 +720,8 @@ async def health() -> dict:
|
|||||||
|
|
||||||
def _sync_generate(
|
def _sync_generate(
|
||||||
req: GenerateRequest,
|
req: GenerateRequest,
|
||||||
streamer: Optional[object] = None,
|
streamer: object | None = None,
|
||||||
cancel_event: Optional[threading.Event] = None,
|
cancel_event: threading.Event | None = None,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Blocking inference. Returns the speaker used.
|
"""Blocking inference. Returns the speaker used.
|
||||||
Runs in a thread-pool executor — do not call from the event loop directly.
|
Runs in a thread-pool executor — do not call from the event loop directly.
|
||||||
@@ -364,10 +731,17 @@ def _sync_generate(
|
|||||||
raise RuntimeError("Generation cancelled.")
|
raise RuntimeError("Generation cancelled.")
|
||||||
|
|
||||||
speaker = req.speaker if req.speaker in _voice_presets else DEFAULT_SPEAKER
|
speaker = req.speaker if req.speaker in _voice_presets else DEFAULT_SPEAKER
|
||||||
voice_preset = copy.deepcopy(_voice_presets[speaker])
|
model_dtype = _model_float_dtype()
|
||||||
|
voice_preset = _move_cached_prompt(copy.deepcopy(_voice_presets[speaker]), _device, model_dtype)
|
||||||
|
|
||||||
steps = req.inference_steps if req.inference_steps is not None else _config["default_inference_steps"]
|
steps = (
|
||||||
|
req.inference_steps
|
||||||
|
if req.inference_steps is not None
|
||||||
|
else _config["default_inference_steps"]
|
||||||
|
)
|
||||||
_model.set_ddpm_inference_steps(num_steps=steps)
|
_model.set_ddpm_inference_steps(num_steps=steps)
|
||||||
|
if os.environ.get("VIBEPOD_PROFILE_GENERATION", "0") == "1":
|
||||||
|
_model._vibepod_profile = {}
|
||||||
|
|
||||||
inputs = _processor.process_input_with_cached_prompt(
|
inputs = _processor.process_input_with_cached_prompt(
|
||||||
text=req.text,
|
text=req.text,
|
||||||
@@ -380,19 +754,20 @@ def _sync_generate(
|
|||||||
if torch.is_tensor(v):
|
if torch.is_tensor(v):
|
||||||
inputs[k] = v.to(_device)
|
inputs[k] = v.to(_device)
|
||||||
|
|
||||||
outputs = _model.generate(
|
with torch.inference_mode():
|
||||||
**inputs,
|
_model.generate(
|
||||||
max_new_tokens=None,
|
**inputs,
|
||||||
cfg_scale=req.cfg_scale,
|
max_new_tokens=None,
|
||||||
tokenizer=_processor.tokenizer,
|
cfg_scale=req.cfg_scale,
|
||||||
generation_config={"do_sample": False},
|
tokenizer=_processor.tokenizer,
|
||||||
verbose=True,
|
generation_config={"do_sample": False},
|
||||||
all_prefilled_outputs=copy.deepcopy(voice_preset),
|
verbose=False,
|
||||||
audio_streamer=streamer,
|
show_progress_bar=False,
|
||||||
)
|
return_speech=False,
|
||||||
|
stop_check_fn=cancel_event.is_set if cancel_event else None,
|
||||||
if not outputs.speech_outputs or outputs.speech_outputs[0] is None:
|
all_prefilled_outputs=voice_preset,
|
||||||
raise ValueError("Model returned no audio output.")
|
audio_streamer=streamer,
|
||||||
|
)
|
||||||
|
|
||||||
return speaker
|
return speaker
|
||||||
|
|
||||||
@@ -401,6 +776,22 @@ def _sse(event: dict) -> str:
|
|||||||
return f"data: {json.dumps(event)}\n\n"
|
return f"data: {json.dumps(event)}\n\n"
|
||||||
|
|
||||||
|
|
||||||
|
def _generation_profile() -> dict[str, dict[str, float]] | None:
|
||||||
|
if os.environ.get("VIBEPOD_PROFILE_GENERATION", "0") != "1":
|
||||||
|
return None
|
||||||
|
stats = getattr(_model, "_vibepod_profile", None)
|
||||||
|
if not stats:
|
||||||
|
return {}
|
||||||
|
return {
|
||||||
|
key: {
|
||||||
|
"count": value["count"],
|
||||||
|
"seconds": round(value["seconds"], 3),
|
||||||
|
"avg_ms": round(value["seconds"] * 1000 / value["count"], 3) if value["count"] else 0.0,
|
||||||
|
}
|
||||||
|
for key, value in sorted(stats.items())
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/generate")
|
@app.post("/generate")
|
||||||
async def generate(req: GenerateRequest, request: Request) -> StreamingResponse:
|
async def generate(req: GenerateRequest, request: Request) -> StreamingResponse:
|
||||||
if _model_status != "online":
|
if _model_status != "online":
|
||||||
@@ -417,28 +808,62 @@ async def generate(req: GenerateRequest, request: Request) -> StreamingResponse:
|
|||||||
)
|
)
|
||||||
|
|
||||||
async def event_stream() -> AsyncGenerator[str, None]:
|
async def event_stream() -> AsyncGenerator[str, None]:
|
||||||
from vibevoice.modular.streamer import AsyncAudioStreamer
|
class NonBlockingAudioStreamer:
|
||||||
|
"""Async streamer that keeps GPU->CPU copies out of the model thread."""
|
||||||
|
|
||||||
|
def __init__(self, batch_size: int, stop_signal: object = None) -> None:
|
||||||
|
self.batch_size = batch_size
|
||||||
|
self.stop_signal = stop_signal
|
||||||
|
self.audio_queues = [asyncio.Queue() for _ in range(batch_size)]
|
||||||
|
self.finished_flags = [False for _ in range(batch_size)]
|
||||||
|
self.loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
|
def put(self, audio_chunks: torch.Tensor, sample_indices: torch.Tensor) -> None:
|
||||||
|
for i, sample_idx in enumerate(sample_indices):
|
||||||
|
idx = sample_idx.item()
|
||||||
|
if idx < self.batch_size and not self.finished_flags[idx]:
|
||||||
|
self.loop.call_soon_threadsafe(
|
||||||
|
self.audio_queues[idx].put_nowait,
|
||||||
|
audio_chunks[i].detach(),
|
||||||
|
)
|
||||||
|
|
||||||
|
def end(self, sample_indices: torch.Tensor | None = None) -> None:
|
||||||
|
if sample_indices is None:
|
||||||
|
indices_to_end = range(self.batch_size)
|
||||||
|
else:
|
||||||
|
indices_to_end = [s.item() if torch.is_tensor(s) else s for s in sample_indices]
|
||||||
|
for idx in indices_to_end:
|
||||||
|
if idx < self.batch_size and not self.finished_flags[idx]:
|
||||||
|
self.loop.call_soon_threadsafe(
|
||||||
|
self.audio_queues[idx].put_nowait, self.stop_signal
|
||||||
|
)
|
||||||
|
self.finished_flags[idx] = True
|
||||||
|
|
||||||
start = time.monotonic()
|
start = time.monotonic()
|
||||||
streamer = AsyncAudioStreamer(batch_size=1)
|
streamer = NonBlockingAudioStreamer(batch_size=1)
|
||||||
cancel_event = threading.Event()
|
cancel_event = threading.Event()
|
||||||
|
|
||||||
accum_size = max(1, _config["chunk_accum"])
|
accum_size = max(1, _config["chunk_accum"])
|
||||||
accumulated_chunks = []
|
accumulated_chunks = []
|
||||||
|
chunk_count = 0
|
||||||
|
audio_samples = 0
|
||||||
|
first_chunk_at: float | None = None
|
||||||
|
last_chunk_at: float | None = None
|
||||||
|
max_chunk_gap = 0.0
|
||||||
|
speaker = req.speaker if req.speaker in _voice_presets else DEFAULT_SPEAKER
|
||||||
|
|
||||||
async with _generation_lock:
|
async with _generation_lock:
|
||||||
loop = asyncio.get_event_loop()
|
loop = asyncio.get_event_loop()
|
||||||
future = loop.run_in_executor(
|
future = loop.run_in_executor(
|
||||||
None, functools.partial(_sync_generate, req, streamer, cancel_event)
|
None, functools.partial(_sync_generate, req, streamer, cancel_event)
|
||||||
)
|
)
|
||||||
|
future.add_done_callback(lambda _: streamer.end())
|
||||||
|
|
||||||
# Drain audio chunks as they arrive from the diffusion head.
|
# Drain audio chunks as they arrive from the diffusion head.
|
||||||
# stop_signal=None is the default sentinel that ends the queue.
|
# stop_signal=None is the default sentinel that ends the queue.
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
chunk = await asyncio.wait_for(
|
chunk = await asyncio.wait_for(streamer.audio_queues[0].get(), timeout=120.0)
|
||||||
streamer.audio_queues[0].get(), timeout=120.0
|
|
||||||
)
|
|
||||||
except asyncio.TimeoutError:
|
except asyncio.TimeoutError:
|
||||||
cancel_event.set()
|
cancel_event.set()
|
||||||
future.cancel()
|
future.cancel()
|
||||||
@@ -454,17 +879,45 @@ async def generate(req: GenerateRequest, request: Request) -> StreamingResponse:
|
|||||||
if chunk is None: # stop signal
|
if chunk is None: # stop signal
|
||||||
break
|
break
|
||||||
|
|
||||||
accumulated_chunks.append(chunk.detach().cpu().float())
|
accumulated_chunks.append(chunk.detach())
|
||||||
|
|
||||||
if len(accumulated_chunks) >= accum_size:
|
if len(accumulated_chunks) >= accum_size:
|
||||||
combined = torch.cat(accumulated_chunks, dim=0)
|
now = time.monotonic()
|
||||||
|
if first_chunk_at is None:
|
||||||
|
first_chunk_at = now
|
||||||
|
if last_chunk_at is not None:
|
||||||
|
max_chunk_gap = max(max_chunk_gap, now - last_chunk_at)
|
||||||
|
last_chunk_at = now
|
||||||
|
|
||||||
|
combined = (
|
||||||
|
torch.cat(accumulated_chunks, dim=0)
|
||||||
|
.detach()
|
||||||
|
.to("cpu", dtype=torch.float32)
|
||||||
|
.contiguous()
|
||||||
|
)
|
||||||
|
chunk_count += 1
|
||||||
|
audio_samples += combined.numel()
|
||||||
pcm_b64 = base64.b64encode(combined.numpy().tobytes()).decode()
|
pcm_b64 = base64.b64encode(combined.numpy().tobytes()).decode()
|
||||||
yield _sse({"type": "audio_chunk", "data": pcm_b64})
|
yield _sse({"type": "audio_chunk", "data": pcm_b64})
|
||||||
accumulated_chunks = []
|
accumulated_chunks = []
|
||||||
|
|
||||||
# Flush any remaining chunks
|
# Flush any remaining chunks
|
||||||
if accumulated_chunks:
|
if accumulated_chunks:
|
||||||
combined = torch.cat(accumulated_chunks, dim=0)
|
now = time.monotonic()
|
||||||
|
if first_chunk_at is None:
|
||||||
|
first_chunk_at = now
|
||||||
|
if last_chunk_at is not None:
|
||||||
|
max_chunk_gap = max(max_chunk_gap, now - last_chunk_at)
|
||||||
|
last_chunk_at = now
|
||||||
|
|
||||||
|
combined = (
|
||||||
|
torch.cat(accumulated_chunks, dim=0)
|
||||||
|
.detach()
|
||||||
|
.to("cpu", dtype=torch.float32)
|
||||||
|
.contiguous()
|
||||||
|
)
|
||||||
|
chunk_count += 1
|
||||||
|
audio_samples += combined.numel()
|
||||||
pcm_b64 = base64.b64encode(combined.numpy().tobytes()).decode()
|
pcm_b64 = base64.b64encode(combined.numpy().tobytes()).decode()
|
||||||
yield _sse({"type": "audio_chunk", "data": pcm_b64})
|
yield _sse({"type": "audio_chunk", "data": pcm_b64})
|
||||||
|
|
||||||
@@ -479,17 +932,40 @@ async def generate(req: GenerateRequest, request: Request) -> StreamingResponse:
|
|||||||
yield _sse(
|
yield _sse(
|
||||||
{
|
{
|
||||||
"type": "error",
|
"type": "error",
|
||||||
"message": "Internal server error during generation.",
|
"message": f"Generation failed: {exc}",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
return
|
return
|
||||||
|
|
||||||
elapsed = round(time.monotonic() - start, 1)
|
elapsed = round(time.monotonic() - start, 1)
|
||||||
|
audio_secs = audio_samples / SAMPLE_RATE
|
||||||
|
realtime_factor = audio_secs / elapsed if elapsed > 0 else None
|
||||||
|
profile = _generation_profile()
|
||||||
|
if profile is not None:
|
||||||
|
logger.info("Generation profile: %s", profile)
|
||||||
logger.info("Generation complete in %.1fs", elapsed)
|
logger.info("Generation complete in %.1fs", elapsed)
|
||||||
yield _sse({"type": "complete", "elapsed": elapsed, "speaker": speaker})
|
complete_event = {
|
||||||
|
"type": "complete",
|
||||||
|
"elapsed": elapsed,
|
||||||
|
"speaker": speaker,
|
||||||
|
"audio_secs": round(audio_secs, 2),
|
||||||
|
"realtime_factor": round(realtime_factor, 3) if realtime_factor is not None else None,
|
||||||
|
"chunks": chunk_count,
|
||||||
|
"first_chunk_secs": round(first_chunk_at - start, 2)
|
||||||
|
if first_chunk_at is not None
|
||||||
|
else None,
|
||||||
|
"max_chunk_gap_secs": round(max_chunk_gap, 2),
|
||||||
|
}
|
||||||
|
if profile is not None:
|
||||||
|
complete_event["profile"] = profile
|
||||||
|
yield _sse(complete_event)
|
||||||
|
|
||||||
return StreamingResponse(
|
return StreamingResponse(
|
||||||
event_stream(),
|
event_stream(),
|
||||||
media_type="text/event-stream",
|
media_type="text/event-stream",
|
||||||
headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no"},
|
headers={
|
||||||
|
"Cache-Control": "no-cache, no-transform",
|
||||||
|
"X-Accel-Buffering": "no",
|
||||||
|
"X-Content-Type-Options": "nosniff",
|
||||||
|
},
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
import { NextRequest, NextResponse } from "next/server";
|
import { NextRequest, NextResponse } from "next/server";
|
||||||
|
|
||||||
|
export const dynamic = "force-dynamic";
|
||||||
|
export const runtime = "nodejs";
|
||||||
|
|
||||||
export async function POST(request: NextRequest) {
|
export async function POST(request: NextRequest) {
|
||||||
const pythonServerUrl = process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";
|
const pythonServerUrl = process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const body = await request.json() as {
|
const body = (await request.json()) as {
|
||||||
text: string;
|
text: string;
|
||||||
speaker?: string;
|
speaker?: string;
|
||||||
cfg_scale?: number;
|
cfg_scale?: number;
|
||||||
@@ -24,6 +27,7 @@ export async function POST(request: NextRequest) {
|
|||||||
cfg_scale: body.cfg_scale ?? 1.5,
|
cfg_scale: body.cfg_scale ?? 1.5,
|
||||||
inference_steps: body.inference_steps ?? 10,
|
inference_steps: body.inference_steps ?? 10,
|
||||||
}),
|
}),
|
||||||
|
signal: request.signal,
|
||||||
});
|
});
|
||||||
|
|
||||||
if (!upstream.ok) {
|
if (!upstream.ok) {
|
||||||
@@ -36,8 +40,9 @@ export async function POST(request: NextRequest) {
|
|||||||
status: 200,
|
status: 200,
|
||||||
headers: {
|
headers: {
|
||||||
"Content-Type": "text/event-stream",
|
"Content-Type": "text/event-stream",
|
||||||
"Cache-Control": "no-cache",
|
"Cache-Control": "no-cache, no-transform",
|
||||||
"Connection": "keep-alive",
|
Connection: "keep-alive",
|
||||||
|
"X-Content-Type-Options": "nosniff",
|
||||||
"X-Accel-Buffering": "no",
|
"X-Accel-Buffering": "no",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -4,8 +4,7 @@ const OFFLINE_RESPONSE = { status: "offline" };
|
|||||||
const COMMON_OPTIONS = { headers: { "Cache-Control": "no-store" } };
|
const COMMON_OPTIONS = { headers: { "Cache-Control": "no-store" } };
|
||||||
|
|
||||||
export async function GET() {
|
export async function GET() {
|
||||||
const pythonServerUrl =
|
const pythonServerUrl = process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";
|
||||||
process.env.VIBEVOICE_SERVER_URL ?? "http://localhost:8000";
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const res = await fetch(`${pythonServerUrl}/health`, {
|
const res = await fetch(`${pythonServerUrl}/health`, {
|
||||||
@@ -27,6 +26,7 @@ export async function GET() {
|
|||||||
message: data.message,
|
message: data.message,
|
||||||
progress: data.progress ?? null,
|
progress: data.progress ?? null,
|
||||||
voices: data.voices ?? [],
|
voices: data.voices ?? [],
|
||||||
|
config: data.config ?? null,
|
||||||
},
|
},
|
||||||
COMMON_OPTIONS
|
COMMON_OPTIONS
|
||||||
);
|
);
|
||||||
|
|||||||
+4
-2
@@ -12,8 +12,10 @@
|
|||||||
--muted: #64748b;
|
--muted: #64748b;
|
||||||
--success: #22c55e;
|
--success: #22c55e;
|
||||||
--error: #ef4444;
|
--error: #ef4444;
|
||||||
--font-sans: ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
--font-sans:
|
||||||
--font-mono: ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, "Liberation Mono", monospace;
|
ui-sans-serif, system-ui, -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
|
||||||
|
--font-mono:
|
||||||
|
ui-monospace, SFMono-Regular, "SF Mono", Menlo, Consolas, "Liberation Mono", monospace;
|
||||||
}
|
}
|
||||||
|
|
||||||
@theme inline {
|
@theme inline {
|
||||||
|
|||||||
+61
-29
@@ -69,19 +69,39 @@ type AppAction =
|
|||||||
|
|
||||||
function reducer(state: AppState, action: AppAction): AppState {
|
function reducer(state: AppState, action: AppAction): AppState {
|
||||||
switch (action.type) {
|
switch (action.type) {
|
||||||
case "SET_SCRIPT": return { ...state, script: action.payload };
|
case "SET_SCRIPT":
|
||||||
case "SET_SPEAKER": return { ...state, speaker: action.payload };
|
return { ...state, script: action.payload };
|
||||||
case "SET_CFG_SCALE": return { ...state, cfgScale: action.payload };
|
case "SET_SPEAKER":
|
||||||
case "SET_INFERENCE_STEPS": return { ...state, inferenceSteps: action.payload };
|
return { ...state, speaker: action.payload };
|
||||||
case "SET_PREBUFFER_SECS": return { ...state, prebufferSecs: action.payload };
|
case "SET_CFG_SCALE":
|
||||||
case "SET_REBUFFER_THRESHOLD": return { ...state, rebufferThresholdSecs: action.payload };
|
return { ...state, cfgScale: action.payload };
|
||||||
case "SET_RESUME_THRESHOLD": return { ...state, resumeThresholdSecs: action.payload };
|
case "SET_INFERENCE_STEPS":
|
||||||
|
return { ...state, inferenceSteps: action.payload };
|
||||||
|
case "SET_PREBUFFER_SECS":
|
||||||
|
return { ...state, prebufferSecs: action.payload };
|
||||||
|
case "SET_REBUFFER_THRESHOLD":
|
||||||
|
return { ...state, rebufferThresholdSecs: action.payload };
|
||||||
|
case "SET_RESUME_THRESHOLD":
|
||||||
|
return { ...state, resumeThresholdSecs: action.payload };
|
||||||
case "START_GENERATION":
|
case "START_GENERATION":
|
||||||
return { ...state, isGenerating: true, audioUrl: null, logs: [], genElapsed: 0, genPct: null };
|
return {
|
||||||
|
...state,
|
||||||
|
isGenerating: true,
|
||||||
|
audioUrl: null,
|
||||||
|
logs: [],
|
||||||
|
genElapsed: 0,
|
||||||
|
genPct: null,
|
||||||
|
};
|
||||||
case "GEN_PROGRESS":
|
case "GEN_PROGRESS":
|
||||||
return { ...state, genElapsed: action.elapsed, genPct: action.pct };
|
return { ...state, genElapsed: action.elapsed, genPct: action.pct };
|
||||||
case "GENERATION_SUCCESS":
|
case "GENERATION_SUCCESS":
|
||||||
return { ...state, isGenerating: false, genElapsed: 0, genPct: null, audioUrl: action.payload };
|
return {
|
||||||
|
...state,
|
||||||
|
isGenerating: false,
|
||||||
|
genElapsed: 0,
|
||||||
|
genPct: null,
|
||||||
|
audioUrl: action.payload,
|
||||||
|
};
|
||||||
case "GENERATION_CANCELLED":
|
case "GENERATION_CANCELLED":
|
||||||
case "GENERATION_ERROR":
|
case "GENERATION_ERROR":
|
||||||
return { ...state, isGenerating: false, genElapsed: 0, genPct: null };
|
return { ...state, isGenerating: false, genElapsed: 0, genPct: null };
|
||||||
@@ -89,21 +109,27 @@ function reducer(state: AppState, action: AppAction): AppState {
|
|||||||
return { ...state, logs: [...state.logs, action.payload] };
|
return { ...state, logs: [...state.logs, action.payload] };
|
||||||
case "SET_SERVER_STATUS": {
|
case "SET_SERVER_STATUS": {
|
||||||
const isNewConfig = !state.serverConfig && action.payload.config;
|
const isNewConfig = !state.serverConfig && action.payload.config;
|
||||||
const deviceChanged = !!(state.serverConfig && action.payload.config && state.serverConfig.device !== action.payload.config.device);
|
const deviceChanged = !!(
|
||||||
|
state.serverConfig &&
|
||||||
|
action.payload.config &&
|
||||||
|
state.serverConfig.device !== action.payload.config.device
|
||||||
|
);
|
||||||
|
|
||||||
const nextSteps = (isNewConfig || deviceChanged)
|
const nextSteps =
|
||||||
|
isNewConfig || deviceChanged
|
||||||
? action.payload.config!.default_inference_steps
|
? action.payload.config!.default_inference_steps
|
||||||
: state.inferenceSteps;
|
: state.inferenceSteps;
|
||||||
|
|
||||||
const nextPrebuffer = (isNewConfig || deviceChanged)
|
const nextPrebuffer =
|
||||||
? action.payload.config!.prebuffer_secs
|
isNewConfig || deviceChanged ? action.payload.config!.prebuffer_secs : state.prebufferSecs;
|
||||||
: state.prebufferSecs;
|
|
||||||
|
|
||||||
const nextRebuffer = (isNewConfig || deviceChanged)
|
const nextRebuffer =
|
||||||
|
isNewConfig || deviceChanged
|
||||||
? action.payload.config!.rebuffer_threshold_secs
|
? action.payload.config!.rebuffer_threshold_secs
|
||||||
: state.rebufferThresholdSecs;
|
: state.rebufferThresholdSecs;
|
||||||
|
|
||||||
const nextResume = (isNewConfig || deviceChanged)
|
const nextResume =
|
||||||
|
isNewConfig || deviceChanged
|
||||||
? action.payload.config!.resume_threshold_secs
|
? action.payload.config!.resume_threshold_secs
|
||||||
: state.resumeThresholdSecs;
|
: state.resumeThresholdSecs;
|
||||||
|
|
||||||
@@ -121,7 +147,8 @@ function reducer(state: AppState, action: AppAction): AppState {
|
|||||||
resumeThresholdSecs: nextResume,
|
resumeThresholdSecs: nextResume,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
default: return state;
|
default:
|
||||||
|
return state;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,9 +157,9 @@ const initialState: AppState = {
|
|||||||
speaker: "carter",
|
speaker: "carter",
|
||||||
cfgScale: 1.5,
|
cfgScale: 1.5,
|
||||||
inferenceSteps: 10,
|
inferenceSteps: 10,
|
||||||
prebufferSecs: 2.0,
|
prebufferSecs: 5.0,
|
||||||
rebufferThresholdSecs: 0.4,
|
rebufferThresholdSecs: 1.0,
|
||||||
resumeThresholdSecs: 1.5,
|
resumeThresholdSecs: 3.0,
|
||||||
isGenerating: false,
|
isGenerating: false,
|
||||||
genElapsed: 0,
|
genElapsed: 0,
|
||||||
genPct: null,
|
genPct: null,
|
||||||
@@ -213,7 +240,10 @@ export default function HomePage() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
poll();
|
poll();
|
||||||
return () => { cancelled = true; clearTimeout(timeoutId); };
|
return () => {
|
||||||
|
cancelled = true;
|
||||||
|
clearTimeout(timeoutId);
|
||||||
|
};
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
const handleGenerate = useCallback(async () => {
|
const handleGenerate = useCallback(async () => {
|
||||||
@@ -241,7 +271,6 @@ export default function HomePage() {
|
|||||||
<Header />
|
<Header />
|
||||||
<main className="flex-1 container mx-auto px-4 py-6 max-w-6xl">
|
<main className="flex-1 container mx-auto px-4 py-6 max-w-6xl">
|
||||||
<div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
|
<div className="grid grid-cols-1 lg:grid-cols-3 gap-6">
|
||||||
|
|
||||||
{/* Left: script + audio player */}
|
{/* Left: script + audio player */}
|
||||||
<div className="lg:col-span-2 flex flex-col gap-6">
|
<div className="lg:col-span-2 flex flex-col gap-6">
|
||||||
<TextInputPanel
|
<TextInputPanel
|
||||||
@@ -261,12 +290,16 @@ export default function HomePage() {
|
|||||||
onCfgScaleChange={(v) => dispatch({ type: "SET_CFG_SCALE", payload: v })}
|
onCfgScaleChange={(v) => dispatch({ type: "SET_CFG_SCALE", payload: v })}
|
||||||
inferenceSteps={state.inferenceSteps}
|
inferenceSteps={state.inferenceSteps}
|
||||||
onInferenceStepsChange={(v) => dispatch({ type: "SET_INFERENCE_STEPS", payload: v })}
|
onInferenceStepsChange={(v) => dispatch({ type: "SET_INFERENCE_STEPS", payload: v })}
|
||||||
prebufferSecs={state.prebufferSecs}
|
prebufferSecs={state.prebufferSecs}
|
||||||
onPrebufferSecsChange={(v) => dispatch({ type: "SET_PREBUFFER_SECS", payload: v })}
|
onPrebufferSecsChange={(v) => dispatch({ type: "SET_PREBUFFER_SECS", payload: v })}
|
||||||
rebufferThresholdSecs={state.rebufferThresholdSecs}
|
rebufferThresholdSecs={state.rebufferThresholdSecs}
|
||||||
onRebufferThresholdChange={(v) => dispatch({ type: "SET_REBUFFER_THRESHOLD", payload: v })}
|
onRebufferThresholdChange={(v) =>
|
||||||
resumeThresholdSecs={state.resumeThresholdSecs}
|
dispatch({ type: "SET_REBUFFER_THRESHOLD", payload: v })
|
||||||
onResumeThresholdChange={(v) => dispatch({ type: "SET_RESUME_THRESHOLD", payload: v })}
|
}
|
||||||
|
resumeThresholdSecs={state.resumeThresholdSecs}
|
||||||
|
onResumeThresholdChange={(v) =>
|
||||||
|
dispatch({ type: "SET_RESUME_THRESHOLD", payload: v })
|
||||||
|
}
|
||||||
onGenerate={handleGenerate}
|
onGenerate={handleGenerate}
|
||||||
onStop={stop}
|
onStop={stop}
|
||||||
onPauseStream={pauseStream}
|
onPauseStream={pauseStream}
|
||||||
@@ -281,7 +314,6 @@ export default function HomePage() {
|
|||||||
/>
|
/>
|
||||||
<StatusLog messages={state.logs} />
|
<StatusLog messages={state.logs} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</div>
|
</div>
|
||||||
</main>
|
</main>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -14,15 +14,8 @@ function formatTime(seconds: number): string {
|
|||||||
}
|
}
|
||||||
|
|
||||||
export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
|
export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
|
||||||
const {
|
const { isPlaying, currentTime, duration, volume, toggle, seek, setVolume } =
|
||||||
isPlaying,
|
useAudioPlayer(audioUrl);
|
||||||
currentTime,
|
|
||||||
duration,
|
|
||||||
volume,
|
|
||||||
toggle,
|
|
||||||
seek,
|
|
||||||
setVolume,
|
|
||||||
} = useAudioPlayer(audioUrl);
|
|
||||||
|
|
||||||
if (!audioUrl) return null;
|
if (!audioUrl) return null;
|
||||||
|
|
||||||
@@ -56,12 +49,10 @@ export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
|
|||||||
background: "rgba(45, 212, 191, 0.05)",
|
background: "rgba(45, 212, 191, 0.05)",
|
||||||
}}
|
}}
|
||||||
onMouseEnter={(e) => {
|
onMouseEnter={(e) => {
|
||||||
(e.currentTarget as HTMLButtonElement).style.background =
|
(e.currentTarget as HTMLButtonElement).style.background = "rgba(45, 212, 191, 0.15)";
|
||||||
"rgba(45, 212, 191, 0.15)";
|
|
||||||
}}
|
}}
|
||||||
onMouseLeave={(e) => {
|
onMouseLeave={(e) => {
|
||||||
(e.currentTarget as HTMLButtonElement).style.background =
|
(e.currentTarget as HTMLButtonElement).style.background = "rgba(45, 212, 191, 0.05)";
|
||||||
"rgba(45, 212, 191, 0.05)";
|
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<svg
|
<svg
|
||||||
@@ -115,27 +106,18 @@ export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
|
|||||||
onClick={toggle}
|
onClick={toggle}
|
||||||
className="w-10 h-10 rounded-full flex items-center justify-center transition-transform active:scale-95 cursor-pointer"
|
className="w-10 h-10 rounded-full flex items-center justify-center transition-transform active:scale-95 cursor-pointer"
|
||||||
style={{
|
style={{
|
||||||
background:
|
background: "linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
|
||||||
"linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
|
|
||||||
boxShadow: "0 4px 12px rgba(45, 212, 191, 0.3)",
|
boxShadow: "0 4px 12px rgba(45, 212, 191, 0.3)",
|
||||||
}}
|
}}
|
||||||
aria-label={isPlaying ? "Pause" : "Play"}
|
aria-label={isPlaying ? "Pause" : "Play"}
|
||||||
>
|
>
|
||||||
{isPlaying ? (
|
{isPlaying ? (
|
||||||
<svg
|
<svg className="w-4 h-4 text-white" viewBox="0 0 24 24" fill="currentColor">
|
||||||
className="w-4 h-4 text-white"
|
|
||||||
viewBox="0 0 24 24"
|
|
||||||
fill="currentColor"
|
|
||||||
>
|
|
||||||
<rect x="6" y="4" width="4" height="16" />
|
<rect x="6" y="4" width="4" height="16" />
|
||||||
<rect x="14" y="4" width="4" height="16" />
|
<rect x="14" y="4" width="4" height="16" />
|
||||||
</svg>
|
</svg>
|
||||||
) : (
|
) : (
|
||||||
<svg
|
<svg className="w-4 h-4 text-white" viewBox="0 0 24 24" fill="currentColor">
|
||||||
className="w-4 h-4 text-white"
|
|
||||||
viewBox="0 0 24 24"
|
|
||||||
fill="currentColor"
|
|
||||||
>
|
|
||||||
<polygon points="5 3 19 12 5 21 5 3" />
|
<polygon points="5 3 19 12 5 21 5 3" />
|
||||||
</svg>
|
</svg>
|
||||||
)}
|
)}
|
||||||
@@ -143,9 +125,7 @@ export default function AudioPlayer({ audioUrl }: AudioPlayerProps) {
|
|||||||
|
|
||||||
{/* Duration info */}
|
{/* Duration info */}
|
||||||
<div className="flex-1 flex items-center gap-1 text-sm">
|
<div className="flex-1 flex items-center gap-1 text-sm">
|
||||||
<span style={{ color: "var(--foreground)" }}>
|
<span style={{ color: "var(--foreground)" }}>{formatTime(currentTime)}</span>
|
||||||
{formatTime(currentTime)}
|
|
||||||
</span>
|
|
||||||
<span style={{ color: "var(--muted)" }}>/</span>
|
<span style={{ color: "var(--muted)" }}>/</span>
|
||||||
<span style={{ color: "var(--muted)" }}>{formatTime(duration)}</span>
|
<span style={{ color: "var(--muted)" }}>{formatTime(duration)}</span>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -36,18 +36,27 @@ const STATUS_CONFIG: Record<
|
|||||||
Exclude<ServerStatus, "online">,
|
Exclude<ServerStatus, "online">,
|
||||||
{ color: string; label: (p: DownloadProgress | null) => string }
|
{ color: string; label: (p: DownloadProgress | null) => string }
|
||||||
> = {
|
> = {
|
||||||
offline: { color: "var(--error)", label: () => "Server offline — waiting for connection..." },
|
offline: { color: "var(--error)", label: () => "Server offline — waiting for connection..." },
|
||||||
downloading: { color: "#60a5fa", label: (p) => p && p.total > 0 ? `Downloading model... (${p.done} / ${p.total} files)` : "Downloading model (~1 GB)..." },
|
downloading: {
|
||||||
loading: { color: "#fbbf24", label: () => "Loading model into memory..." },
|
color: "#60a5fa",
|
||||||
error: { color: "var(--error)", label: () => "Server error — check the terminal for details." },
|
label: (p) =>
|
||||||
|
p && p.total > 0
|
||||||
|
? `Downloading model... (${p.done} / ${p.total} files)`
|
||||||
|
: "Downloading model (~1 GB)...",
|
||||||
|
},
|
||||||
|
loading: { color: "#fbbf24", label: () => "Loading model into memory..." },
|
||||||
|
error: { color: "var(--error)", label: () => "Server error — check the terminal for details." },
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
function SpinnerIcon() {
|
function SpinnerIcon() {
|
||||||
return (
|
return (
|
||||||
<svg className="animate-spin w-4 h-4" viewBox="0 0 24 24" fill="none">
|
<svg className="animate-spin w-4 h-4" viewBox="0 0 24 24" fill="none">
|
||||||
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
|
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
|
||||||
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
|
<path
|
||||||
|
className="opacity-75"
|
||||||
|
fill="currentColor"
|
||||||
|
d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z"
|
||||||
|
/>
|
||||||
</svg>
|
</svg>
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -146,7 +155,10 @@ export default function GenerationControls({
|
|||||||
onChange={(e) => onCfgScaleChange(parseFloat(e.target.value))}
|
onChange={(e) => onCfgScaleChange(parseFloat(e.target.value))}
|
||||||
className="w-full"
|
className="w-full"
|
||||||
/>
|
/>
|
||||||
<div className="flex items-center justify-between text-xs" style={{ color: "var(--muted)" }}>
|
<div
|
||||||
|
className="flex items-center justify-between text-xs"
|
||||||
|
style={{ color: "var(--muted)" }}
|
||||||
|
>
|
||||||
<span>Flat (0.5)</span>
|
<span>Flat (0.5)</span>
|
||||||
<span>CFG Scale</span>
|
<span>CFG Scale</span>
|
||||||
<span>Expressive (4.0)</span>
|
<span>Expressive (4.0)</span>
|
||||||
@@ -157,7 +169,7 @@ export default function GenerationControls({
|
|||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
|
<label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
|
||||||
Quality vs Speed
|
Speed vs Quality
|
||||||
</label>
|
</label>
|
||||||
<span
|
<span
|
||||||
className="text-sm font-mono px-2 py-0.5 rounded"
|
className="text-sm font-mono px-2 py-0.5 rounded"
|
||||||
@@ -176,7 +188,10 @@ export default function GenerationControls({
|
|||||||
className="w-full"
|
className="w-full"
|
||||||
style={{ "--thumb-color": "var(--accent-violet)" } as React.CSSProperties}
|
style={{ "--thumb-color": "var(--accent-violet)" } as React.CSSProperties}
|
||||||
/>
|
/>
|
||||||
<div className="flex items-center justify-between text-xs" style={{ color: "var(--muted)" }}>
|
<div
|
||||||
|
className="flex items-center justify-between text-xs"
|
||||||
|
style={{ color: "var(--muted)" }}
|
||||||
|
>
|
||||||
<span>Faster (5)</span>
|
<span>Faster (5)</span>
|
||||||
<span>Diffusion Steps</span>
|
<span>Diffusion Steps</span>
|
||||||
<span>Better (20)</span>
|
<span>Better (20)</span>
|
||||||
@@ -207,7 +222,11 @@ export default function GenerationControls({
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
{showAdvanced && (
|
{showAdvanced && (
|
||||||
<div id="advanced-buffering-panel" className="flex flex-col gap-4 pl-2 border-l" style={{ borderColor: "var(--border)" }}>
|
<div
|
||||||
|
id="advanced-buffering-panel"
|
||||||
|
className="flex flex-col gap-4 pl-2 border-l"
|
||||||
|
style={{ borderColor: "var(--border)" }}
|
||||||
|
>
|
||||||
{/* Pre-buffer */}
|
{/* Pre-buffer */}
|
||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
@@ -221,7 +240,7 @@ export default function GenerationControls({
|
|||||||
<input
|
<input
|
||||||
type="range"
|
type="range"
|
||||||
min={0.5}
|
min={0.5}
|
||||||
max={10.0}
|
max={30.0}
|
||||||
step={0.5}
|
step={0.5}
|
||||||
value={prebufferSecs}
|
value={prebufferSecs}
|
||||||
onChange={(e) => onPrebufferSecsChange(parseFloat(e.target.value))}
|
onChange={(e) => onPrebufferSecsChange(parseFloat(e.target.value))}
|
||||||
@@ -232,7 +251,11 @@ export default function GenerationControls({
|
|||||||
{/* Re-buffer threshold */}
|
{/* Re-buffer threshold */}
|
||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<label htmlFor="rebuffer-threshold" className="text-xs font-medium" style={{ color: "var(--foreground)" }}>
|
<label
|
||||||
|
htmlFor="rebuffer-threshold"
|
||||||
|
className="text-xs font-medium"
|
||||||
|
style={{ color: "var(--foreground)" }}
|
||||||
|
>
|
||||||
Re-buffer Threshold
|
Re-buffer Threshold
|
||||||
</label>
|
</label>
|
||||||
<span className="text-xs font-mono" style={{ color: "var(--accent-teal)" }}>
|
<span className="text-xs font-mono" style={{ color: "var(--accent-teal)" }}>
|
||||||
@@ -260,7 +283,11 @@ export default function GenerationControls({
|
|||||||
{/* Resume threshold */}
|
{/* Resume threshold */}
|
||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<label htmlFor="resume-threshold" className="text-xs font-medium" style={{ color: "var(--foreground)" }}>
|
<label
|
||||||
|
htmlFor="resume-threshold"
|
||||||
|
className="text-xs font-medium"
|
||||||
|
style={{ color: "var(--foreground)" }}
|
||||||
|
>
|
||||||
Resume Threshold
|
Resume Threshold
|
||||||
</label>
|
</label>
|
||||||
<span className="text-xs font-mono" style={{ color: "var(--accent-teal)" }}>
|
<span className="text-xs font-mono" style={{ color: "var(--accent-teal)" }}>
|
||||||
@@ -271,7 +298,7 @@ export default function GenerationControls({
|
|||||||
id="resume-threshold"
|
id="resume-threshold"
|
||||||
type="range"
|
type="range"
|
||||||
min={0.5}
|
min={0.5}
|
||||||
max={5.0}
|
max={30.0}
|
||||||
step={0.1}
|
step={0.1}
|
||||||
value={resumeThresholdSecs}
|
value={resumeThresholdSecs}
|
||||||
onChange={(e) => {
|
onChange={(e) => {
|
||||||
@@ -302,7 +329,10 @@ export default function GenerationControls({
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
{serverStatus === "downloading" && (
|
{serverStatus === "downloading" && (
|
||||||
<div className="w-full rounded-full h-1.5 overflow-hidden" style={{ background: "var(--border)" }}>
|
<div
|
||||||
|
className="w-full rounded-full h-1.5 overflow-hidden"
|
||||||
|
style={{ background: "var(--border)" }}
|
||||||
|
>
|
||||||
<div
|
<div
|
||||||
className="h-1.5 rounded-full transition-all duration-500"
|
className="h-1.5 rounded-full transition-all duration-500"
|
||||||
style={{
|
style={{
|
||||||
@@ -315,10 +345,16 @@ export default function GenerationControls({
|
|||||||
)}
|
)}
|
||||||
|
|
||||||
{serverStatus === "loading" && (
|
{serverStatus === "loading" && (
|
||||||
<div className="w-full rounded-full h-1.5 overflow-hidden" style={{ background: "var(--border)" }}>
|
<div
|
||||||
|
className="w-full rounded-full h-1.5 overflow-hidden"
|
||||||
|
style={{ background: "var(--border)" }}
|
||||||
|
>
|
||||||
<div
|
<div
|
||||||
className="h-1.5 rounded-full animate-pulse"
|
className="h-1.5 rounded-full animate-pulse"
|
||||||
style={{ width: "60%", background: "linear-gradient(90deg, #fbbf24, var(--accent-teal))" }}
|
style={{
|
||||||
|
width: "60%",
|
||||||
|
background: "linear-gradient(90deg, #fbbf24, var(--accent-teal))",
|
||||||
|
}}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
@@ -328,11 +364,17 @@ export default function GenerationControls({
|
|||||||
{/* Generation progress bar */}
|
{/* Generation progress bar */}
|
||||||
{isGenerating && (
|
{isGenerating && (
|
||||||
<div className="flex flex-col gap-1.5">
|
<div className="flex flex-col gap-1.5">
|
||||||
<div className="flex items-center justify-between text-xs" style={{ color: "var(--muted)" }}>
|
<div
|
||||||
|
className="flex items-center justify-between text-xs"
|
||||||
|
style={{ color: "var(--muted)" }}
|
||||||
|
>
|
||||||
<span>{genElapsed}s elapsed</span>
|
<span>{genElapsed}s elapsed</span>
|
||||||
<span>{genPct !== null ? `${genPct}%` : "starting..."}</span>
|
<span>{genPct !== null ? `${genPct}%` : "starting..."}</span>
|
||||||
</div>
|
</div>
|
||||||
<div className="w-full rounded-full h-1.5 overflow-hidden" style={{ background: "var(--border)" }}>
|
<div
|
||||||
|
className="w-full rounded-full h-1.5 overflow-hidden"
|
||||||
|
style={{ background: "var(--border)" }}
|
||||||
|
>
|
||||||
<div
|
<div
|
||||||
className="h-1.5 rounded-full transition-all duration-500"
|
className="h-1.5 rounded-full transition-all duration-500"
|
||||||
style={{
|
style={{
|
||||||
@@ -355,7 +397,8 @@ export default function GenerationControls({
|
|||||||
buttonDisabled
|
buttonDisabled
|
||||||
? { background: "var(--border)", color: "var(--muted)" }
|
? { background: "var(--border)", color: "var(--muted)" }
|
||||||
: {
|
: {
|
||||||
background: "linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
|
background:
|
||||||
|
"linear-gradient(135deg, var(--accent-teal-dim), var(--accent-violet-dim))",
|
||||||
color: "#fff",
|
color: "#fff",
|
||||||
boxShadow: "0 4px 15px rgba(45, 212, 191, 0.2)",
|
boxShadow: "0 4px 15px rgba(45, 212, 191, 0.2)",
|
||||||
}
|
}
|
||||||
@@ -373,7 +416,13 @@ export default function GenerationControls({
|
|||||||
</>
|
</>
|
||||||
) : (
|
) : (
|
||||||
<>
|
<>
|
||||||
<svg className="w-4 h-4" viewBox="0 0 24 24" fill="none" stroke="currentColor" strokeWidth="2">
|
<svg
|
||||||
|
className="w-4 h-4"
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
fill="none"
|
||||||
|
stroke="currentColor"
|
||||||
|
strokeWidth="2"
|
||||||
|
>
|
||||||
<polygon points="5 3 19 12 5 21 5 3" />
|
<polygon points="5 3 19 12 5 21 5 3" />
|
||||||
</svg>
|
</svg>
|
||||||
Generate Audio
|
Generate Audio
|
||||||
|
|||||||
+22
-25
@@ -6,8 +6,8 @@ type ServerStatus = "checking" | "downloading" | "loading" | "online" | "error"
|
|||||||
type Device = "cpu" | "cuda" | null;
|
type Device = "cpu" | "cuda" | null;
|
||||||
|
|
||||||
// Polling intervals: poll quickly until the server is online, then slow down.
|
// Polling intervals: poll quickly until the server is online, then slow down.
|
||||||
const FAST_INTERVAL_MS = 3000; // while checking / loading
|
const FAST_INTERVAL_MS = 3000; // while checking / loading
|
||||||
const SLOW_INTERVAL_MS = 30000; // once online
|
const SLOW_INTERVAL_MS = 30000; // once online
|
||||||
|
|
||||||
export default function Header() {
|
export default function Header() {
|
||||||
const [status, setStatus] = useState<ServerStatus>("checking");
|
const [status, setStatus] = useState<ServerStatus>("checking");
|
||||||
@@ -31,7 +31,10 @@ export default function Header() {
|
|||||||
intervalRef.current = setInterval(checkHealth, SLOW_INTERVAL_MS);
|
intervalRef.current = setInterval(checkHealth, SLOW_INTERVAL_MS);
|
||||||
}
|
}
|
||||||
// Switch to fast polling if we detect the server went offline/loading
|
// Switch to fast polling if we detect the server went offline/loading
|
||||||
if ((newStatus === "offline" || newStatus === "downloading" || newStatus === "loading") && intervalRef.current) {
|
if (
|
||||||
|
(newStatus === "offline" || newStatus === "downloading" || newStatus === "loading") &&
|
||||||
|
intervalRef.current
|
||||||
|
) {
|
||||||
clearInterval(intervalRef.current);
|
clearInterval(intervalRef.current);
|
||||||
intervalRef.current = setInterval(checkHealth, FAST_INTERVAL_MS);
|
intervalRef.current = setInterval(checkHealth, FAST_INTERVAL_MS);
|
||||||
}
|
}
|
||||||
@@ -95,23 +98,20 @@ export default function Header() {
|
|||||||
const cfg = statusConfig[status];
|
const cfg = statusConfig[status];
|
||||||
|
|
||||||
// Device badge — only shown once the server is online and device is known
|
// Device badge — only shown once the server is online and device is known
|
||||||
const deviceBadge = status === "online" && device ? (
|
const deviceBadge =
|
||||||
<span
|
status === "online" && device ? (
|
||||||
className="px-2 py-0.5 rounded-full text-xs font-semibold tracking-wide uppercase"
|
<span
|
||||||
style={{
|
className="px-2 py-0.5 rounded-full text-xs font-semibold tracking-wide uppercase"
|
||||||
background: device === "cuda"
|
style={{
|
||||||
? "var(--accent-violet-dim)"
|
background: device === "cuda" ? "var(--accent-violet-dim)" : "var(--accent-teal-dim)",
|
||||||
: "var(--accent-teal-dim)",
|
color: device === "cuda" ? "var(--accent-violet)" : "var(--accent-teal)",
|
||||||
color: device === "cuda"
|
border: `1px solid ${device === "cuda" ? "var(--accent-violet-dim)" : "var(--accent-teal-dim)"}`,
|
||||||
? "var(--accent-violet)"
|
}}
|
||||||
: "var(--accent-teal)",
|
title={device === "cuda" ? "Running on NVIDIA GPU" : "Running on CPU"}
|
||||||
border: `1px solid ${device === "cuda" ? "var(--accent-violet-dim)" : "var(--accent-teal-dim)"}`,
|
>
|
||||||
}}
|
{device.toUpperCase()}
|
||||||
title={device === "cuda" ? "Running on NVIDIA GPU" : "Running on CPU"}
|
</span>
|
||||||
>
|
) : null;
|
||||||
{device.toUpperCase()}
|
|
||||||
</span>
|
|
||||||
) : null;
|
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<header
|
<header
|
||||||
@@ -136,8 +136,7 @@ export default function Header() {
|
|||||||
<h1
|
<h1
|
||||||
className="text-xl font-bold tracking-tight"
|
className="text-xl font-bold tracking-tight"
|
||||||
style={{
|
style={{
|
||||||
background:
|
background: "linear-gradient(135deg, var(--accent-teal), var(--accent-violet))",
|
||||||
"linear-gradient(135deg, var(--accent-teal), var(--accent-violet))",
|
|
||||||
WebkitBackgroundClip: "text",
|
WebkitBackgroundClip: "text",
|
||||||
WebkitTextFillColor: "transparent",
|
WebkitTextFillColor: "transparent",
|
||||||
}}
|
}}
|
||||||
@@ -167,9 +166,7 @@ export default function Header() {
|
|||||||
className={`animate-ping absolute inline-flex h-full w-full rounded-full opacity-75 ${cfg.color}`}
|
className={`animate-ping absolute inline-flex h-full w-full rounded-full opacity-75 ${cfg.color}`}
|
||||||
/>
|
/>
|
||||||
)}
|
)}
|
||||||
<span
|
<span className={`relative inline-flex rounded-full h-2 w-2 ${cfg.color}`} />
|
||||||
className={`relative inline-flex rounded-full h-2 w-2 ${cfg.color}`}
|
|
||||||
/>
|
|
||||||
</span>
|
</span>
|
||||||
<span style={{ color: "var(--foreground)" }}>{cfg.label}</span>
|
<span style={{ color: "var(--foreground)" }}>{cfg.label}</span>
|
||||||
</div>
|
</div>
|
||||||
|
|||||||
@@ -47,8 +47,7 @@ export default function StatusLog({ messages }: StatusLogProps) {
|
|||||||
) : (
|
) : (
|
||||||
messages.map((msg, i) => {
|
messages.map((msg, i) => {
|
||||||
const isError =
|
const isError =
|
||||||
msg.toLowerCase().includes("error") ||
|
msg.toLowerCase().includes("error") || msg.toLowerCase().includes("failed");
|
||||||
msg.toLowerCase().includes("failed");
|
|
||||||
const isSuccess =
|
const isSuccess =
|
||||||
msg.toLowerCase().includes("done") ||
|
msg.toLowerCase().includes("done") ||
|
||||||
msg.toLowerCase().includes("complete") ||
|
msg.toLowerCase().includes("complete") ||
|
||||||
|
|||||||
@@ -15,10 +15,7 @@ interface TextInputPanelProps {
|
|||||||
onChange: (text: string) => void;
|
onChange: (text: string) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
export default function TextInputPanel({
|
export default function TextInputPanel({ value, onChange }: TextInputPanelProps) {
|
||||||
value,
|
|
||||||
onChange,
|
|
||||||
}: TextInputPanelProps) {
|
|
||||||
const charCount = value.length;
|
const charCount = value.length;
|
||||||
const wordCount = value.trim() === "" ? 0 : value.trim().split(/\s+/).length;
|
const wordCount = value.trim() === "" ? 0 : value.trim().split(/\s+/).length;
|
||||||
|
|
||||||
@@ -43,15 +40,12 @@ export default function TextInputPanel({
|
|||||||
color: "var(--muted)",
|
color: "var(--muted)",
|
||||||
}}
|
}}
|
||||||
onMouseEnter={(e) => {
|
onMouseEnter={(e) => {
|
||||||
(e.target as HTMLButtonElement).style.color =
|
(e.target as HTMLButtonElement).style.color = "var(--accent-violet)";
|
||||||
"var(--accent-violet)";
|
(e.target as HTMLButtonElement).style.borderColor = "var(--accent-violet)";
|
||||||
(e.target as HTMLButtonElement).style.borderColor =
|
|
||||||
"var(--accent-violet)";
|
|
||||||
}}
|
}}
|
||||||
onMouseLeave={(e) => {
|
onMouseLeave={(e) => {
|
||||||
(e.target as HTMLButtonElement).style.color = "var(--muted)";
|
(e.target as HTMLButtonElement).style.color = "var(--muted)";
|
||||||
(e.target as HTMLButtonElement).style.borderColor =
|
(e.target as HTMLButtonElement).style.borderColor = "var(--border)";
|
||||||
"var(--border)";
|
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
Load sample script
|
Load sample script
|
||||||
@@ -69,8 +63,7 @@ export default function TextInputPanel({
|
|||||||
}}
|
}}
|
||||||
onMouseLeave={(e) => {
|
onMouseLeave={(e) => {
|
||||||
(e.target as HTMLButtonElement).style.color = "var(--muted)";
|
(e.target as HTMLButtonElement).style.color = "var(--muted)";
|
||||||
(e.target as HTMLButtonElement).style.borderColor =
|
(e.target as HTMLButtonElement).style.borderColor = "var(--border)";
|
||||||
"var(--border)";
|
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
Clear
|
Clear
|
||||||
@@ -98,10 +91,7 @@ export default function TextInputPanel({
|
|||||||
}}
|
}}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
<div
|
<div className="flex items-center justify-between text-xs" style={{ color: "var(--muted)" }}>
|
||||||
className="flex items-center justify-between text-xs"
|
|
||||||
style={{ color: "var(--muted)" }}
|
|
||||||
>
|
|
||||||
<span>
|
<span>
|
||||||
{wordCount} word{wordCount !== 1 ? "s" : ""}
|
{wordCount} word{wordCount !== 1 ? "s" : ""}
|
||||||
</span>
|
</span>
|
||||||
|
|||||||
@@ -55,16 +55,12 @@ export function useAudioPlayer(audioUrl: string | null) {
|
|||||||
() => setState((prev) => ({ ...prev, isPlaying: false, currentTime: 0 })),
|
() => setState((prev) => ({ ...prev, isPlaying: false, currentTime: 0 })),
|
||||||
{ signal }
|
{ signal }
|
||||||
);
|
);
|
||||||
audio.addEventListener(
|
audio.addEventListener("play", () => setState((prev) => ({ ...prev, isPlaying: true })), {
|
||||||
"play",
|
signal,
|
||||||
() => setState((prev) => ({ ...prev, isPlaying: true })),
|
});
|
||||||
{ signal }
|
audio.addEventListener("pause", () => setState((prev) => ({ ...prev, isPlaying: false })), {
|
||||||
);
|
signal,
|
||||||
audio.addEventListener(
|
});
|
||||||
"pause",
|
|
||||||
() => setState((prev) => ({ ...prev, isPlaying: false })),
|
|
||||||
{ signal }
|
|
||||||
);
|
|
||||||
|
|
||||||
return () => {
|
return () => {
|
||||||
audio.pause();
|
audio.pause();
|
||||||
|
|||||||
+174
-126
@@ -3,9 +3,10 @@
|
|||||||
import { useCallback, useEffect, useRef, useState } from "react";
|
import { useCallback, useEffect, useRef, useState } from "react";
|
||||||
|
|
||||||
const SAMPLE_RATE = 24_000;
|
const SAMPLE_RATE = 24_000;
|
||||||
const DEFAULT_PREBUFFER_SECS = 2.0;
|
const DEFAULT_PREBUFFER_SECS = 5.0;
|
||||||
const DEFAULT_REBUFFER_THRESHOLD_SECS = 0.4;
|
const DEFAULT_REBUFFER_THRESHOLD_SECS = 1.0;
|
||||||
const DEFAULT_RESUME_THRESHOLD_SECS = 1.5;
|
const DEFAULT_RESUME_THRESHOLD_SECS = 3.0;
|
||||||
|
const MAX_ADAPTIVE_RESUME_SECS = 30.0;
|
||||||
|
|
||||||
interface GenerateOptions {
|
interface GenerateOptions {
|
||||||
text: string;
|
text: string;
|
||||||
@@ -91,7 +92,7 @@ export function useStreamingGeneration({
|
|||||||
let resumeThresholdSecs = rawResumeThresholdSecs;
|
let resumeThresholdSecs = rawResumeThresholdSecs;
|
||||||
if (resumeThresholdSecs <= rebufferThresholdSecs) {
|
if (resumeThresholdSecs <= rebufferThresholdSecs) {
|
||||||
console.warn(
|
console.warn(
|
||||||
`[useStreamingGeneration] resumeThresholdSecs (${resumeThresholdSecs}) must be greater than rebufferThresholdSecs (${rebufferThresholdSecs}). Clamping resumeThresholdSecs to ${rebufferThresholdSecs + 0.5}.`,
|
`[useStreamingGeneration] resumeThresholdSecs (${resumeThresholdSecs}) must be greater than rebufferThresholdSecs (${rebufferThresholdSecs}). Clamping resumeThresholdSecs to ${rebufferThresholdSecs + 0.5}.`
|
||||||
);
|
);
|
||||||
resumeThresholdSecs = rebufferThresholdSecs + 0.5;
|
resumeThresholdSecs = rebufferThresholdSecs + 0.5;
|
||||||
}
|
}
|
||||||
@@ -104,6 +105,10 @@ export function useStreamingGeneration({
|
|||||||
const isAutoBufferingRef = useRef(false);
|
const isAutoBufferingRef = useRef(false);
|
||||||
const isUserPausedRef = useRef(false);
|
const isUserPausedRef = useRef(false);
|
||||||
const audioUrlRef = useRef<string | null>(null);
|
const audioUrlRef = useRef<string | null>(null);
|
||||||
|
const firstChunkSeenRef = useRef(false);
|
||||||
|
const underrunCountRef = useRef(0);
|
||||||
|
const totalAudioSamplesRef = useRef(0);
|
||||||
|
const adaptiveResumeSecsRef = useRef(DEFAULT_RESUME_THRESHOLD_SECS);
|
||||||
|
|
||||||
const revokeCurrentUrl = useCallback(() => {
|
const revokeCurrentUrl = useCallback(() => {
|
||||||
if (audioUrlRef.current) {
|
if (audioUrlRef.current) {
|
||||||
@@ -122,8 +127,12 @@ export function useStreamingGeneration({
|
|||||||
hasStartedPlaybackRef.current = false;
|
hasStartedPlaybackRef.current = false;
|
||||||
isAutoBufferingRef.current = false;
|
isAutoBufferingRef.current = false;
|
||||||
isUserPausedRef.current = false;
|
isUserPausedRef.current = false;
|
||||||
|
firstChunkSeenRef.current = false;
|
||||||
|
underrunCountRef.current = 0;
|
||||||
|
totalAudioSamplesRef.current = 0;
|
||||||
|
adaptiveResumeSecsRef.current = resumeThresholdSecs;
|
||||||
setIsStreamPaused(false);
|
setIsStreamPaused(false);
|
||||||
}, []);
|
}, [resumeThresholdSecs]);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
@@ -153,139 +162,178 @@ export function useStreamingGeneration({
|
|||||||
hasStartedPlaybackRef.current = true;
|
hasStartedPlaybackRef.current = true;
|
||||||
}, [enqueue]);
|
}, [enqueue]);
|
||||||
|
|
||||||
const handleAudioChunk = useCallback((chunk: Float32Array<ArrayBuffer>) => {
|
const handleAudioChunk = useCallback(
|
||||||
const ctx = audioCtxRef.current;
|
(chunk: Float32Array<ArrayBuffer>) => {
|
||||||
if (!ctx) return;
|
const ctx = audioCtxRef.current;
|
||||||
|
if (!ctx) return;
|
||||||
|
|
||||||
chunksRef.current.push(chunk);
|
chunksRef.current.push(chunk);
|
||||||
|
totalAudioSamplesRef.current += chunk.length;
|
||||||
|
|
||||||
if (!hasStartedPlaybackRef.current) {
|
if (!firstChunkSeenRef.current) {
|
||||||
const bufferedSecs = chunksRef.current.reduce((sum, c) => sum + c.length, 0) / SAMPLE_RATE;
|
firstChunkSeenRef.current = true;
|
||||||
if (bufferedSecs >= prebufferSecs) {
|
onLog("First audio chunk received");
|
||||||
flushBufferedAudio();
|
|
||||||
}
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
enqueue(ctx, chunk);
|
|
||||||
if (isUserPausedRef.current) return;
|
|
||||||
|
|
||||||
const ahead = nextStartTimeRef.current - ctx.currentTime;
|
|
||||||
if (ctx.state === "running" && ahead < rebufferThresholdSecs) {
|
|
||||||
ctx.suspend().catch(() => {});
|
|
||||||
isAutoBufferingRef.current = true;
|
|
||||||
} else if (
|
|
||||||
ctx.state === "suspended" &&
|
|
||||||
isAutoBufferingRef.current &&
|
|
||||||
ahead >= resumeThresholdSecs
|
|
||||||
) {
|
|
||||||
ctx.resume().catch(() => {});
|
|
||||||
isAutoBufferingRef.current = false;
|
|
||||||
}
|
|
||||||
}, [enqueue, flushBufferedAudio, prebufferSecs, rebufferThresholdSecs, resumeThresholdSecs]);
|
|
||||||
|
|
||||||
const generate = useCallback(async (options: GenerateOptions) => {
|
|
||||||
if (!options.text.trim()) return;
|
|
||||||
|
|
||||||
resetPlayback();
|
|
||||||
revokeCurrentUrl();
|
|
||||||
audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE });
|
|
||||||
|
|
||||||
const controller = new AbortController();
|
|
||||||
abortRef.current = controller;
|
|
||||||
|
|
||||||
onStart();
|
|
||||||
onLog(`Voice: ${options.speaker}`);
|
|
||||||
onLog(`CFG ${options.cfgScale.toFixed(1)}, steps ${options.inferenceSteps}`);
|
|
||||||
|
|
||||||
const startedAt = Date.now();
|
|
||||||
const timerId = window.setInterval(() => {
|
|
||||||
onProgress((Date.now() - startedAt) / 1000, null);
|
|
||||||
}, 500);
|
|
||||||
|
|
||||||
try {
|
|
||||||
const res = await fetch("/api/generate", {
|
|
||||||
method: "POST",
|
|
||||||
headers: { "Content-Type": "application/json" },
|
|
||||||
body: JSON.stringify({
|
|
||||||
text: options.text,
|
|
||||||
speaker: options.speaker,
|
|
||||||
cfg_scale: options.cfgScale,
|
|
||||||
inference_steps: options.inferenceSteps,
|
|
||||||
}),
|
|
||||||
signal: controller.signal,
|
|
||||||
});
|
|
||||||
|
|
||||||
if (!res.ok || !res.body) {
|
|
||||||
const err = await res.json().catch(() => ({})) as { error?: string };
|
|
||||||
throw new Error(err.error ?? `HTTP ${res.status}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const reader = res.body.getReader();
|
if (!hasStartedPlaybackRef.current) {
|
||||||
const decoder = new TextDecoder();
|
const bufferedSecs = chunksRef.current.reduce((sum, c) => sum + c.length, 0) / SAMPLE_RATE;
|
||||||
let buffer = "";
|
if (bufferedSecs >= prebufferSecs) {
|
||||||
|
onLog(`Playback started after ${bufferedSecs.toFixed(1)}s buffered`);
|
||||||
|
flushBufferedAudio();
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
while (true) {
|
enqueue(ctx, chunk);
|
||||||
const { done, value } = await reader.read();
|
if (isUserPausedRef.current) return;
|
||||||
if (done) break;
|
|
||||||
|
|
||||||
buffer += decoder.decode(value, { stream: true });
|
const ahead = nextStartTimeRef.current - ctx.currentTime;
|
||||||
const lines = buffer.split("\n");
|
if (ctx.state === "running" && !isAutoBufferingRef.current && ahead < rebufferThresholdSecs) {
|
||||||
buffer = lines.pop() ?? "";
|
isAutoBufferingRef.current = true;
|
||||||
|
underrunCountRef.current += 1;
|
||||||
|
adaptiveResumeSecsRef.current = Math.min(
|
||||||
|
MAX_ADAPTIVE_RESUME_SECS,
|
||||||
|
Math.max(resumeThresholdSecs, prebufferSecs + underrunCountRef.current * 2)
|
||||||
|
);
|
||||||
|
ctx.suspend().catch(() => {});
|
||||||
|
onLog(
|
||||||
|
`Buffer underrun ${underrunCountRef.current}; refilling to ${adaptiveResumeSecsRef.current.toFixed(1)}s`
|
||||||
|
);
|
||||||
|
} else if (isAutoBufferingRef.current && ahead >= adaptiveResumeSecsRef.current) {
|
||||||
|
isAutoBufferingRef.current = false;
|
||||||
|
ctx.resume().catch(() => {});
|
||||||
|
onLog(`Buffer recovered with ${ahead.toFixed(1)}s queued`);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
[enqueue, flushBufferedAudio, onLog, prebufferSecs, rebufferThresholdSecs, resumeThresholdSecs]
|
||||||
|
);
|
||||||
|
|
||||||
for (const line of lines) {
|
const generate = useCallback(
|
||||||
if (!line.startsWith("data: ")) continue;
|
async (options: GenerateOptions) => {
|
||||||
const event = JSON.parse(line.slice(6)) as {
|
if (!options.text.trim()) return;
|
||||||
type: "audio_chunk" | "complete" | "error" | "cancelled";
|
|
||||||
data?: string;
|
|
||||||
elapsed?: number;
|
|
||||||
message?: string;
|
|
||||||
};
|
|
||||||
|
|
||||||
if (event.type === "audio_chunk" && event.data) {
|
resetPlayback();
|
||||||
handleAudioChunk(decodeFloat32Chunk(event.data));
|
revokeCurrentUrl();
|
||||||
} else if (event.type === "complete") {
|
audioCtxRef.current = new AudioContext({ sampleRate: SAMPLE_RATE });
|
||||||
if (!hasStartedPlaybackRef.current) {
|
|
||||||
flushBufferedAudio();
|
const controller = new AbortController();
|
||||||
|
abortRef.current = controller;
|
||||||
|
|
||||||
|
onStart();
|
||||||
|
onLog(`Voice: ${options.speaker}`);
|
||||||
|
onLog(`CFG ${options.cfgScale.toFixed(1)}, steps ${options.inferenceSteps}`);
|
||||||
|
|
||||||
|
const startedAt = Date.now();
|
||||||
|
const timerId = window.setInterval(() => {
|
||||||
|
onProgress((Date.now() - startedAt) / 1000, null);
|
||||||
|
}, 500);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const res = await fetch("/api/generate", {
|
||||||
|
method: "POST",
|
||||||
|
headers: { "Content-Type": "application/json" },
|
||||||
|
body: JSON.stringify({
|
||||||
|
text: options.text,
|
||||||
|
speaker: options.speaker,
|
||||||
|
cfg_scale: options.cfgScale,
|
||||||
|
inference_steps: options.inferenceSteps,
|
||||||
|
}),
|
||||||
|
signal: controller.signal,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok || !res.body) {
|
||||||
|
const err = (await res.json().catch(() => ({}))) as { error?: string };
|
||||||
|
throw new Error(err.error ?? `HTTP ${res.status}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const reader = res.body.getReader();
|
||||||
|
const decoder = new TextDecoder();
|
||||||
|
let buffer = "";
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
const { done, value } = await reader.read();
|
||||||
|
if (done) break;
|
||||||
|
|
||||||
|
buffer += decoder.decode(value, { stream: true });
|
||||||
|
const lines = buffer.split("\n");
|
||||||
|
buffer = lines.pop() ?? "";
|
||||||
|
|
||||||
|
for (const line of lines) {
|
||||||
|
if (!line.startsWith("data: ")) continue;
|
||||||
|
const event = JSON.parse(line.slice(6)) as {
|
||||||
|
type: "audio_chunk" | "complete" | "error" | "cancelled";
|
||||||
|
data?: string;
|
||||||
|
elapsed?: number;
|
||||||
|
audio_secs?: number;
|
||||||
|
realtime_factor?: number | null;
|
||||||
|
chunks?: number;
|
||||||
|
first_chunk_secs?: number | null;
|
||||||
|
max_chunk_gap_secs?: number;
|
||||||
|
message?: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
if (event.type === "audio_chunk" && event.data) {
|
||||||
|
handleAudioChunk(decodeFloat32Chunk(event.data));
|
||||||
|
} else if (event.type === "complete") {
|
||||||
|
if (!hasStartedPlaybackRef.current) {
|
||||||
|
flushBufferedAudio();
|
||||||
|
} else if (isAutoBufferingRef.current) {
|
||||||
|
isAutoBufferingRef.current = false;
|
||||||
|
audioCtxRef.current?.resume().catch(() => {});
|
||||||
|
}
|
||||||
|
const wavBlob = buildWav(mergeFloat32Arrays(chunksRef.current), SAMPLE_RATE);
|
||||||
|
const audioUrl = URL.createObjectURL(wavBlob);
|
||||||
|
audioUrlRef.current = audioUrl;
|
||||||
|
const kb = (wavBlob.size / 1024).toFixed(0);
|
||||||
|
const audioSecs = event.audio_secs ?? totalAudioSamplesRef.current / SAMPLE_RATE;
|
||||||
|
const realtimeFactor =
|
||||||
|
event.realtime_factor ??
|
||||||
|
(event.elapsed && event.elapsed > 0 ? audioSecs / event.elapsed : null);
|
||||||
|
const speedText =
|
||||||
|
realtimeFactor === null ? "" : ` - ${realtimeFactor.toFixed(2)}x realtime`;
|
||||||
|
onLog(
|
||||||
|
`Done in ${event.elapsed}s - ${audioSecs.toFixed(1)}s audio${speedText} - ${kb} KB`
|
||||||
|
);
|
||||||
|
if (event.chunks && event.first_chunk_secs !== undefined) {
|
||||||
|
onLog(
|
||||||
|
`Stream: first chunk ${event.first_chunk_secs}s, ${event.chunks} chunks, max gap ${event.max_chunk_gap_secs}s`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
onSuccess(audioUrl);
|
||||||
|
} else if (event.type === "cancelled") {
|
||||||
|
throw new DOMException("Generation cancelled", "AbortError");
|
||||||
|
} else if (event.type === "error") {
|
||||||
|
throw new Error(event.message ?? "Generation failed");
|
||||||
}
|
}
|
||||||
const wavBlob = buildWav(mergeFloat32Arrays(chunksRef.current), SAMPLE_RATE);
|
|
||||||
const audioUrl = URL.createObjectURL(wavBlob);
|
|
||||||
audioUrlRef.current = audioUrl;
|
|
||||||
const kb = (wavBlob.size / 1024).toFixed(0);
|
|
||||||
onLog(`Done in ${event.elapsed}s - ${kb} KB`);
|
|
||||||
onSuccess(audioUrl);
|
|
||||||
} else if (event.type === "cancelled") {
|
|
||||||
throw new DOMException("Generation cancelled", "AbortError");
|
|
||||||
} else if (event.type === "error") {
|
|
||||||
throw new Error(event.message ?? "Generation failed");
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
} catch (err) {
|
||||||
|
if (err instanceof Error && err.name === "AbortError") {
|
||||||
|
onLog("Cancelled.");
|
||||||
|
onCancel();
|
||||||
|
} else {
|
||||||
|
const message = err instanceof Error ? err.message : "Unknown error";
|
||||||
|
onLog(`Error: ${message}`);
|
||||||
|
onError();
|
||||||
|
}
|
||||||
|
} finally {
|
||||||
|
window.clearInterval(timerId);
|
||||||
|
abortRef.current = null;
|
||||||
}
|
}
|
||||||
} catch (err) {
|
},
|
||||||
if (err instanceof Error && err.name === "AbortError") {
|
[
|
||||||
onLog("Cancelled.");
|
flushBufferedAudio,
|
||||||
onCancel();
|
handleAudioChunk,
|
||||||
} else {
|
onCancel,
|
||||||
const message = err instanceof Error ? err.message : "Unknown error";
|
onError,
|
||||||
onLog(`Error: ${message}`);
|
onLog,
|
||||||
onError();
|
onProgress,
|
||||||
}
|
onStart,
|
||||||
} finally {
|
onSuccess,
|
||||||
window.clearInterval(timerId);
|
resetPlayback,
|
||||||
abortRef.current = null;
|
revokeCurrentUrl,
|
||||||
}
|
]
|
||||||
}, [
|
);
|
||||||
flushBufferedAudio,
|
|
||||||
handleAudioChunk,
|
|
||||||
onCancel,
|
|
||||||
onError,
|
|
||||||
onLog,
|
|
||||||
onProgress,
|
|
||||||
onStart,
|
|
||||||
onSuccess,
|
|
||||||
resetPlayback,
|
|
||||||
revokeCurrentUrl,
|
|
||||||
]);
|
|
||||||
|
|
||||||
const pauseStream = useCallback(() => {
|
const pauseStream = useCallback(() => {
|
||||||
isUserPausedRef.current = true;
|
isUserPausedRef.current = true;
|
||||||
|
|||||||
+4
-2
@@ -4,8 +4,10 @@
|
|||||||
"private": true,
|
"private": true,
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"dev": "next dev --turbopack",
|
"dev": "next dev --turbopack",
|
||||||
"build": "next build --turbopack",
|
"build": "next build",
|
||||||
"start": "next start"
|
"start": "next start",
|
||||||
|
"format": "prettier --write .",
|
||||||
|
"format:check": "prettier --check ."
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"next": "15.5.15",
|
"next": "15.5.15",
|
||||||
|
|||||||
Reference in New Issue
Block a user