diff --git a/.env.example b/.env.example index 537770d..dc32c2b 100644 --- a/.env.example +++ b/.env.example @@ -8,3 +8,42 @@ HF_TOKEN= # Override the HuggingFace model cache directory (optional) # HF_HOME=/path/to/hf-cache + +# --------------------------------------------------------------------------- +# Runtime tuning +# --------------------------------------------------------------------------- + +# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`. +# VIBEPOD_DEVICE=cuda +# VIBEPOD_DEVICE=cpu + +# CPU mode: keep async decode enabled. This overlaps acoustic decoding with +# language-model work and measured ~20% faster on an 8-thread CPU run. +VIBEPOD_ASYNC_DECODE=1 + +# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system, +# 8 worker threads with 1 inter-op thread gave the best wall time, while 12 +# over-subscribed and regressed. +# VIBEPOD_CPU_THREADS=8 +# VIBEPOD_CPU_INTEROP_THREADS=1 + +# CPU mode: playback buffering. CPU generation is slower than realtime, so +# smooth streaming needs a larger initial buffer than CUDA. Lower these for +# faster startup if you are OK with occasional rebuffering. +# VIBEPOD_PREBUFFER_SECS=24 +# VIBEPOD_REBUFFER_THRESHOLD_SECS=2 +# VIBEPOD_RESUME_THRESHOLD_SECS=12 + +# CPU mode: dynamic INT8 quantization is enabled by default in start.sh. +# Set to 0 if you are comparing quality/performance or debugging. +# VIBEPOD_QUANTIZE=1 + +# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless +# optional FlashAttention is explicitly enabled and importable. +# VIBEPOD_CUDA_DTYPE=bf16 +# VIBEPOD_ATTN_IMPL=sdpa +# VIBEPOD_ENABLE_FLASH_ATTN=0 + +# Debug/profiling. Keep disabled for benchmark timing; async CPU profiling +# double-counts overlapped decode work. +# VIBEPOD_PROFILE_GENERATION=0 diff --git a/server/start.sh b/server/start.sh index 995fbc1..060cde5 100755 --- a/server/start.sh +++ b/server/start.sh @@ -136,7 +136,7 @@ if $CPU_MODE; then export VIBEPOD_DEVICE="cpu" export UV_PROJECT_ENVIRONMENT=".venv-cpu" if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then - VIBEPOD_CPU_THREADS="$(uv run --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")" + VIBEPOD_CPU_THREADS="$(uv run --no-sync --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")" export VIBEPOD_CPU_THREADS fi export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}" diff --git a/server/vibevoice_server.py b/server/vibevoice_server.py index 2516a59..74fbd94 100644 --- a/server/vibevoice_server.py +++ b/server/vibevoice_server.py @@ -66,12 +66,10 @@ DEFAULT_SPEAKER = "carter" _IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"] -# ── Pipeline executors ───────────────────────────────────────────────────────── -# _decode_executor: overlaps acoustic_decode with forward_tts_lm (1 worker). -# _cfg_executor: runs positive + negative forward_tts_lm in parallel (1 worker). +# ── Pipeline executor ────────────────────────────────────────────────────────── +# Overlaps acoustic_decode with forward_tts_lm on a background thread (1 worker). _decode_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None -_cfg_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None # ── Device selection ──────────────────────────────────────────────────────────── # VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag. @@ -546,38 +544,37 @@ def _install_generation_optimizations(model: object) -> None: def _install_cpu_pipeline_optimizations(model: object) -> None: - """Attach pipeline executors to the model for the optimised generate() loop. + """Attach the decode executor to the model for the optimised generate() loop. The JezzWTF/VibeVoice fork's generate() checks for two optional attributes: - model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) used to - overlap acoustic_decode with acoustic_connector + forward_tts_lm. + model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) that + overlaps acoustic_decode with acoustic_connector + forward_tts_lm. + Profiling showed this hides ~72s of decode cost behind tts_lm work, + capturing ~96% of the theoretical overlap savings. - model._vibepod_cfg_executor — ThreadPoolExecutor (1 worker) used to - run the positive and negative forward_tts_lm calls in parallel, so - both CFG passes execute concurrently instead of sequentially. + model._vibepod_cfg_executor — intentionally NOT set. Parallel pos/neg + forward_tts_lm via a second thread causes MKL OpenMP thread-pool + contention on CPU: both threads compete for the same OMP worker pool, + making each call slower rather than faster. Net effect: ~6% regression. + The hook remains in the fork for potential GPU or future use. - Both are None by default, making the fork's generate() behave identically - to upstream on CUDA or any machine where these aren't set. + Attributes default to None, so the fork's generate() falls back to the + original sequential behaviour on CUDA or any non-VibePod install. """ - global _decode_executor, _cfg_executor + global _decode_executor if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1": - logger.info("CPU async decode/CFG parallelism disabled via VIBEPOD_ASYNC_DECODE=0.") + logger.info("CPU async decode disabled via VIBEPOD_ASYNC_DECODE=0.") return _decode_executor = concurrent.futures.ThreadPoolExecutor( max_workers=1, thread_name_prefix="vibepod-decode" ) - _cfg_executor = concurrent.futures.ThreadPoolExecutor( - max_workers=1, thread_name_prefix="vibepod-cfg" - ) model._vibepod_decode_executor = _decode_executor - model._vibepod_cfg_executor = _cfg_executor logger.info( - "CPU pipeline: decode executor and CFG executor attached — " - "acoustic_decode overlaps tts_lm, pos/neg CFG runs in parallel. " - "Disable with VIBEPOD_ASYNC_DECODE=0." + "CPU pipeline: decode executor attached — acoustic_decode overlaps " + "tts_lm. Disable with VIBEPOD_ASYNC_DECODE=0." ) @@ -643,9 +640,9 @@ def _load_model_sync() -> None: is_cpu = _device == "cpu" _config["device"] = _device _config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1) - _config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 6.0 if is_cpu else 5.0) - _config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 1.5 if is_cpu else 1.0) - _config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 4.0 if is_cpu else 3.0) + _config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 24.0 if is_cpu else 5.0) + _config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 2.0 if is_cpu else 1.0) + _config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 12.0 if is_cpu else 3.0) _config["default_inference_steps"] = _env_int("VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10) if is_cpu: logical_cpus = os.cpu_count() or 1 @@ -688,8 +685,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: yield if _decode_executor is not None: _decode_executor.shutdown(wait=False) - if _cfg_executor is not None: - _cfg_executor.shutdown(wait=False) app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan) diff --git a/web/components/GenerationControls.tsx b/web/components/GenerationControls.tsx index f9a7d4c..41bce72 100644 --- a/web/components/GenerationControls.tsx +++ b/web/components/GenerationControls.tsx @@ -157,7 +157,7 @@ export default function GenerationControls({