perf: migrate to JezzWTF/VibeVoice fork, parallel CFG executors

Switch vibevoice dependency from microsoft/VibeVoice to JezzWTF/VibeVoice
fork (commit e76701f) which contains the async decode + parallel CFG
optimisations directly in generate(). Removes the instance-method
patching approach (vibevoice_generate_patch.py deleted).

server/vibevoice_server.py:
- Add _cfg_executor (ThreadPoolExecutor, 1 worker) alongside _decode_executor
- _install_cpu_pipeline_optimizations now sets both executors directly as
  model._vibepod_decode_executor and model._vibepod_cfg_executor
- Both executors are shut down in lifespan on exit
- Remove vibevoice_generate_patch import/install (no longer needed)

server/pyproject.toml:
- vibevoice source changed to git+https://github.com/JezzWTF/VibeVoice.git
- No machine-local paths; works identically on any clone
This commit is contained in:
2026-04-30 21:30:07 +01:00
parent 7591d15a52
commit 98e2bf9237
4 changed files with 36 additions and 496 deletions
+26 -24
View File
@@ -66,9 +66,12 @@ DEFAULT_SPEAKER = "carter"
_IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
# ── Decode pipeline executor ────────────────────────────────────────────────────
# ── Pipeline executors ─────────────────────────────────────────────────────────
# _decode_executor: overlaps acoustic_decode with forward_tts_lm (1 worker).
# _cfg_executor: runs positive + negative forward_tts_lm in parallel (1 worker).
_decode_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
_cfg_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
# ── Device selection ────────────────────────────────────────────────────────────
# VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
@@ -543,41 +546,38 @@ def _install_generation_optimizations(model: object) -> None:
def _install_cpu_pipeline_optimizations(model: object) -> None:
"""Install the async-decode generate() patch and its thread pool on the model instance.
"""Attach pipeline executors to the model for the optimised generate() loop.
The VibeVoice inner loop runs:
decode(speech_latent) → append → put → connector → tts_lm(pos) → tts_lm(neg)
The JezzWTF/VibeVoice fork's generate() checks for two optional attributes:
connector and both tts_lm calls only need speech_latent/acoustic_embed, not
audio_chunk. The patched generate() reorders this to:
submit decode to thread → connector → tts_lm(pos) → tts_lm(neg)
→ wait for decode future → append → put
model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) used to
overlap acoustic_decode with acoustic_connector + forward_tts_lm.
The patch is applied as an instance method via types.MethodType, which shadows
the class-level generate() and is immune to uv sync reinstalling the package.
model._vibepod_cfg_executor — ThreadPoolExecutor (1 worker) used to
run the positive and negative forward_tts_lm calls in parallel, so
both CFG passes execute concurrently instead of sequentially.
Both are None by default, making the fork's generate() behave identically
to upstream on CUDA or any machine where these aren't set.
"""
global _decode_executor
global _decode_executor, _cfg_executor
if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1":
logger.info("CPU async decode disabled via VIBEPOD_ASYNC_DECODE=0.")
return
try:
import vibevoice_generate_patch
except ImportError:
logger.warning(
"vibevoice_generate_patch not found — async decode unavailable. "
"Ensure vibevoice_generate_patch.py is in the server directory."
)
logger.info("CPU async decode/CFG parallelism disabled via VIBEPOD_ASYNC_DECODE=0.")
return
_decode_executor = concurrent.futures.ThreadPoolExecutor(
max_workers=1, thread_name_prefix="vibepod-decode"
)
vibevoice_generate_patch.install(model, _decode_executor)
_cfg_executor = concurrent.futures.ThreadPoolExecutor(
max_workers=1, thread_name_prefix="vibepod-cfg"
)
model._vibepod_decode_executor = _decode_executor
model._vibepod_cfg_executor = _cfg_executor
logger.info(
"CPU pipeline: patched generate() installed (async decode enabled)"
"acoustic_decode overlaps forward_tts_lm. Disable with VIBEPOD_ASYNC_DECODE=0."
"CPU pipeline: decode executor and CFG executor attached — "
"acoustic_decode overlaps tts_lm, pos/neg CFG runs in parallel. "
"Disable with VIBEPOD_ASYNC_DECODE=0."
)
@@ -688,6 +688,8 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
yield
if _decode_executor is not None:
_decode_executor.shutdown(wait=False)
if _cfg_executor is not None:
_cfg_executor.shutdown(wait=False)
app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)