mirror of
https://github.com/JezzWTF/vibepod.git
synced 2026-06-01 15:22:14 +00:00
perf(cpu): tune streaming playback
Keep CPU async decode enabled without CFG parallelism, expand CPU buffering defaults for smooth playback, prevent CPU startup from mutating the lockfile during thread autodetection, and document runtime tuning variables in the example environment file.
This commit is contained in:
@@ -8,3 +8,42 @@ HF_TOKEN=
|
|||||||
|
|
||||||
# Override the HuggingFace model cache directory (optional)
|
# Override the HuggingFace model cache directory (optional)
|
||||||
# HF_HOME=/path/to/hf-cache
|
# HF_HOME=/path/to/hf-cache
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Runtime tuning
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`.
|
||||||
|
# VIBEPOD_DEVICE=cuda
|
||||||
|
# VIBEPOD_DEVICE=cpu
|
||||||
|
|
||||||
|
# CPU mode: keep async decode enabled. This overlaps acoustic decoding with
|
||||||
|
# language-model work and was measured to be ~20% faster on an 8-thread CPU run.
|
||||||
|
VIBEPOD_ASYNC_DECODE=1
|
||||||
|
|
||||||
|
# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system,
|
||||||
|
# 8 worker threads with 1 inter-op thread gave the best wall time, while 12
|
||||||
|
# worker threads over-subscribed the CPU and regressed.
|
||||||
|
# VIBEPOD_CPU_THREADS=8
|
||||||
|
# VIBEPOD_CPU_INTEROP_THREADS=1
|
||||||
|
|
||||||
|
# CPU mode: playback buffering. CPU generation is slower than realtime, so
|
||||||
|
# smooth streaming needs a larger initial buffer than CUDA. Lower these for
|
||||||
|
# faster startup if you are OK with occasional rebuffering.
|
||||||
|
# VIBEPOD_PREBUFFER_SECS=24
|
||||||
|
# VIBEPOD_REBUFFER_THRESHOLD_SECS=2
|
||||||
|
# VIBEPOD_RESUME_THRESHOLD_SECS=12
|
||||||
|
|
||||||
|
# CPU mode: dynamic INT8 quantization is enabled by default in start.sh.
|
||||||
|
# Set to 0 if you are comparing quality/performance or debugging.
|
||||||
|
# VIBEPOD_QUANTIZE=1
|
||||||
|
|
||||||
|
# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless
|
||||||
|
# optional FlashAttention is explicitly enabled and importable.
|
||||||
|
# VIBEPOD_CUDA_DTYPE=bf16
|
||||||
|
# VIBEPOD_ATTN_IMPL=sdpa
|
||||||
|
# VIBEPOD_ENABLE_FLASH_ATTN=0
|
||||||
|
|
||||||
|
# Debug/profiling. Keep disabled for benchmark timing; async CPU profiling
|
||||||
|
# double-counts overlapped decode work.
|
||||||
|
# VIBEPOD_PROFILE_GENERATION=0
|
||||||
|
|||||||
+1
-1
@@ -136,7 +136,7 @@ if $CPU_MODE; then
|
|||||||
export VIBEPOD_DEVICE="cpu"
|
export VIBEPOD_DEVICE="cpu"
|
||||||
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
||||||
if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then
|
if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then
|
||||||
VIBEPOD_CPU_THREADS="$(uv run --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
|
VIBEPOD_CPU_THREADS="$(uv run --no-sync --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
|
||||||
export VIBEPOD_CPU_THREADS
|
export VIBEPOD_CPU_THREADS
|
||||||
fi
|
fi
|
||||||
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
||||||
|
|||||||
+21
-26
@@ -66,12 +66,10 @@ DEFAULT_SPEAKER = "carter"
|
|||||||
|
|
||||||
_IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
|
_IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
|
||||||
|
|
||||||
# ── Pipeline executors ─────────────────────────────────────────────────────────
|
# ── Pipeline executor ──────────────────────────────────────────────────────────
|
||||||
# _decode_executor: overlaps acoustic_decode with forward_tts_lm (1 worker).
|
# Overlaps acoustic_decode with forward_tts_lm on a background thread (1 worker).
|
||||||
# _cfg_executor: runs positive + negative forward_tts_lm in parallel (1 worker).
|
|
||||||
|
|
||||||
_decode_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
|
_decode_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
|
||||||
_cfg_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
|
|
||||||
|
|
||||||
# ── Device selection ────────────────────────────────────────────────────────────
|
# ── Device selection ────────────────────────────────────────────────────────────
|
||||||
# VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
|
# VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
|
||||||
@@ -546,38 +544,37 @@ def _install_generation_optimizations(model: object) -> None:
|
|||||||
|
|
||||||
|
|
||||||
def _install_cpu_pipeline_optimizations(model: object) -> None:
|
def _install_cpu_pipeline_optimizations(model: object) -> None:
|
||||||
"""Attach pipeline executors to the model for the optimised generate() loop.
|
"""Attach the decode executor to the model for the optimised generate() loop.
|
||||||
|
|
||||||
The JezzWTF/VibeVoice fork's generate() checks for two optional attributes:
|
The JezzWTF/VibeVoice fork's generate() checks for two optional attributes:
|
||||||
|
|
||||||
model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) used to
|
model._vibepod_decode_executor — ThreadPoolExecutor (1 worker) that
|
||||||
overlap acoustic_decode with acoustic_connector + forward_tts_lm.
|
overlaps acoustic_decode with acoustic_connector + forward_tts_lm.
|
||||||
|
Profiling showed this hides ~72s of decode cost behind tts_lm work,
|
||||||
|
capturing ~96% of the theoretical overlap savings.
|
||||||
|
|
||||||
model._vibepod_cfg_executor — ThreadPoolExecutor (1 worker) used to
|
model._vibepod_cfg_executor — intentionally NOT set. Parallel pos/neg
|
||||||
run the positive and negative forward_tts_lm calls in parallel, so
|
forward_tts_lm via a second thread causes MKL OpenMP thread-pool
|
||||||
both CFG passes execute concurrently instead of sequentially.
|
contention on CPU: both threads compete for the same OMP worker pool,
|
||||||
|
making each call slower rather than faster. Net effect: ~6% regression.
|
||||||
|
The hook remains in the fork for potential GPU or future use.
|
||||||
|
|
||||||
Both are None by default, making the fork's generate() behave identically
|
Attributes default to None, so the fork's generate() falls back to the
|
||||||
to upstream on CUDA or any machine where these aren't set.
|
original sequential behaviour on CUDA or any non-VibePod install.
|
||||||
"""
|
"""
|
||||||
global _decode_executor, _cfg_executor
|
global _decode_executor
|
||||||
|
|
||||||
if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1":
|
if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1":
|
||||||
logger.info("CPU async decode/CFG parallelism disabled via VIBEPOD_ASYNC_DECODE=0.")
|
logger.info("CPU async decode disabled via VIBEPOD_ASYNC_DECODE=0.")
|
||||||
return
|
return
|
||||||
|
|
||||||
_decode_executor = concurrent.futures.ThreadPoolExecutor(
|
_decode_executor = concurrent.futures.ThreadPoolExecutor(
|
||||||
max_workers=1, thread_name_prefix="vibepod-decode"
|
max_workers=1, thread_name_prefix="vibepod-decode"
|
||||||
)
|
)
|
||||||
_cfg_executor = concurrent.futures.ThreadPoolExecutor(
|
|
||||||
max_workers=1, thread_name_prefix="vibepod-cfg"
|
|
||||||
)
|
|
||||||
model._vibepod_decode_executor = _decode_executor
|
model._vibepod_decode_executor = _decode_executor
|
||||||
model._vibepod_cfg_executor = _cfg_executor
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"CPU pipeline: decode executor and CFG executor attached — "
|
"CPU pipeline: decode executor attached — acoustic_decode overlaps "
|
||||||
"acoustic_decode overlaps tts_lm, pos/neg CFG runs in parallel. "
|
"tts_lm. Disable with VIBEPOD_ASYNC_DECODE=0."
|
||||||
"Disable with VIBEPOD_ASYNC_DECODE=0."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -643,9 +640,9 @@ def _load_model_sync() -> None:
|
|||||||
is_cpu = _device == "cpu"
|
is_cpu = _device == "cpu"
|
||||||
_config["device"] = _device
|
_config["device"] = _device
|
||||||
_config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1)
|
_config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1)
|
||||||
_config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 6.0 if is_cpu else 5.0)
|
_config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 24.0 if is_cpu else 5.0)
|
||||||
_config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 1.5 if is_cpu else 1.0)
|
_config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 2.0 if is_cpu else 1.0)
|
||||||
_config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 4.0 if is_cpu else 3.0)
|
_config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 12.0 if is_cpu else 3.0)
|
||||||
_config["default_inference_steps"] = _env_int("VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10)
|
_config["default_inference_steps"] = _env_int("VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10)
|
||||||
if is_cpu:
|
if is_cpu:
|
||||||
logical_cpus = os.cpu_count() or 1
|
logical_cpus = os.cpu_count() or 1
|
||||||
@@ -688,8 +685,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
|
|||||||
yield
|
yield
|
||||||
if _decode_executor is not None:
|
if _decode_executor is not None:
|
||||||
_decode_executor.shutdown(wait=False)
|
_decode_executor.shutdown(wait=False)
|
||||||
if _cfg_executor is not None:
|
|
||||||
_cfg_executor.shutdown(wait=False)
|
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)
|
app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)
|
||||||
|
|||||||
@@ -157,7 +157,7 @@ export default function GenerationControls({
|
|||||||
<div className="flex flex-col gap-2">
|
<div className="flex flex-col gap-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
|
<label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
|
||||||
Quality vs Speed
|
Speed vs Quality
|
||||||
</label>
|
</label>
|
||||||
<span
|
<span
|
||||||
className="text-sm font-mono px-2 py-0.5 rounded"
|
className="text-sm font-mono px-2 py-0.5 rounded"
|
||||||
@@ -221,7 +221,7 @@ export default function GenerationControls({
|
|||||||
<input
|
<input
|
||||||
type="range"
|
type="range"
|
||||||
min={0.5}
|
min={0.5}
|
||||||
max={10.0}
|
max={30.0}
|
||||||
step={0.5}
|
step={0.5}
|
||||||
value={prebufferSecs}
|
value={prebufferSecs}
|
||||||
onChange={(e) => onPrebufferSecsChange(parseFloat(e.target.value))}
|
onChange={(e) => onPrebufferSecsChange(parseFloat(e.target.value))}
|
||||||
@@ -271,7 +271,7 @@ export default function GenerationControls({
|
|||||||
id="resume-threshold"
|
id="resume-threshold"
|
||||||
type="range"
|
type="range"
|
||||||
min={0.5}
|
min={0.5}
|
||||||
max={5.0}
|
max={30.0}
|
||||||
step={0.1}
|
step={0.1}
|
||||||
value={resumeThresholdSecs}
|
value={resumeThresholdSecs}
|
||||||
onChange={(e) => {
|
onChange={(e) => {
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ const SAMPLE_RATE = 24_000;
|
|||||||
const DEFAULT_PREBUFFER_SECS = 5.0;
|
const DEFAULT_PREBUFFER_SECS = 5.0;
|
||||||
const DEFAULT_REBUFFER_THRESHOLD_SECS = 1.0;
|
const DEFAULT_REBUFFER_THRESHOLD_SECS = 1.0;
|
||||||
const DEFAULT_RESUME_THRESHOLD_SECS = 3.0;
|
const DEFAULT_RESUME_THRESHOLD_SECS = 3.0;
|
||||||
const MAX_ADAPTIVE_RESUME_SECS = 18.0;
|
const MAX_ADAPTIVE_RESUME_SECS = 30.0;
|
||||||
|
|
||||||
interface GenerateOptions {
|
interface GenerateOptions {
|
||||||
text: string;
|
text: string;
|
||||||
|
|||||||
Reference in New Issue
Block a user