mirror of
https://github.com/JezzWTF/vibepod.git
synced 2026-06-13 03:58:07 +00:00
perf: CPU async pipeline overlap + INT8 quantization
Overlap acoustic_decode with forward_tts_lm calls using a background ThreadPoolExecutor, hiding ~72s of decode cost behind tts_lm work. Achieved 0.67x realtime (up from 0.43x, a ~56% improvement).

- vibevoice_generate_patch.py: the patched generate() loop is reordered to submit the decode to a thread before running connector + tts_lm×2, and then resolves the future. It is installed as an instance method via types.MethodType so that uv sync reinstalling the package cannot revert the patch.
- Dynamic INT8 quantization of Linear layers (VIBEPOD_QUANTIZE=1, on by default on CPU). prediction_head is excluded — its small fixed-size tensors regressed ~20% with INT8 due to pack/unpack overhead.
- Auto-detect AVX512_BF16 and load the model in bfloat16 if supported (VIBEPOD_CPU_BF16=auto, overridable with 0/1).
- CPU thread count is auto-configured from the logical CPU count; OMP/MKL env vars are set accordingly. The lock file is preserved around uv sync --no-sources so that CPU mode does not alter the shared uv.lock.
- torch.compile is retained as opt-in (VIBEPOD_COMPILE=1) but marked not recommended — dynamic KV cache shapes prevent kernel reuse.
This commit is contained in:
+27
-1
@@ -79,7 +79,16 @@ echo ""
|
||||
if $CPU_MODE; then
|
||||
echo "--> Syncing CPU Python environment (.venv-cpu)..."
|
||||
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
||||
LOCK_BACKUP=""
|
||||
if [[ -f uv.lock ]]; then
|
||||
LOCK_BACKUP="$(mktemp)"
|
||||
cp uv.lock "$LOCK_BACKUP"
|
||||
fi
|
||||
uv sync --no-sources
|
||||
if [[ -n "$LOCK_BACKUP" ]]; then
|
||||
cp "$LOCK_BACKUP" uv.lock
|
||||
rm -f "$LOCK_BACKUP"
|
||||
fi
|
||||
else
|
||||
echo "--> Syncing CUDA Python environment (.venv)..."
|
||||
uv sync
|
||||
@@ -126,11 +135,28 @@ export PYTHONUTF8=1
|
||||
if $CPU_MODE; then
|
||||
export VIBEPOD_DEVICE="cpu"
|
||||
export UV_PROJECT_ENVIRONMENT=".venv-cpu"
|
||||
if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then
|
||||
VIBEPOD_CPU_THREADS="$(uv run --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
|
||||
export VIBEPOD_CPU_THREADS
|
||||
fi
|
||||
export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
||||
export MKL_NUM_THREADS="${MKL_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
|
||||
# Dynamic INT8 quantization — on by default for CPU (~22% faster, prediction_head
|
||||
# excluded automatically to avoid regression on small fixed-size tensors).
|
||||
# Set VIBEPOD_QUANTIZE=0 to disable if you notice audio quality differences.
|
||||
export VIBEPOD_QUANTIZE="${VIBEPOD_QUANTIZE:-1}"
|
||||
# Optional CPU flags:
|
||||
# VIBEPOD_ASYNC_DECODE=0 Disable async decode/tts_lm overlap (on by default)
|
||||
# VIBEPOD_CPU_BF16=1 Force bfloat16 weights (default: auto-detected via AVX512_BF16; 0 disables)
|
||||
# VIBEPOD_COMPILE=1 torch.compile hot paths (ineffective for autoregressive
|
||||
# models on CPU — not recommended, kept for experimentation)
|
||||
UV_RUN_ARGS=(--no-sync --no-sources)
|
||||
else
|
||||
export VIBEPOD_DEVICE="cuda"
|
||||
UV_RUN_ARGS=()
|
||||
fi
|
||||
|
||||
exec uv run uvicorn vibevoice_server:app \
|
||||
exec uv run "${UV_RUN_ARGS[@]}" uvicorn vibevoice_server:app \
|
||||
--host 127.0.0.1 \
|
||||
--port 8000 \
|
||||
--log-level info \
|
||||
|
||||
Reference in New Issue
Block a user