# Copy to .env.local and fill in values.
#
# Format: one KEY=value assignment per line. Lines starting with '#' are
# comments; remove the leading '# ' from an optional setting to enable it.

# URL of the Python TTS server (used by Next.js API routes)
VIBEVOICE_SERVER_URL=http://localhost:8000

# HuggingFace token — required if the model repo is private or gated
HF_TOKEN=

# Override the HuggingFace model cache directory (optional)
# HF_HOME=/path/to/hf-cache

# ---------------------------------------------------------------------------
# Runtime tuning
# ---------------------------------------------------------------------------

# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`.
# Uncomment at most one of the two lines below.
# VIBEPOD_DEVICE=cuda
# VIBEPOD_DEVICE=cpu

# CPU mode: keep async decode enabled. This overlaps acoustic decoding with
# language-model work and measured ~20% faster on an 8-thread CPU run.
VIBEPOD_ASYNC_DECODE=1

# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system,
# 8 worker threads with 1 inter-op thread gave the best wall time, while 12
# over-subscribed and regressed.
# VIBEPOD_CPU_THREADS=8
# VIBEPOD_CPU_INTEROP_THREADS=1

# CPU mode: playback buffering (values are in seconds). CPU generation is
# slower than realtime, so smooth streaming needs a larger initial buffer than
# CUDA. Lower these for faster startup if you are OK with occasional
# rebuffering.
# VIBEPOD_PREBUFFER_SECS=24
# VIBEPOD_REBUFFER_THRESHOLD_SECS=2
# VIBEPOD_RESUME_THRESHOLD_SECS=12

# CPU mode: dynamic INT8 quantization is enabled by default in start.sh.
# Set to 0 if you are comparing quality/performance or debugging.
# VIBEPOD_QUANTIZE=1

# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless
# optional FlashAttention is explicitly enabled and importable.
# VIBEPOD_CUDA_DTYPE=bf16
# VIBEPOD_ATTN_IMPL=sdpa
# VIBEPOD_ENABLE_FLASH_ATTN=0

# Debug/profiling. Keep disabled for benchmark timing; async CPU profiling
# double-counts overlapped decode work.
# VIBEPOD_PROFILE_GENERATION=0