# Copy to .env.local and fill in values

# URL of the Python TTS server (used by Next.js API routes)
VIBEVOICE_SERVER_URL=http://localhost:8000

# HuggingFace token — required if the model repo is private or gated
HF_TOKEN=

# Override the HuggingFace model cache directory (optional)
# HF_HOME=/path/to/hf-cache

# ---------------------------------------------------------------------------
# Runtime tuning
# ---------------------------------------------------------------------------

# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`.
# VIBEPOD_DEVICE=cuda
# VIBEPOD_DEVICE=cpu

# CPU mode: keep async decode enabled. This overlaps acoustic decoding with
# language-model work and was measured to be ~20% faster on an 8-thread CPU run.
VIBEPOD_ASYNC_DECODE=1

# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system,
# 8 worker threads with 1 inter-op thread gave the best wall time, while
# 12 worker threads over-subscribed the CPU and regressed.
# VIBEPOD_CPU_THREADS=8
# VIBEPOD_CPU_INTEROP_THREADS=1

# CPU mode: playback buffering. CPU generation is slower than realtime, so
# smooth streaming needs a larger initial buffer than CUDA. Lower these for
# faster startup if you are OK with occasional rebuffering.
# VIBEPOD_PREBUFFER_SECS=24
# VIBEPOD_REBUFFER_THRESHOLD_SECS=2
# VIBEPOD_RESUME_THRESHOLD_SECS=12

# CPU mode: dynamic INT8 quantization is enabled by default in start.sh.
# Set to 0 if you are comparing quality/performance or debugging.
# VIBEPOD_QUANTIZE=1

# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless
# optional FlashAttention is explicitly enabled and importable.
# VIBEPOD_CUDA_DTYPE=bf16
# VIBEPOD_ATTN_IMPL=sdpa
# VIBEPOD_ENABLE_FLASH_ATTN=0

# Debug/profiling. Keep disabled when measuring benchmark timings: with async
# decode on CPU, profiling double-counts the overlapped decode work.
# VIBEPOD_PROFILE_GENERATION=0