diff --git a/.env.example b/.env.example
index 537770d..dc32c2b 100644
--- a/.env.example
+++ b/.env.example
@@ -8,3 +8,42 @@ HF_TOKEN=
 
 # Override the HuggingFace model cache directory (optional)
 # HF_HOME=/path/to/hf-cache
+
+# ---------------------------------------------------------------------------
+# Runtime tuning
+# ---------------------------------------------------------------------------
+
+# Force the Python server device. Usually set by `pnpm dev` / `pnpm dev:cpu`.
+# VIBEPOD_DEVICE=cuda
+# VIBEPOD_DEVICE=cpu
+
+# CPU mode: async decode overlaps acoustic decoding with language-model work
+# and measured ~20% faster on an 8-thread CPU run. Set to 0 to disable.
+VIBEPOD_ASYNC_DECODE=1
+
+# CPU mode: thread tuning. On an 8-core / 16-thread Ryzen test system,
+# 8 worker threads with 1 inter-op thread gave the best wall time, while 12
+# worker threads oversubscribed the CPU and regressed.
+# VIBEPOD_CPU_THREADS=8
+# VIBEPOD_CPU_INTEROP_THREADS=1
+
+# CPU mode: playback buffering. CPU generation is slower than realtime, so
+# smooth streaming needs a larger initial buffer than CUDA. Lower these for
+# faster startup if you are OK with occasional rebuffering.
+# VIBEPOD_PREBUFFER_SECS=24
+# VIBEPOD_REBUFFER_THRESHOLD_SECS=2
+# VIBEPOD_RESUME_THRESHOLD_SECS=12
+
+# CPU mode: dynamic INT8 quantization is enabled by default in start.sh.
+# Set to 0 if you are comparing quality/performance or debugging.
+# VIBEPOD_QUANTIZE=1
+
+# CUDA mode: dtype and attention selection. Defaults are bf16 + SDPA unless
+# optional FlashAttention is explicitly enabled and importable.
+# VIBEPOD_CUDA_DTYPE=bf16
+# VIBEPOD_ATTN_IMPL=sdpa
+# VIBEPOD_ENABLE_FLASH_ATTN=0
+
+# Debug/profiling. Keep disabled for benchmark timing; async CPU profiling
+# double-counts overlapped decode work.
+# VIBEPOD_PROFILE_GENERATION=0
diff --git a/server/start.sh b/server/start.sh
index 995fbc1..060cde5 100755
--- a/server/start.sh
+++ b/server/start.sh
@@ -136,7 +136,7 @@ if $CPU_MODE; then
     export VIBEPOD_DEVICE="cpu"
     export UV_PROJECT_ENVIRONMENT=".venv-cpu"
     if [[ -z "${VIBEPOD_CPU_THREADS:-}" ]]; then
-        VIBEPOD_CPU_THREADS="$(uv run --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
+        VIBEPOD_CPU_THREADS="$(uv run --no-sync --no-sources python -c "import os; print(max(1, (os.cpu_count() or 2) // 2))")"
         export VIBEPOD_CPU_THREADS
     fi
     export OMP_NUM_THREADS="${OMP_NUM_THREADS:-$VIBEPOD_CPU_THREADS}"
diff --git a/server/vibevoice_server.py b/server/vibevoice_server.py
index 2516a59..74fbd94 100644
--- a/server/vibevoice_server.py
+++ b/server/vibevoice_server.py
@@ -66,12 +66,10 @@ DEFAULT_SPEAKER = "carter"
 
 _IGNORE_PATTERNS = ["*.msgpack", "flax_model*", "tf_model*", "rust_model*", "*.ot"]
 
-# ── Pipeline executors ─────────────────────────────────────────────────────────
-# _decode_executor: overlaps acoustic_decode with forward_tts_lm (1 worker).
-# _cfg_executor:    runs positive + negative forward_tts_lm in parallel (1 worker).
+# ── Pipeline executor ──────────────────────────────────────────────────────────
+# Overlaps acoustic_decode with forward_tts_lm on a background thread (1 worker).
 
 _decode_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
-_cfg_executor: Optional[concurrent.futures.ThreadPoolExecutor] = None
 
 # ── Device selection ────────────────────────────────────────────────────────────
 # VIBEPOD_DEVICE env var is set by start.sh based on the --cpu / --cuda flag.
@@ -546,38 +544,37 @@ def _install_generation_optimizations(model: object) -> None:
 
 
 def _install_cpu_pipeline_optimizations(model: object) -> None:
-    """Attach pipeline executors to the model for the optimised generate() loop.
+    """Attach the decode executor to the model for the optimised generate() loop.
 
     The JezzWTF/VibeVoice fork's generate() checks for two optional attributes:
 
-      model._vibepod_decode_executor  — ThreadPoolExecutor (1 worker) used to
-          overlap acoustic_decode with acoustic_connector + forward_tts_lm.
+      model._vibepod_decode_executor  — ThreadPoolExecutor (1 worker) that
+          overlaps acoustic_decode with acoustic_connector + forward_tts_lm.
+          Profiling showed this hides ~72s of decode cost behind tts_lm work,
+          capturing ~96% of the theoretical overlap savings.
 
-      model._vibepod_cfg_executor     — ThreadPoolExecutor (1 worker) used to
-          run the positive and negative forward_tts_lm calls in parallel, so
-          both CFG passes execute concurrently instead of sequentially.
+      model._vibepod_cfg_executor     — intentionally NOT set. Parallel pos/neg
+          forward_tts_lm via a second thread causes MKL OpenMP thread-pool
+          contention on CPU: both threads compete for the same OMP worker pool,
+          making each call slower rather than faster. Net effect: ~6% regression.
+          The hook remains in the fork for potential GPU or future use.
 
-    Both are None by default, making the fork's generate() behave identically
-    to upstream on CUDA or any machine where these aren't set.
+    Both attributes default to None when unset, so the fork's generate() falls
+    back to the original sequential behaviour on CUDA or any non-VibePod install.
     """
-    global _decode_executor, _cfg_executor
+    global _decode_executor
 
     if os.environ.get("VIBEPOD_ASYNC_DECODE", "1") != "1":
-        logger.info("CPU async decode/CFG parallelism disabled via VIBEPOD_ASYNC_DECODE=0.")
+        logger.info("CPU async decode disabled via VIBEPOD_ASYNC_DECODE=0.")
         return
 
     _decode_executor = concurrent.futures.ThreadPoolExecutor(
         max_workers=1, thread_name_prefix="vibepod-decode"
     )
-    _cfg_executor = concurrent.futures.ThreadPoolExecutor(
-        max_workers=1, thread_name_prefix="vibepod-cfg"
-    )
     model._vibepod_decode_executor = _decode_executor
-    model._vibepod_cfg_executor = _cfg_executor
     logger.info(
-        "CPU pipeline: decode executor and CFG executor attached — "
-        "acoustic_decode overlaps tts_lm, pos/neg CFG runs in parallel. "
-        "Disable with VIBEPOD_ASYNC_DECODE=0."
+        "CPU pipeline: decode executor attached — acoustic_decode overlaps "
+        "tts_lm. Disable with VIBEPOD_ASYNC_DECODE=0."
     )
 
 
@@ -643,9 +640,9 @@ def _load_model_sync() -> None:
             is_cpu = _device == "cpu"
             _config["device"] = _device
             _config["chunk_accum"] = _env_int("VIBEPOD_CHUNK_ACCUM", 4 if is_cpu else 1)
-            _config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 6.0 if is_cpu else 5.0)
-            _config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 1.5 if is_cpu else 1.0)
-            _config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 4.0 if is_cpu else 3.0)
+            _config["prebuffer_secs"] = _env_float("VIBEPOD_PREBUFFER_SECS", 24.0 if is_cpu else 5.0)
+            _config["rebuffer_threshold_secs"] = _env_float("VIBEPOD_REBUFFER_THRESHOLD_SECS", 2.0 if is_cpu else 1.0)
+            _config["resume_threshold_secs"] = _env_float("VIBEPOD_RESUME_THRESHOLD_SECS", 12.0 if is_cpu else 3.0)
             _config["default_inference_steps"] = _env_int("VIBEPOD_DEFAULT_INFERENCE_STEPS", 8 if is_cpu else 10)
             if is_cpu:
                 logical_cpus = os.cpu_count() or 1
@@ -688,8 +685,6 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
     yield
     if _decode_executor is not None:
         _decode_executor.shutdown(wait=False)
-    if _cfg_executor is not None:
-        _cfg_executor.shutdown(wait=False)
 
 
 app = FastAPI(title="VibePod TTS Server", version="0.1.0", lifespan=lifespan)
diff --git a/web/components/GenerationControls.tsx b/web/components/GenerationControls.tsx
index f9a7d4c..41bce72 100644
--- a/web/components/GenerationControls.tsx
+++ b/web/components/GenerationControls.tsx
@@ -157,7 +157,7 @@ export default function GenerationControls({
       <div className="flex flex-col gap-2">
         <div className="flex items-center justify-between">
           <label className="text-sm font-medium" style={{ color: "var(--foreground)" }}>
-            Quality vs Speed
+            Speed vs Quality
           </label>
           <span
             className="text-sm font-mono px-2 py-0.5 rounded"
@@ -221,7 +221,7 @@ export default function GenerationControls({
             <input
               type="range"
               min={0.5}
-              max={10.0}
+              max={30.0}
               step={0.5}
               value={prebufferSecs}
               onChange={(e) => onPrebufferSecsChange(parseFloat(e.target.value))}
@@ -271,7 +271,7 @@ export default function GenerationControls({
               id="resume-threshold"
               type="range"
               min={0.5}
-              max={5.0}
+              max={30.0}
               step={0.1}
               value={resumeThresholdSecs}
               onChange={(e) => {
diff --git a/web/hooks/useStreamingGeneration.ts b/web/hooks/useStreamingGeneration.ts
index 257f0c8..a8dbcbb 100644
--- a/web/hooks/useStreamingGeneration.ts
+++ b/web/hooks/useStreamingGeneration.ts
@@ -6,7 +6,7 @@ const SAMPLE_RATE = 24_000;
 const DEFAULT_PREBUFFER_SECS = 5.0;
 const DEFAULT_REBUFFER_THRESHOLD_SECS = 1.0;
 const DEFAULT_RESUME_THRESHOLD_SECS = 3.0;
-const MAX_ADAPTIVE_RESUME_SECS = 18.0;
+const MAX_ADAPTIVE_RESUME_SECS = 30.0;
 
 interface GenerateOptions {
   text: string;