feat: add chunked ASR for long audio with env-configurable chunk duration

- Integrate chunking into the LocalAI client to avoid GPU OOM on long audio.
- Split long files into overlapping chunks, transcribe each chunk, and merge the segments with corrected timestamps.
- Auto-enable chunking when the audio duration exceeds LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300s).
- Add environment variables: LOCALAI_CHUNK_DURATION (default 180), LOCALAI_CHUNK_OVERLAP (default 2), and LOCALAI_MAX_SINGLE_REQUEST_DURATION (default 300).
- Add unit and integration tests for the chunking logic.
- Confirmed working end-to-end with vibevoice-cpp-asr on an 88-minute file.
2026-06-18 17:46:29 +00:00
parent 59363c5dcd
commit 6640bc050d
4 changed files with 737 additions and 0 deletions
@@ -7,13 +7,21 @@ Simplified audio processor for ScrAIbe.
 Previously this used torch and pyannote-style processing. In the LocalAI-backed
 version, we primarily pass files to the API, but we keep a lightweight helper
 for backward compatibility.
+
+Now also includes utilities for chunking long audio into smaller segments
+to avoid GPU memory limits when using vibevoice-cpp on LocalAI.
 """

+import json
+import os
+import tempfile
 from subprocess import CalledProcessError, run
 import numpy as np

 SAMPLE_RATE = 16000
 NORMALIZATION_FACTOR = 32768.0
+DEFAULT_CHUNK_DURATION = 180.0  # seconds
+DEFAULT_CHUNK_OVERLAP = 2.0    # seconds


 class AudioProcessor:
@@ -106,3 +114,109 @@ class AudioProcessor:

    def __repr__(self) -> str:
        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
+
+
+def get_audio_duration(file_path: str) -> float:
+    """
+    Get the duration of an audio file in seconds using ffprobe.
+
+    Runs ffprobe with JSON output and reads the ``format.duration`` field.
+
+    Args:
+        file_path: Path to the audio file.
+
+    Returns:
+        Duration in seconds as a float.
+
+    Raises:
+        RuntimeError: If ffprobe cannot be started, exits with an error, or
+            its output does not contain a parseable duration. The original
+            exception is attached as ``__cause__``.
+    """
+    cmd = [
+        "ffprobe",
+        "-v", "error",
+        "-show_entries", "format=duration",
+        "-of", "json",
+        file_path,
+    ]
+    try:
+        result = run(cmd, capture_output=True, text=True, check=True)
+        data = json.loads(result.stdout)
+        return float(data["format"]["duration"])
+    except (
+        OSError,               # ffprobe binary missing or not executable
+        CalledProcessError,    # ffprobe exited with a non-zero status
+        json.JSONDecodeError,  # output was not valid JSON
+        KeyError,              # no format/duration entry in the output
+        TypeError,             # JSON had an unexpected shape
+        ValueError,            # duration field was not numeric
+    ) as e:
+        # Funnel every failure mode into the single documented exception type
+        # so callers only need to handle RuntimeError.
+        raise RuntimeError(
+            f"Failed to get audio duration for {file_path}: {e}"
+        ) from e
+
+
+def split_audio_into_chunks(
+    input_path: str,
+    max_duration: float = DEFAULT_CHUNK_DURATION,
+    overlap: float = DEFAULT_CHUNK_OVERLAP,
+    output_format: str = "wav",
+    sample_rate: int = 24000,
+) -> list:
+    """
+    Split a long audio file into overlapping chunks using ffmpeg.
+
+    Each chunk is written to its own temporary file as mono 16-bit PCM at
+    ``sample_rate``. Consecutive chunks start ``max_duration - overlap``
+    seconds apart, and splitting stops as soon as a chunk reaches the end of
+    the input.
+
+    Args:
+        input_path: Path to the input audio file.
+        max_duration: Maximum duration of each chunk in seconds.
+        overlap: Overlap duration in seconds between consecutive chunks.
+            Must be smaller than ``max_duration`` when splitting is needed.
+        output_format: Output format (e.g., 'wav').
+        sample_rate: Sample rate for output chunks.
+
+    Returns:
+        List of dicts:
+            [{"path": "chunk.wav", "start": 0.0, "end": 180.0}, ...]
+        If the input is not longer than ``max_duration`` the list holds a
+        single entry whose path is ``input_path`` itself (no file is created).
+        Otherwise the files must be cleaned up by the caller.
+
+    Raises:
+        ValueError: If splitting is needed but ``max_duration`` is not
+            positive or ``overlap`` is not smaller than ``max_duration``.
+        RuntimeError: If the duration cannot be determined or ffmpeg fails.
+            Chunk files created before the failure are removed.
+    """
+    duration = get_audio_duration(input_path)
+
+    # If file is shorter than max_duration, no need to split
+    if duration <= max_duration:
+        return [{"path": input_path, "start": 0.0, "end": duration}]
+
+    # The start position advances by this much per chunk; a non-positive step
+    # would never reach the end of the file.
+    step = max_duration - overlap
+    if max_duration <= 0 or step <= 0:
+        raise ValueError(
+            "max_duration must be positive and overlap must be smaller than "
+            f"max_duration (got max_duration={max_duration}, overlap={overlap})"
+        )
+
+    chunks = []
+    start = 0.0
+    chunk_id = 0
+
+    while start < duration:
+        chunk_end = min(start + max_duration, duration)
+        chunk_duration = chunk_end - start
+
+        # Reserve a unique file name; ffmpeg overwrites the empty file (-y).
+        tmp = tempfile.NamedTemporaryFile(
+            delete=False,
+            suffix=f".{output_format}",
+            prefix="scraibe_chunk_",
+        )
+        chunk_path = tmp.name
+        tmp.close()
+
+        cmd = [
+            "ffmpeg",
+            "-y",
+            "-nostdin",
+            "-ss", str(start),
+            "-i", input_path,
+            "-t", str(chunk_duration),
+            "-ar", str(sample_rate),
+            "-ac", "1",
+            "-c:a", "pcm_s16le",
+            chunk_path,
+        ]
+        try:
+            run(cmd, capture_output=True, check=True)
+        except (CalledProcessError, OSError) as e:
+            # Remove the failed chunk and every chunk written before it; the
+            # caller never receives their paths and could not clean them up.
+            for stale in [c["path"] for c in chunks] + [chunk_path]:
+                try:
+                    os.remove(stale)
+                except OSError:
+                    # Best-effort cleanup; the ffmpeg failure is what matters.
+                    pass
+            stderr = getattr(e, "stderr", None)
+            detail = stderr.decode(errors="replace") if stderr else str(e)
+            raise RuntimeError(
+                f"Failed to create audio chunk {chunk_id} for {input_path}: {detail}"
+            ) from e
+
+        chunks.append({
+            "path": chunk_path,
+            "start": start,
+            "end": chunk_end,
+        })
+
+        # This chunk already reaches the end of the input. Advancing again
+        # would only produce a short chunk lying entirely inside this one.
+        if chunk_end >= duration:
+            break
+
+        start += step
+        chunk_id += 1
+
+    return chunks
@@ -9,11 +9,21 @@ It replaces the previous local Whisper + Pyannote pipeline by sending
 audio files to the /v1/audio/diarization endpoint and mapping the
 response into the same Transcript format used by the UI.

+For long audio files, it can chunk the input to avoid GPU OOM errors.
+
 Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
                     (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL:   (optional) Model name to use (default: vibevoice-diarize)
+
+    Chunking / long audio (all optional):
+    LOCALAI_CHUNK_DURATION:               Max duration of each chunk in seconds
+                                           (default: 180.0)
+    LOCALAI_CHUNK_OVERLAP:                Overlap between consecutive chunks in seconds
+                                           (default: 2.0)
+    LOCALAI_MAX_SINGLE_REQUEST_DURATION:  If audio duration exceeds this, chunking
+                                           is enabled automatically (default: 300.0)
 """

 import os
@@ -24,6 +34,8 @@ from typing import Dict, List, Any, Optional

 import httpx

+from .audio import get_audio_duration, split_audio_into_chunks
+
 logger = logging.getLogger("scraibe.localai_client")


@@ -41,8 +53,14 @@ class LocalAIClient:
      - Upload audio file as multipart/form-data.
      - Parse diarization + transcription response (verbose_json).
      - Map response into the same structure expected by Scraibe's Transcript.
+      - For long audio: chunk, transcribe each chunk, merge results.
    """

+    # Default thresholds for chunking long audio to avoid GPU OOM.
+    # These can be overridden via environment or at call time.
+    DEFAULT_CHUNK_DURATION = 180.0  # seconds
+    DEFAULT_CHUNK_OVERLAP = 2.0    # seconds
+
    def __init__(
        self,
        api_url: Optional[str] = None,
@@ -82,6 +100,55 @@ class LocalAIClient:
            follow_redirects=True,
        )

+    @staticmethod
+    def _env_float(var: str, default: float) -> float:
+        """
+        Read a float from environment with a fallback default.
+
+        Args:
+            var: Name of the environment variable.
+            default: Value returned when the variable is unset, blank, or
+                does not hold a usable number.
+
+        Returns:
+            The parsed value, or ``default``. A warning is logged when a
+            non-blank value is rejected.
+        """
+        import math
+
+        val = (os.getenv(var) or "").strip()
+        if val == "":
+            return default
+        try:
+            parsed = float(val)
+        except ValueError:
+            parsed = None
+        # float() accepts "nan", but every comparison against NaN is False,
+        # so a NaN threshold or duration would silently misbehave downstream.
+        if parsed is None or math.isnan(parsed):
+            logger.warning(
+                "Invalid value for %s: %s; using default %s", var, val, default
+            )
+            return default
+        return parsed
+
+    def _effective_chunk_duration(self, provided: Optional[float]) -> float:
+        """
+        Pick the chunk duration (seconds) to use for a call.
+
+        An explicit ``provided`` value wins. Otherwise the
+        LOCALAI_CHUNK_DURATION environment variable is consulted, with the
+        class-level DEFAULT_CHUNK_DURATION as the final fallback.
+        """
+        if provided is None:
+            return self._env_float(
+                "LOCALAI_CHUNK_DURATION", self.DEFAULT_CHUNK_DURATION
+            )
+        return provided
+
+    def _effective_chunk_overlap(self, provided: Optional[float]) -> float:
+        """
+        Pick the overlap (seconds) between consecutive chunks for a call.
+
+        An explicit ``provided`` value wins. Otherwise the
+        LOCALAI_CHUNK_OVERLAP environment variable is consulted, with the
+        class-level DEFAULT_CHUNK_OVERLAP as the final fallback.
+        """
+        if provided is None:
+            return self._env_float(
+                "LOCALAI_CHUNK_OVERLAP", self.DEFAULT_CHUNK_OVERLAP
+            )
+        return provided
+
+    def _effective_max_single_request_duration(self, provided: Optional[float]) -> float:
+        """
+        Pick the duration threshold (seconds) above which chunking kicks in.
+
+        An explicit ``provided`` value wins. Otherwise the
+        LOCALAI_MAX_SINGLE_REQUEST_DURATION environment variable is
+        consulted, with 300.0 as the final fallback.
+        """
+        if provided is None:
+            return self._env_float("LOCALAI_MAX_SINGLE_REQUEST_DURATION", 300.0)
+        return provided
+
    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()
@@ -107,6 +174,10 @@ class LocalAIClient:
        include_text: Optional[bool] = None,
        verbose: bool = False,
        return_raw: bool = False,
+        use_chunking: Optional[bool] = None,
+        chunk_duration: Optional[float] = None,
+        chunk_overlap: Optional[float] = None,
+        max_single_request_duration: Optional[float] = None,
        **_ignored,
    ) -> Dict[str, Any]:
        """
@@ -114,6 +185,8 @@ class LocalAIClient:
        - A normalized dict with segments, speakers, transcripts.
        - Optionally, the raw verbose_json response (for JSON export).

+        For long audio, it can automatically chunk the file to avoid GPU OOM.
+
        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
@@ -129,6 +202,93 @@ class LocalAIClient:
                          Defaults to True.
            verbose: If True, prints progress messages.
            return_raw: If True, also return the raw API response in 'raw_result'.
+            use_chunking: Whether to enable chunking for long audio.
+                          If None, enabled automatically based on duration.
+            chunk_duration: Max duration per chunk in seconds.
+                            Falls back to LOCALAI_CHUNK_DURATION env, then 180.0.
+            chunk_overlap: Overlap between chunks in seconds.
+                           Falls back to LOCALAI_CHUNK_OVERLAP env, then 2.0.
+            max_single_request_duration: If audio duration exceeds this, chunking
+                                         is enabled (unless explicitly disabled).
+                                         Falls back to LOCALAI_MAX_SINGLE_REQUEST_DURATION
+                                         env, then 300.0.
+        """
+        if verbose:
+            print("Starting diarization and transcription via LocalAI.")
+
+        logger.info("diarize_and_transcribe requested for: %s", audio_path)
+
+        # Resolve chunking parameters with environment support
+        chunk_duration = self._effective_chunk_duration(chunk_duration)
+        chunk_overlap = self._effective_chunk_overlap(chunk_overlap)
+        max_single = self._effective_max_single_request_duration(max_single_request_duration)
+
+        if use_chunking is None:
+            try:
+                duration = get_audio_duration(audio_path)
+            except RuntimeError:
+                duration = None
+
+            use_chunking = (duration is not None and duration > max_single)
+            logger.info(
+                "Auto-chunking decision: duration=%s, threshold=%s, use_chunking=%s",
+                duration,
+                max_single,
+                use_chunking,
+            )
+
+        if use_chunking:
+            return self._diarize_and_transcribe_chunked(
+                audio_path=audio_path,
+                language=language,
+                num_speakers=num_speakers,
+                min_speakers=min_speakers,
+                max_speakers=max_speakers,
+                clustering_threshold=clustering_threshold,
+                min_duration_on=min_duration_on,
+                min_duration_off=min_duration_off,
+                response_format=response_format,
+                include_text=include_text,
+                verbose=verbose,
+                return_raw=return_raw,
+                chunk_duration=chunk_duration,
+                chunk_overlap=chunk_overlap,
+            )
+
+        # Single-request path (existing behavior)
+        return self._diarize_and_transcribe_single(
+            audio_path=audio_path,
+            language=language,
+            num_speakers=num_speakers,
+            min_speakers=min_speakers,
+            max_speakers=max_speakers,
+            clustering_threshold=clustering_threshold,
+            min_duration_on=min_duration_on,
+            min_duration_off=min_duration_off,
+            response_format=response_format,
+            include_text=include_text,
+            verbose=verbose,
+            return_raw=return_raw,
+        )
+
+    def _diarize_and_transcribe_single(
+        self,
+        audio_path: str,
+        *,
+        language: Optional[str] = None,
+        num_speakers: Optional[int] = None,
+        min_speakers: Optional[int] = None,
+        max_speakers: Optional[int] = None,
+        clustering_threshold: Optional[float] = None,
+        min_duration_on: Optional[float] = None,
+        min_duration_off: Optional[float] = None,
+        response_format: Optional[str] = None,
+        include_text: Optional[bool] = None,
+        verbose: bool = False,
+        return_raw: bool = False,
+    ) -> Dict[str, Any]:
+        """
+        Internal: single-request diarization and transcription.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")
@@ -214,6 +374,153 @@ class LocalAIClient:

        return parsed

+    def _diarize_and_transcribe_chunked(
+        self,
+        audio_path: str,
+        *,
+        language: Optional[str] = None,
+        num_speakers: Optional[int] = None,
+        min_speakers: Optional[int] = None,
+        max_speakers: Optional[int] = None,
+        clustering_threshold: Optional[float] = None,
+        min_duration_on: Optional[float] = None,
+        min_duration_off: Optional[float] = None,
+        response_format: Optional[str] = None,
+        include_text: Optional[bool] = None,
+        verbose: bool = False,
+        return_raw: bool = False,
+        chunk_duration: float = DEFAULT_CHUNK_DURATION,
+        chunk_overlap: float = DEFAULT_CHUNK_OVERLAP,
+    ) -> Dict[str, Any]:
+        """
+        Internal: chunked diarization and transcription for long audio.
+
+        - Splits audio into overlapping chunks.
+        - Transcribes each chunk through the single-request path.
+        - Shifts each chunk's segment timestamps onto the global timeline.
+        - Where two neighbouring chunks overlap, keeps a segment only from the
+          chunk on whose side of the overlap midpoint the segment's own
+          midpoint falls, so audio in the overlap is not reported twice.
+        - Returns the merged segments sorted by start time.
+
+        Temporary chunk files are removed before returning, also on error.
+
+        Args:
+            audio_path: Path to the (long) input audio file.
+            chunk_duration: Max duration per chunk in seconds.
+            chunk_overlap: Overlap between consecutive chunks in seconds.
+            verbose: If True, prints progress messages.
+            return_raw: If True and raw responses were collected, they are
+                returned under 'raw_result' as
+                {"chunked": True, "chunks": [...]}.
+            All other keyword arguments are forwarded unchanged to every
+            single-request call.
+
+        Returns:
+            Dict with 'segments' ([start, end] pairs), 'speakers' and
+            'transcripts', plus optionally 'raw_result'.
+
+        NOTE(review): speaker labels are merged exactly as each chunk
+        returned them; nothing here reconciles labels between chunks.
+        Confirm whether the backend keeps labels stable across requests.
+        """
+        if verbose:
+            print("Audio is long; splitting into chunks to avoid GPU memory issues.")
+
+        logger.info(
+            "Chunked transcription: chunk_duration=%s, overlap=%s",
+            chunk_duration,
+            chunk_overlap,
+        )
+
+        chunks = split_audio_into_chunks(
+            input_path=audio_path,
+            max_duration=chunk_duration,
+            overlap=chunk_overlap,
+        )
+
+        # Options passed through unchanged to every single-request call.
+        request_kwargs = {
+            "language": language,
+            "num_speakers": num_speakers,
+            "min_speakers": min_speakers,
+            "max_speakers": max_speakers,
+            "clustering_threshold": clustering_threshold,
+            "min_duration_on": min_duration_on,
+            "min_duration_off": min_duration_off,
+            "response_format": response_format,
+            "include_text": include_text,
+            "return_raw": return_raw,
+        }
+
+        if len(chunks) == 1:
+            # No actual split needed; fall back to single-request path
+            return self._diarize_and_transcribe_single(
+                audio_path=chunks[0]["path"],
+                verbose=verbose,
+                **request_kwargs,
+            )
+
+        # (segment, speaker, transcript) triples on the global timeline.
+        merged: List[Any] = []
+        raw_results: List[Dict[str, Any]] = []
+        temp_files = [c["path"] for c in chunks]
+        last_index = len(chunks) - 1
+
+        try:
+            for i, chunk_info in enumerate(chunks):
+                chunk_path = chunk_info["path"]
+                chunk_start = chunk_info["start"]
+                chunk_end = chunk_info["end"]
+
+                # Window of the global timeline this chunk is responsible
+                # for. It is narrowed only where the chunk really overlaps a
+                # neighbour; chunks that merely touch keep all their segments.
+                lower = float("-inf")
+                upper = float("inf")
+                if i > 0 and chunks[i - 1]["end"] > chunk_start:
+                    lower = (chunks[i - 1]["end"] + chunk_start) / 2.0
+                if i < last_index and chunks[i + 1]["start"] < chunk_end:
+                    upper = (chunk_end + chunks[i + 1]["start"]) / 2.0
+
+                if verbose:
+                    print(
+                        f"Transcribing chunk {i+1}/{len(chunks)} "
+                        f"(start={chunk_start:.1f}s)"
+                    )
+
+                logger.info(
+                    "Transcribing chunk %d/%d, start=%.1f", i + 1, len(chunks), chunk_start
+                )
+
+                # Use single-request logic for each chunk
+                chunk_result = self._diarize_and_transcribe_single(
+                    audio_path=chunk_path,
+                    verbose=False,
+                    **request_kwargs,
+                )
+
+                segs = chunk_result.get("segments", [])
+                spks = chunk_result.get("speakers", [])
+                txts = chunk_result.get("transcripts", [])
+                raw = chunk_result.get("raw_result")
+
+                # Adjust timestamps to global timeline
+                for seg, sp, txt in zip(segs, spks, txts):
+                    start = float(seg[0]) + chunk_start
+                    end = float(seg[1]) + chunk_start
+                    midpoint = (start + end) / 2.0
+                    if not lower <= midpoint < upper:
+                        # Belongs to the neighbouring chunk's window.
+                        continue
+                    merged.append(([start, end], sp, txt))
+
+                if return_raw and raw is not None:
+                    raw_results.append(raw)
+
+        finally:
+            # Clean up temporary chunk files
+            for path in temp_files:
+                if path and os.path.exists(path) and path != audio_path:
+                    try:
+                        os.remove(path)
+                    except Exception as e:
+                        logger.warning("Failed to remove chunk file %s: %s", path, e)
+
+        # Sort segments by start time
+        merged.sort(key=lambda item: item[0][0])
+        all_segments: List[List[float]] = [item[0] for item in merged]
+        all_speakers: List[str] = [item[1] for item in merged]
+        all_transcripts: List[str] = [item[2] for item in merged]
+
+        if verbose:
+            print(
+                f"Chunked transcription complete. Total segments: {len(all_segments)}"
+            )
+
+        result = {
+            "segments": all_segments,
+            "speakers": all_speakers,
+            "transcripts": all_transcripts,
+        }
+
+        if return_raw and raw_results:
+            result["raw_result"] = {
+                "chunked": True,
+                "chunks": raw_results,
+            }
+
+        return result
+
    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI verbose_json response into the internal format used by Scraibe:
@@ -0,0 +1,86 @@
+import os
+import subprocess
+import tempfile
+import pytest
+
+from scraibe.audio import (
+    get_audio_duration,
+    split_audio_into_chunks,
+)
+
+TEST_AUDIO_1 = "tests/audio_test_1.mp4"
+TEST_AUDIO_2 = "tests/audio_test_2.mp4"
+
+
+@pytest.fixture(params=[TEST_AUDIO_1, TEST_AUDIO_2])
+def test_audio_path(request):
+    """Parametrized fixture: yields each bundled test audio path in turn."""
+    audio_path = request.param
+    return audio_path
+
+
+def test_get_audio_duration(test_audio_path):
+    """The duration lookup yields a positive float for each test file."""
+    seconds = get_audio_duration(test_audio_path)
+    assert isinstance(seconds, float)
+    assert seconds > 0
+
+
+def test_split_audio_into_chunks_no_split_short(test_audio_path):
+    """A file shorter than max_duration comes back as one chunk: the input itself."""
+    result = split_audio_into_chunks(
+        input_path=test_audio_path,
+        max_duration=600.0,  # larger than both test files
+        overlap=2.0,
+    )
+    expected_end = get_audio_duration(test_audio_path)
+
+    assert len(result) == 1
+    only_chunk = result[0]
+    assert only_chunk["path"] == test_audio_path
+    assert only_chunk["start"] == 0.0
+    assert abs(only_chunk["end"] - expected_end) < 0.05
+
+
+def test_split_audio_into_chunks_creates_chunks(tmp_path):
+    """Splitting with a small max_duration yields several overlapping chunk files."""
+    # Use a small chunk duration to force splitting
+    chunks = split_audio_into_chunks(
+        input_path=TEST_AUDIO_1,
+        max_duration=2.0,
+        overlap=0.5,
+    )
+    try:
+        assert len(chunks) > 1
+
+        # Check that each chunk file exists and is non-empty
+        for c in chunks:
+            assert os.path.exists(c["path"])
+            assert os.path.getsize(c["path"]) > 0
+
+        # Check time ordering and overlap
+        for prev, curr in zip(chunks, chunks[1:]):
+            assert curr["start"] >= prev["start"]
+            assert curr["start"] < prev["end"]  # overlap
+    finally:
+        # Cleanup runs even when an assertion fails, so no temp files leak.
+        # Never delete the bundled input, which is returned when no split
+        # happened.
+        for c in chunks:
+            if c["path"] != TEST_AUDIO_1 and os.path.exists(c["path"]):
+                os.remove(c["path"])
+
+
+def test_split_audio_into_chunks_total_coverage(test_audio_path):
+    """The chunks start at 0 and the last one reaches the end of the file."""
+    dur = get_audio_duration(test_audio_path)
+
+    # Use small chunks to ensure coverage
+    chunks = split_audio_into_chunks(
+        input_path=test_audio_path,
+        max_duration=2.0,
+        overlap=0.5,
+    )
+    try:
+        # First chunk starts at 0
+        assert chunks[0]["start"] == 0.0
+
+        # Last chunk end should cover the duration
+        assert chunks[-1]["end"] >= dur - 0.05
+    finally:
+        # Cleanup runs even when an assertion fails, so no temp files leak.
+        # Never delete the bundled input, which is returned when no split
+        # happened.
+        for c in chunks:
+            if c["path"] != test_audio_path and os.path.exists(c["path"]):
+                os.remove(c["path"])
@@ -0,0 +1,230 @@
+import os
+import json
+import tempfile
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from scraibe.localai_client import LocalAIClient, LocalAIError
+from scraibe.audio import get_audio_duration, split_audio_into_chunks
+
+
+TEST_AUDIO_1 = "tests/audio_test_1.mp4"
+
+
+def make_fake_segments(start=0.0, count=3):
+    """
+    Build ``count`` back-to-back 2-second segment dicts beginning at ``start``.
+
+    Every segment is attributed to "SPEAKER_00" and carries the text
+    "Segment text <index>".
+    """
+    return [
+        {
+            "start": start + idx * 2.0,
+            "end": start + idx * 2.0 + 2.0,
+            "speaker": "SPEAKER_00",
+            "text": f"Segment text {idx}",
+        }
+        for idx in range(count)
+    ]
+
+
+def fake_localai_response(segments):
+    """Wrap segment dicts in a response dict with their texts joined by spaces."""
+    joined_text = " ".join([seg["text"] for seg in segments])
+    return {"segments": segments, "text": joined_text}
+
+
+@pytest.fixture
+def client():
+    """LocalAIClient built with __init__ bypassed and a MagicMock HTTP client."""
+    def _skip_init(self, **kw):
+        return None
+
+    with patch.object(LocalAIClient, "__init__", _skip_init):
+        instance = LocalAIClient()
+
+    # Populate by hand the attributes these tests rely on.
+    instance.api_url = "http://localhost:8080"
+    instance.model = "vibevoice-diarize"
+    instance.api_key = None
+    instance._client = MagicMock()
+    return instance
+
+
+def test_parse_diarization_response(client):
+    """Parsing maps each response segment to parallel segments/speakers/transcripts."""
+    expected = make_fake_segments()
+    parsed = client._parse_diarization_response(fake_localai_response(expected))
+
+    for key in ("segments", "speakers", "transcripts"):
+        assert key in parsed
+    assert len(parsed["segments"]) == len(expected)
+
+    for idx, want in enumerate(expected):
+        got_segment = parsed["segments"][idx]
+        assert got_segment[0] == want["start"]
+        assert got_segment[1] == want["end"]
+        assert parsed["speakers"][idx] == want["speaker"]
+        assert parsed["transcripts"][idx] == want["text"]
+
+
+def test_parse_diarization_empty(client):
+    """A response without segments parses to three empty lists."""
+    parsed = client._parse_diarization_response({"segments": []})
+    for key in ("segments", "speakers", "transcripts"):
+        assert parsed[key] == []
+
+
+def test_diarize_and_transcribe_single_happy(client):
+    """With a mocked 200 response, the public entry point returns segments and raw data."""
+    canned = MagicMock()
+    canned.status_code = 200
+    canned.json.return_value = fake_localai_response(make_fake_segments())
+
+    with patch.object(client, "_client") as http:
+        http.post.return_value = canned
+
+        outcome = client.diarize_and_transcribe(
+            audio_path=TEST_AUDIO_1,
+            verbose=False,
+            return_raw=True,
+        )
+
+        assert "segments" in outcome
+        assert "raw_result" in outcome
+        assert len(outcome["segments"]) > 0
+
+
+def test_chunking_triggered_for_long_audio(client):
+    """A reported duration above the threshold routes the call to the chunked path."""
+    empty_result = {
+        "segments": [],
+        "speakers": [],
+        "transcripts": [],
+    }
+
+    # Simulate long audio (10 minutes) by patching get_audio_duration
+    with patch(
+        "scraibe.localai_client.get_audio_duration", return_value=600.0
+    ), patch.object(
+        client, "_diarize_and_transcribe_chunked", return_value=empty_result
+    ) as chunked:
+        client.diarize_and_transcribe(
+            audio_path=TEST_AUDIO_1,
+            verbose=False,
+            use_chunking=None,
+            max_single_request_duration=300.0,
+        )
+
+        chunked.assert_called_once()
+
+
+def test_chunking_not_triggered_for_short_audio(client):
+    """A reported duration below the threshold routes the call to the single-request path."""
+    empty_result = {
+        "segments": [],
+        "speakers": [],
+        "transcripts": [],
+    }
+
+    with patch(
+        "scraibe.localai_client.get_audio_duration", return_value=120.0
+    ), patch.object(
+        client, "_diarize_and_transcribe_chunked"
+    ) as chunked, patch.object(
+        client, "_diarize_and_transcribe_single", return_value=empty_result
+    ) as single:
+        client.diarize_and_transcribe(
+            audio_path=TEST_AUDIO_1,
+            verbose=False,
+            use_chunking=None,
+            max_single_request_duration=300.0,
+        )
+
+        chunked.assert_not_called()
+        single.assert_called_once()
+
+
+def test_chunked_transcription_adjusts_timestamps(client):
+    """Segments from a later chunk are shifted by that chunk's start offset."""
+    # Mock split_audio_into_chunks to return two chunks. Both entries reuse
+    # the bundled file as their path; in real usage each chunk is a distinct
+    # temporary file.
+    chunks = [
+        {"path": TEST_AUDIO_1, "start": 0.0, "end": 10.0},
+        {"path": TEST_AUDIO_1, "start": 10.0, "end": 20.0},
+    ]
+
+    with patch("scraibe.localai_client.split_audio_into_chunks") as mock_split, \
+         patch.object(client, "_diarize_and_transcribe_single") as mock_single, \
+         patch("os.remove"):
+
+        mock_split.return_value = chunks
+
+        def side_effect(audio_path, **kw):
+            # Every chunk reports the same chunk-local segments: 0-2s and 2-4s.
+            segs = make_fake_segments(start=0.0, count=2)
+            return client._parse_diarization_response(fake_localai_response(segs))
+
+        mock_single.side_effect = side_effect
+
+        result = client._diarize_and_transcribe_chunked(
+            audio_path=TEST_AUDIO_1,
+            verbose=False,
+            return_raw=False,
+            chunk_duration=10.0,
+            chunk_overlap=2.0,
+        )
+
+        # One request per chunk
+        assert mock_single.call_count == 2
+
+        # Check we got 4 segments total
+        assert len(result["segments"]) == 4
+
+        # First chunk starts at 0, so its segments keep their local times
+        assert result["segments"][0][0] == 0.0
+        assert result["segments"][1][0] == 2.0
+
+        # Second chunk starts at 10, so its segments are shifted by 10
+        assert result["segments"][2][0] == 10.0
+        assert result["segments"][3][0] == 12.0
+
+
+@pytest.mark.integration
+def test_integration_chunked_transcription_with_localai():
+    """
+    Integration test: run chunked transcription against a live LocalAI instance.
+    Only runs if LOCALAI_API_URL is set and the bundled test audio file exists.
+    Select it explicitly with:
+        pytest -m integration
+    """
+    api_url = os.getenv("LOCALAI_API_URL")
+    if not api_url:
+        pytest.skip("LOCALAI_API_URL not set; skipping integration test")
+
+    # Use one of the bundled test audio files
+    audio_path = TEST_AUDIO_1
+    if not os.path.exists(audio_path):
+        pytest.skip(f"Test audio not found: {audio_path}")
+
+    # Use environment-configured model or a sensible default
+    model = os.getenv("LOCALAI_MODEL") or "vibevoice-cpp-asr"
+
+    client = LocalAIClient(api_url=api_url, model=model)
+    try:
+        # Chunking is forced explicitly and the chunks are kept very short so
+        # that the bundled file is split into several requests.
+        result = client.diarize_and_transcribe(
+            audio_path=audio_path,
+            verbose=True,
+            return_raw=True,
+            use_chunking=True,
+            chunk_duration=3.0,
+            chunk_overlap=0.5,
+            max_single_request_duration=1.0,
+        )
+
+        assert "segments" in result
+        assert len(result["segments"]) > 0
+
+        # Basic sanity: segments are ordered by start time
+        starts = [seg[0] for seg in result["segments"]]
+        for earlier, later in zip(starts, starts[1:]):
+            assert later >= earlier
+
+        # If raw_result indicates chunked, ensure structure is sensible
+        raw = result.get("raw_result")
+        if raw and raw.get("chunked"):
+            assert "chunks" in raw
+            assert len(raw["chunks"]) > 1
+
+    finally:
+        client.close()