Initial commit: LocalAI-backed ScrAIbe with summarization

2026-06-13 16:38:59 +00:00
parent 46d119b63b
commit 574124558b
10 changed files with 992 additions and 594 deletions
@@ -0,0 +1,237 @@
+"""
+LocalAI Client Module
+---------------------
+
+This module provides a client for communicating with a LocalAI server
+running vibevoice.cpp for transcription and speaker diarization.
+
+It replaces the previous local Whisper + Pyannote pipeline by sending
+audio files to the /v1/audio/diarization endpoint and mapping the
+response into the same Transcript format used by the UI.
+
+Environment Variables:
+    LOCALAI_API_URL: (required) Base URL of the LocalAI server
+                     (e.g., http://localhost:8080)
+    LOCALAI_API_KEY: (optional) API key, if configured
+    LOCALAI_MODEL:   (optional) Model name to use (default: vibevoice-diarize)
+"""
+
+import os
+import io
+import json
+from typing import Dict, List, Any, Optional
+
+import httpx
+
+
+class LocalAIError(Exception):
+    """Raised when the LocalAI API returns an error or unexpected response."""
+    pass
+
+
+class LocalAIClient:
+    """
+    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.
+
+    Responsibilities:
+      - Read configuration from environment.
+      - Upload audio file as multipart/form-data.
+      - Parse diarization + transcription response.
+      - Map response into the same structure expected by Scraibe's Transcript.
+    """
+
+    def __init__(
+        self,
+        api_url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        model: Optional[str] = None,
+        timeout: float = 600.0,
+    ):
+        """
+        Args:
+            api_url: LocalAI server URL (e.g., http://localhost:8080).
+                     Falls back to LOCALAI_API_URL env var.
+            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
+            model:   Model name (e.g., vibevoice-diarize).
+                     Falls back to LOCALAI_MODEL or default.
+            timeout: Request timeout in seconds.
+        """
+        self.api_url = (api_url or os.getenv("LOCALAI_API_URL")).strip().rstrip("/")
+        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
+        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
+        self.timeout = timeout
+
+        if not self.api_url:
+            raise LocalAIError(
+                "LOCALAI_API_URL is not set. "
+                "Provide the LocalAI server URL via environment or constructor."
+            )
+
+        self._client = httpx.Client(
+            base_url=self.api_url,
+            timeout=self.timeout,
+            follow_redirects=True,
+        )
+
+    def close(self):
+        """Close the underlying HTTP client."""
+        self._client.close()
+
+    def __del__(self):
+        try:
+            self._client.close()
+        except Exception:
+            pass
+
+    def diarize_and_transcribe(
+        self,
+        audio_path: str,
+        *,
+        language: Optional[str] = None,
+        num_speakers: Optional[int] = None,
+        min_speakers: Optional[int] = None,
+        max_speakers: Optional[int] = None,
+        clustering_threshold: Optional[float] = None,
+        min_duration_on: Optional[float] = None,
+        min_duration_off: Optional[float] = None,
+        response_format: Optional[str] = None,
+        include_text: Optional[bool] = None,
+        verbose: bool = False,
+        **_ignored,
+    ) -> Dict[str, Any]:
+        """
+        Send audio to LocalAI /v1/audio/diarization and return a dict
+        in the same style as the previous internal diarization output:
+
+        {
+          "segments": [ [start, end], ... ],
+          "speakers": [ "SPEAKER_00", ... ],
+          "transcripts": [ "text for segment", ... ]
+        }
+
+        Extra kwargs that the old UI used (e.g., whisper-specific) are
+        accepted but ignored.
+
+        Args:
+            audio_path: Path to the audio file.
+            language: Language hint, forwarded if set.
+            num_speakers: Optional exact speaker count.
+            min_speakers: Optional hint.
+            max_speakers: Optional hint.
+            clustering_threshold: Optional clustering threshold.
+            min_duration_on: Optional min segment duration.
+            min_duration_off: Optional min gap duration.
+            response_format: "json", "verbose_json", or "rttm".
+                             Defaults to "verbose_json" if not set.
+            include_text: Whether to request per-segment text.
+                          Defaults to True.
+            verbose: If True, prints progress messages.
+        """
+        if verbose:
+            print("Starting diarization and transcription via LocalAI.")
+
+        # Defaults: use verbose_json + include_text to get both diarization and transcription.
+        if response_format is None:
+            response_format = "verbose_json"
+        if include_text is None:
+            include_text = True
+
+        # Prepare form data
+        data = {
+            "model": self.model,
+            "response_format": response_format,
+            "include_text": str(include_text).lower(),
+        }
+
+        if language is not None:
+            data["language"] = language
+        if num_speakers is not None:
+            data["num_speakers"] = str(num_speakers)
+        if min_speakers is not None:
+            data["min_speakers"] = str(min_speakers)
+        if max_speakers is not None:
+            data["max_speakers"] = str(max_speakers)
+        if clustering_threshold is not None:
+            data["clustering_threshold"] = str(clustering_threshold)
+        if min_duration_on is not None:
+            data["min_duration_on"] = str(min_duration_on)
+        if min_duration_off is not None:
+            data["min_duration_off"] = str(min_duration_off)
+
+        # Open file
+        if not os.path.exists(audio_path):
+            raise LocalAIError(f"Audio file not found: {audio_path}")
+
+        with open(audio_path, "rb") as f:
+            files = {
+                "file": (os.path.basename(audio_path), f, "application/octet-stream")
+            }
+
+            headers = {}
+            if self.api_key:
+                headers["Authorization"] = f"Bearer {self.api_key}"
+
+            # POST /v1/audio/diarization
+            resp = self._client.post(
+                "/v1/audio/diarization",
+                data=data,
+                files=files,
+                headers=headers,
+            )
+
+        if resp.status_code >= 400:
+            body = resp.text
+            raise LocalAIError(
+                f"LocalAI request failed with status {resp.status_code}: {body}"
+            )
+
+        try:
+            result = resp.json()
+        except json.JSONDecodeError:
+            raise LocalAIError(
+                "Failed to parse LocalAI response as JSON."
+            )
+
+        if verbose:
+            print("Diarization and transcription finished. Starting post-processing.")
+
+        return self._parse_diarization_response(result)
+
+    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Convert LocalAI response into the internal format used by Scraibe:
+        {
+          "segments": [ [start, end], ... ],
+          "speakers": [ "SPEAKER_00", ... ],
+          "transcripts": [ "text for segment", ... ]
+        }
+        """
+        segments = result.get("segments", [])
+
+        if not segments:
+            # If no segments, return empty but valid structure
+            return {
+                "segments": [],
+                "speakers": [],
+                "transcripts": [],
+            }
+
+        out_segments = []
+        out_speakers = []
+        out_transcripts = []
+
+        for seg in segments:
+            start = float(seg.get("start", 0.0))
+            end = float(seg.get("end", 0.0))
+            speaker = seg.get("speaker", "SPEAKER_00")
+            text = seg.get("text", "").strip()
+
+            out_segments.append([start, end])
+            out_speakers.append(speaker)
+            out_transcripts.append(text)
+
+        return {
+            "segments": out_segments,
+            "speakers": out_speakers,
+            "transcripts": out_transcripts,
+        }