"""
LocalAI Client Module
---------------------

This module provides a client for communicating with a LocalAI server running
vibevoice.cpp for transcription and speaker diarization.

It replaces the previous local Whisper + Pyannote pipeline by sending audio
files to the /v1/audio/diarization endpoint and mapping the response into the
same Transcript format used by the UI.

Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
        (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL: (optional) Model name to use (default: vibevoice-diarize)
"""

import os
import io
import json
import logging
from typing import Dict, List, Any, Optional

import httpx

logger = logging.getLogger("scraibe.localai_client")


class LocalAIError(Exception):
    """Raised when the LocalAI API returns an error or unexpected response."""

    pass


class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.

    Responsibilities:
        - Read configuration from environment.
        - Upload audio file as multipart/form-data.
        - Parse diarization + transcription response.
        - Map response into the same structure expected by Scraibe's
          Transcript.

    The client owns an ``httpx.Client``; call :meth:`close` when done, or use
    the instance as a context manager (``with LocalAIClient() as client:``).
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 600.0,
    ):
        """
        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model: Model name (e.g., vibevoice-diarize).
                Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.

        Raises:
            LocalAIError: If no server URL is given and LOCALAI_API_URL is
                unset or blank.
        """
        # The trailing `or ""` keeps a missing URL from crashing with
        # AttributeError on None.strip(); the check below then reports it
        # as the intended LocalAIError.
        raw_url = api_url or os.getenv("LOCALAI_API_URL") or ""
        self.api_url = raw_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout

        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. \n"
                "Provide the LocalAI server URL via environment or constructor."
            )

        logger.info(
            "Initializing LocalAIClient: url=%s model=%s",
            self.api_url,
            self.model,
        )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def __enter__(self) -> "LocalAIClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # `_client` does not exist when __init__ raised before creating it.
        client = getattr(self, "_client", None)
        if client is None:
            return
        try:
            client.close()
        except Exception:
            # Deliberate best-effort: a finalizer must never raise.
            pass

    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return a dict in the
        same style as the previous internal diarization output:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Extra kwargs that the old UI used (e.g., whisper-specific) are
        accepted but ignored.

        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                Defaults to "verbose_json" if not set. The response body is
                always decoded as JSON, so a non-JSON body raises
                LocalAIError.
            include_text: Whether to request per-segment text.
                Defaults to True.
            verbose: If True, prints progress messages.

        Returns:
            Dict with parallel lists under "segments", "speakers" and
            "transcripts".

        Raises:
            LocalAIError: If the audio file does not exist, the server
                answers with a status >= 400, or the body is not valid JSON
                of the expected shape.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        logger.info("diarize_and_transcribe requested for: %s", audio_path)

        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
            include_text = True

        # Prepare form data; multipart form values are sent as strings.
        data = {
            "model": self.model,
            "response_format": response_format,
            "include_text": str(include_text).lower(),
        }
        if language is not None:
            data["language"] = language

        # Tuning knobs are only forwarded when the caller set them.
        optional_params = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
            "clustering_threshold": clustering_threshold,
            "min_duration_on": min_duration_on,
            "min_duration_off": min_duration_off,
        }
        for key, value in optional_params.items():
            if value is not None:
                data[key] = str(value)

        logger.debug("LocalAI request params: %s", data)

        if not os.path.exists(audio_path):
            raise LocalAIError(f"Audio file not found: {audio_path}")

        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # The POST must run inside the `with` block: httpx reads the file
        # object while sending, so the handle has to stay open until then.
        with open(audio_path, "rb") as audio_file:
            files = {
                "file": (
                    os.path.basename(audio_path),
                    audio_file,
                    "application/octet-stream",
                )
            }

            # POST /v1/audio/diarization
            logger.info("Sending request to LocalAI: /v1/audio/diarization")
            resp = self._client.post(
                "/v1/audio/diarization",
                data=data,
                files=files,
                headers=headers,
            )

        logger.info("LocalAI response status: %d", resp.status_code)

        if resp.status_code >= 400:
            body = resp.text
            logger.error("LocalAI error response: %s", body)
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )

        try:
            result = resp.json()
        except ValueError as err:
            # json.JSONDecodeError is a ValueError subclass; chain the cause
            # so the decoding failure stays visible in the traceback.
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            ) from err

        if verbose:
            print(
                "Diarization and transcription finished. \n"
                "Starting post-processing."
            )

        return self._parse_diarization_response(result)

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI response into the internal format used by Scraibe:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Missing or null fields fall back to defaults: 0.0 for "start" and
        "end", "SPEAKER_00" for "speaker", and "" for "text".

        Args:
            result: Decoded JSON body of the diarization response.

        Returns:
            Dict with three parallel lists; all empty when the response has
            no segments.

        Raises:
            LocalAIError: If the payload or one of its segments is not a
                JSON object, or a timestamp cannot be converted to float.
        """
        if not isinstance(result, dict):
            raise LocalAIError(
                f"Unexpected LocalAI response type: {type(result).__name__}"
            )

        # `or []` also covers an explicit `"segments": null`.
        segments = result.get("segments") or []
        if not segments:
            logger.warning("LocalAI returned no segments.")
            return {
                "segments": [],
                "speakers": [],
                "transcripts": [],
            }

        out_segments = []
        out_speakers = []
        out_transcripts = []

        for seg in segments:
            if not isinstance(seg, dict):
                raise LocalAIError(
                    f"Malformed segment in LocalAI response: {seg!r}"
                )

            raw_start = seg.get("start")
            raw_end = seg.get("end")
            try:
                # A JSON null must not reach float(), which rejects None.
                start = 0.0 if raw_start is None else float(raw_start)
                end = 0.0 if raw_end is None else float(raw_end)
            except (TypeError, ValueError) as err:
                raise LocalAIError(
                    f"Malformed segment in LocalAI response: {seg!r}"
                ) from err

            speaker = seg.get("speaker")
            if speaker is None:
                speaker = "SPEAKER_00"

            # "text" may be null, e.g. when per-segment text is not returned.
            raw_text = seg.get("text")
            text = "" if raw_text is None else str(raw_text).strip()

            out_segments.append([start, end])
            out_speakers.append(speaker)
            out_transcripts.append(text)

        logger.info(
            "Parsed %d segments from LocalAI.",
            len(out_segments),
        )

        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }