# scribe/scraibe/localai_client.py

"""
LocalAI Client Module
---------------------

This module provides a client for communicating with a LocalAI server
running vibevoice.cpp for transcription and speaker diarization.

It replaces the previous local Whisper + Pyannote pipeline by sending
audio files to the /v1/audio/diarization endpoint and mapping the
response into the same Transcript format used by the UI.

Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
                     (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL:   (optional) Model name to use (default: vibevoice-diarize)
"""

import os
import io
import json
import logging
from typing import Dict, List, Any, Optional

import httpx

logger = logging.getLogger("scraibe.localai_client")


class LocalAIError(Exception):
    """
    Error type for failed exchanges with the LocalAI API.

    Raised when the LocalAI API returns an error or an unexpected response.
    """


class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.

    Responsibilities:
      - Read configuration from environment.
      - Upload audio file as multipart/form-data.
      - Parse diarization + transcription response (verbose_json).
      - Map response into the same structure expected by Scraibe's Transcript.

    The instance owns an ``httpx.Client``. Release it with :meth:`close`, or
    use the instance as a context manager (``with LocalAIClient(...) as c:``).
    """

    # Endpoint path, relative to the configured server base URL.
    _ENDPOINT = "/v1/audio/diarization"
    # Label used when a segment carries no speaker.
    _DEFAULT_SPEAKER = "SPEAKER_00"

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 3600.0,
    ):
        """
        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                     Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model:   Model name (e.g., vibevoice-diarize).
                     Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.

        Raises:
            LocalAIError: If no server URL is given and LOCALAI_API_URL is
                unset or blank.
        """
        # The trailing ``or ""`` keeps this a string when neither the argument
        # nor the environment supplies a URL, so the check below raises the
        # intended LocalAIError rather than AttributeError on ``None.strip()``.
        configured_url = api_url or os.getenv("LOCALAI_API_URL") or ""
        self.api_url = configured_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout

        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. "
                "Provide the LocalAI server URL via environment or constructor."
            )

        logger.info(
            "Initializing LocalAIClient: url=%s model=%s",
            self.api_url,
            self.model,
        )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def __enter__(self) -> "LocalAIClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # ``_client`` is missing when __init__ raised before creating it.
        client = getattr(self, "_client", None)
        if client is None:
            return
        try:
            client.close()
        except Exception:
            # Best-effort cleanup: a finalizer must never raise.
            pass

    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        return_raw: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return:
        - A normalized dict with segments, speakers, transcripts.
        - Optionally, the raw verbose_json response (for JSON export).

        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                             Defaults to "verbose_json". The response body is
                             always decoded as JSON, so a non-JSON reply ends
                             in LocalAIError.
                             NOTE(review): confirm whether "rttm" yields JSON.
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
            return_raw: If True, also return the raw API response in 'raw_result'.
            **_ignored: Extra keyword arguments are accepted and discarded.

        Returns:
            Dict with "segments", "speakers" and "transcripts" lists, plus
            "raw_result" when ``return_raw`` is True.

        Raises:
            LocalAIError: If the audio file does not exist, the server replies
                with a status >= 400, or the body is not usable JSON.

        Exceptions raised by the HTTP client while sending the request are not
        caught here and propagate unchanged.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        logger.info("diarize_and_transcribe requested for: %s", audio_path)

        data = self._build_form_data(
            # Always use verbose_json for diarization + speaker info
            response_format="verbose_json" if response_format is None else response_format,
            include_text=True if include_text is None else include_text,
            language=language,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers,
            clustering_threshold=clustering_threshold,
            min_duration_on=min_duration_on,
            min_duration_off=min_duration_off,
        )
        logger.debug("LocalAI request params: %s", data)

        if not os.path.exists(audio_path):
            raise LocalAIError(f"Audio file not found: {audio_path}")

        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # The file handle must stay open until the upload has completed.
        with open(audio_path, "rb") as f:
            files = {
                "file": (os.path.basename(audio_path), f, "application/octet-stream")
            }

            # POST /v1/audio/diarization
            logger.info("Sending request to LocalAI: /v1/audio/diarization")
            resp = self._client.post(
                self._ENDPOINT,
                data=data,
                files=files,
                headers=headers,
            )

        logger.info("LocalAI response status: %d", resp.status_code)

        if resp.status_code >= 400:
            body = resp.text
            logger.error("LocalAI error response: %s", body)
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )

        try:
            raw_result = resp.json()
        except ValueError as err:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError, so malformed and undecodable bodies are both covered.
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            ) from err

        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")

        parsed = self._parse_diarization_response(raw_result)

        if return_raw:
            parsed["raw_result"] = raw_result

        return parsed

    def _build_form_data(
        self,
        *,
        response_format: str,
        include_text: bool,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
    ) -> Dict[str, Any]:
        """Build the multipart form fields, leaving out options that are None."""
        data: Dict[str, Any] = {
            "model": self.model,
            "response_format": response_format,
            "include_text": str(include_text).lower(),
        }

        # The language hint is forwarded as given; tuning values are stringified.
        if language is not None:
            data["language"] = language

        tuning = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
            "clustering_threshold": clustering_threshold,
            "min_duration_on": min_duration_on,
            "min_duration_off": min_duration_off,
        }
        data.update(
            {name: str(value) for name, value in tuning.items() if value is not None}
        )
        return data

    @staticmethod
    def _normalize_segment(seg: Dict[str, Any]) -> tuple:
        """
        Turn one response segment into ``([start, end], speaker, text)``.

        Missing keys and explicit nulls both fall back to defaults: 0.0 for
        the times, "SPEAKER_00" for the speaker, "" for the text.

        Raises:
            LocalAIError: If the segment is not a JSON object.
        """
        if not isinstance(seg, dict):
            raise LocalAIError(
                f"Unexpected segment in LocalAI response: {type(seg).__name__}"
            )

        start = seg.get("start")
        end = seg.get("end")
        speaker = seg.get("speaker")
        text = seg.get("text")

        bounds = [
            0.0 if start is None else float(start),
            0.0 if end is None else float(end),
        ]
        if speaker is None:
            speaker = LocalAIClient._DEFAULT_SPEAKER
        # ``text`` is null or absent when the server sends no transcript.
        cleaned = "" if text is None else str(text).strip()
        return bounds, speaker, cleaned

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI verbose_json response into the internal format used by Scraibe:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
          "transcripts": [ "text for segment", ... ]
        }

        Raises:
            LocalAIError: If the response or one of its segments is not a
                JSON object.
        """
        if not isinstance(result, dict):
            raise LocalAIError(
                f"Unexpected LocalAI response type: {type(result).__name__}"
            )

        segments = result.get("segments") or []

        if not segments:
            logger.warning("LocalAI returned no segments.")
            return {
                "segments": [],
                "speakers": [],
                "transcripts": [],
            }

        out_segments: List[Any] = []
        out_speakers: List[Any] = []
        out_transcripts: List[str] = []

        for seg in segments:
            bounds, speaker, text = self._normalize_segment(seg)
            out_segments.append(bounds)
            out_speakers.append(speaker)
            out_transcripts.append(text)

        logger.info(
            "Parsed %d segments from LocalAI.",
            len(out_segments),
        )

        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }