Files
scribe/scraibe/localai_client.py
T
admin 2ea46ada42
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled
Add structured logging for Docker; support LOG_LEVEL env and --log-level
2026-06-13 17:46:25 +00:00

260 lines
8.4 KiB
Python

"""
LocalAI Client Module
---------------------

This module provides a client for communicating with a LocalAI server
running vibevoice.cpp for transcription and speaker diarization.
It replaces the previous local Whisper + Pyannote pipeline by sending
audio files to the /v1/audio/diarization endpoint and mapping the
response into the same Transcript format used by the UI.

Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
        (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL: (optional) Model name to use (default: vibevoice-diarize)
"""
import os
import io
import json
import logging
from typing import Dict, List, Any, Optional
import httpx
# Module-level logger used by everything in this file. The name is a fixed
# string ("scraibe.localai_client") rather than being derived from __name__.
logger = logging.getLogger("scraibe.localai_client")
class LocalAIError(Exception):
    """Raised when the LocalAI API returns an error or unexpected response.

    Within this module it is also raised for client-side problems detected
    before a request is sent: a missing LOCALAI_API_URL configuration and an
    audio file that does not exist.
    """
class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.

    Responsibilities:
        - Read configuration from environment.
        - Upload audio file as multipart/form-data.
        - Parse diarization + transcription response.
        - Map response into the same structure expected by Scraibe's Transcript.

    The instance owns an ``httpx.Client``. Release it explicitly with
    ``close()`` or use the instance as a context manager
    (``with LocalAIClient(...) as client:``).
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 600.0,
    ):
        """
        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model: Model name (e.g., vibevoice-diarize).
                Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.

        Raises:
            LocalAIError: If no server URL is available from the argument
                or the environment (or it is blank).
        """
        # The trailing `or ""` keeps this a string when neither the argument
        # nor the env var is set; without it `.strip()` would be called on
        # None and raise AttributeError instead of the LocalAIError below.
        resolved_url = api_url or os.getenv("LOCALAI_API_URL") or ""
        self.api_url = resolved_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout

        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. "
                "Provide the LocalAI server URL via environment or constructor."
            )

        logger.info(
            "Initializing LocalAIClient: url=%s model=%s",
            self.api_url,
            self.model,
        )
        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def __enter__(self) -> "LocalAIClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the underlying HTTP client when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # `_client` does not exist if __init__ raised before creating it.
        http_client = getattr(self, "_client", None)
        if http_client is None:
            return
        try:
            http_client.close()
        except Exception:
            # Deliberate best-effort: finalizers must never raise.
            pass

    def _build_form_data(
        self,
        response_format: Optional[str],
        include_text: Optional[bool],
        language: Optional[str],
        tuning: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Assemble the multipart form fields for a diarization request.

        Args:
            response_format: Requested format; None means "verbose_json".
            include_text: Whether to request per-segment text; None means True.
            language: Language hint, forwarded unchanged when not None.
            tuning: Optional numeric hints keyed by form-field name. Entries
                whose value is None are omitted; the rest are sent as str().

        Returns:
            The form-field dict, always containing "model",
            "response_format" and "include_text".
        """
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
            include_text = True

        data: Dict[str, Any] = {
            "model": self.model,
            "response_format": response_format,
            # Sent as the lowercase string "true"/"false".
            "include_text": str(include_text).lower(),
        }
        if language is not None:
            data["language"] = language
        data.update(
            {field: str(value) for field, value in tuning.items() if value is not None}
        )
        return data

    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return a dict
        in the same style as the previous internal diarization output:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Extra kwargs that the old UI used (e.g., whisper-specific) are
        accepted but ignored.

        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                Defaults to "verbose_json" if not set. The response body is
                always parsed as JSON here, so a non-JSON body (presumably
                what "rttm" yields -- confirm against the LocalAI API)
                results in LocalAIError.
            include_text: Whether to request per-segment text.
                Defaults to True.
            verbose: If True, prints progress messages.

        Returns:
            Dict with parallel lists "segments", "speakers", "transcripts".

        Raises:
            LocalAIError: If the audio file does not exist, the server
                answers with status >= 400, the body is not valid JSON, or
                the JSON does not have the expected shape.

        Note:
            Transport-level exceptions raised by httpx (connection failure,
            timeout, ...) are not caught here and propagate unchanged.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")
        logger.info("diarize_and_transcribe requested for: %s", audio_path)

        data = self._build_form_data(
            response_format,
            include_text,
            language,
            {
                "num_speakers": num_speakers,
                "min_speakers": min_speakers,
                "max_speakers": max_speakers,
                "clustering_threshold": clustering_threshold,
                "min_duration_on": min_duration_on,
                "min_duration_off": min_duration_off,
            },
        )
        logger.debug("LocalAI request params: %s", data)

        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # Open first and translate only "not found": this avoids the
        # exists()-then-open() race while other OSErrors still propagate.
        try:
            audio_file = open(audio_path, "rb")
        except FileNotFoundError as err:
            raise LocalAIError(f"Audio file not found: {audio_path}") from err

        # The file handle is streamed by the upload, so it must stay open
        # until the POST has completed.
        with audio_file:
            files = {
                "file": (
                    os.path.basename(audio_path),
                    audio_file,
                    "application/octet-stream",
                )
            }
            # POST /v1/audio/diarization
            logger.info("Sending request to LocalAI: /v1/audio/diarization")
            resp = self._client.post(
                "/v1/audio/diarization",
                data=data,
                files=files,
                headers=headers,
            )

        logger.info("LocalAI response status: %d", resp.status_code)
        if resp.status_code >= 400:
            body = resp.text
            logger.error("LocalAI error response: %s", body)
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )

        try:
            result = resp.json()
        except ValueError as err:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError, so malformed or mis-encoded bodies land here.
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            ) from err

        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")
        return self._parse_diarization_response(result)

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI response into the internal format used by Scraibe:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Missing or null fields fall back to defaults: 0.0 for "start"/"end",
        "SPEAKER_00" for "speaker", and "" for "text". Text is stripped of
        surrounding whitespace.

        Raises:
            LocalAIError: If the response or one of its segments is not a
                JSON object, or a start/end value is not numeric.
        """
        if not isinstance(result, dict):
            logger.error(
                "Unexpected LocalAI response type: %s", type(result).__name__
            )
            raise LocalAIError(
                "Unexpected LocalAI response: expected a JSON object, "
                f"got {type(result).__name__}."
            )

        # `or []` also covers an explicit `"segments": null`.
        segments = result.get("segments") or []
        if not segments:
            logger.warning("LocalAI returned no segments.")
            return {
                "segments": [],
                "speakers": [],
                "transcripts": [],
            }

        out_segments = []
        out_speakers = []
        out_transcripts = []
        for seg in segments:
            if not isinstance(seg, dict):
                raise LocalAIError(
                    f"Unexpected segment in LocalAI response: {seg!r}"
                )
            try:
                # `or 0.0` maps JSON null to the same default as a missing key.
                start = float(seg.get("start") or 0.0)
                end = float(seg.get("end") or 0.0)
            except (TypeError, ValueError) as err:
                raise LocalAIError(
                    f"Invalid segment timing in LocalAI response: {seg!r}"
                ) from err

            speaker = seg.get("speaker")
            if speaker is None:
                speaker = "SPEAKER_00"

            # A null text (e.g. when text was not requested) becomes "".
            text = str(seg.get("text") or "").strip()

            out_segments.append([start, end])
            out_speakers.append(speaker)
            out_transcripts.append(text)

        logger.info(
            "Parsed %d segments from LocalAI.",
            len(out_segments),
        )
        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }