c43076efd4
- LocalAI client timeout: 600s -> 3600s - Summarizer timeout: 600s -> 3600s - Add task_time_limit=14400s (4h) and soft_time_limit=13500s to transcription task
260 lines
8.5 KiB
Python
260 lines
8.5 KiB
Python
"""
LocalAI Client Module
---------------------

This module provides a client for communicating with a LocalAI server
running vibevoice.cpp for transcription and speaker diarization.

It replaces the previous local Whisper + Pyannote pipeline by sending
audio files to the /v1/audio/diarization endpoint and mapping the
response into the same Transcript format used by the UI.

Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
                     (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL:   (optional) Model name to use (default: vibevoice-diarize)
"""
|
|
|
|
import os
|
|
import io
|
|
import json
|
|
import logging
|
|
from typing import Dict, List, Any, Optional
|
|
|
|
import httpx
|
|
|
|
# Module-level logger used by the client below for request, response and
# parsing messages.
logger = logging.getLogger("scraibe.localai_client")
|
|
|
|
|
|
class LocalAIError(Exception):
    """Signal a failed or unusable exchange with the LocalAI API.

    Used for error responses and responses that cannot be interpreted, as
    well as for missing client configuration or a missing input file.
    """
|
|
|
|
|
|
class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.

    Responsibilities:
    - Read configuration from environment.
    - Upload audio file as multipart/form-data.
    - Parse diarization + transcription response (verbose_json).
    - Map response into the same structure expected by Scraibe's Transcript.

    The instance owns an ``httpx.Client``. Release it with ``close()`` or by
    using the instance as a context manager (``with LocalAIClient() as c:``).
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 3600.0,
    ):
        """
        Resolve the configuration and create the underlying HTTP client.

        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                     Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model: Model name (e.g., vibevoice-diarize).
                   Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.

        Raises:
            LocalAIError: If no server URL is given and LOCALAI_API_URL is
                unset or blank.
        """
        # The trailing `or ""` keeps .strip() from being called on None when
        # neither the argument nor the environment variable is present, so
        # the explicit LocalAIError below is what the caller sees.
        raw_url = api_url or os.getenv("LOCALAI_API_URL") or ""
        self.api_url = raw_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout

        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. "
                "Provide the LocalAI server URL via environment or constructor."
            )

        logger.info(
            "Initializing LocalAIClient: url=%s model=%s",
            self.api_url,
            self.model,
        )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def __enter__(self) -> "LocalAIClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the HTTP client when the ``with`` block ends."""
        self.close()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # Best-effort cleanup: __init__ may have raised before _client was
        # assigned, and finalizers must never propagate exceptions.
        try:
            client = getattr(self, "_client", None)
            if client is not None:
                client.close()
        except Exception:
            pass

    @staticmethod
    def _build_form_data(
        model: str,
        *,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        language: Optional[str] = None,
        **tuning: Any,
    ) -> Dict[str, str]:
        """Build the multipart form fields for a diarization request.

        ``response_format`` defaults to "verbose_json" and ``include_text``
        to True. ``language`` is forwarded unchanged when set. Every entry in
        ``tuning`` whose value is not None is forwarded as ``str(value)``.
        """
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
            include_text = True

        data: Dict[str, str] = {
            "model": model,
            "response_format": response_format,
            "include_text": str(include_text).lower(),
        }
        if language is not None:
            data["language"] = language
        for field, value in tuning.items():
            if value is not None:
                data[field] = str(value)
        return data

    @staticmethod
    def _normalize_segment(seg: Dict[str, Any]) -> tuple:
        """Convert one response segment into ``([start, end], speaker, text)``.

        Missing or null ``start``/``end`` become 0.0, a missing or null
        ``speaker`` becomes "SPEAKER_00", and a missing or null ``text``
        becomes "". Other speaker values are passed through unchanged.

        Raises:
            TypeError, ValueError: If ``start`` or ``end`` cannot be converted
                to float.
        """
        start = seg.get("start")
        end = seg.get("end")
        speaker = seg.get("speaker")
        text = seg.get("text")

        span = [
            0.0 if start is None else float(start),
            0.0 if end is None else float(end),
        ]
        # Compare against None explicitly so a falsy but valid label (such as
        # the integer 0) is not replaced by the default.
        if speaker is None:
            speaker = "SPEAKER_00"
        text = "" if text is None else str(text).strip()
        return span, speaker, text

    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        return_raw: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return:
        - A normalized dict with segments, speakers, transcripts.
        - Optionally, the raw verbose_json response (for JSON export).

        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                             Defaults to "verbose_json".
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
            return_raw: If True, also return the raw API response in 'raw_result'.
            **_ignored: Extra keyword arguments are accepted and discarded.

        Returns:
            Dict with the keys "segments", "speakers" and "transcripts", plus
            "raw_result" when ``return_raw`` is True.

        Raises:
            LocalAIError: If ``audio_path`` is not an existing file, the
                server answers with a status >= 400, the body is not valid
                JSON, or the JSON does not have the expected shape.

        Note:
            The response body is always parsed as JSON, so a reply in another
            format fails with LocalAIError. Exceptions raised by httpx while
            sending the request are not intercepted here.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        logger.info("diarize_and_transcribe requested for: %s", audio_path)

        data = self._build_form_data(
            self.model,
            response_format=response_format,
            include_text=include_text,
            language=language,
            num_speakers=num_speakers,
            min_speakers=min_speakers,
            max_speakers=max_speakers,
            clustering_threshold=clustering_threshold,
            min_duration_on=min_duration_on,
            min_duration_off=min_duration_off,
        )
        logger.debug("LocalAI request params: %s", data)

        # isfile (not exists) so that a directory path is reported through
        # LocalAIError instead of failing inside open().
        if not os.path.isfile(audio_path):
            raise LocalAIError(f"Audio file not found: {audio_path}")

        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # The request must be sent while the file handle is still open,
        # because httpx reads the upload from it during the POST.
        with open(audio_path, "rb") as audio_file:
            files = {
                "file": (
                    os.path.basename(audio_path),
                    audio_file,
                    "application/octet-stream",
                )
            }
            logger.info("Sending request to LocalAI: /v1/audio/diarization")
            resp = self._client.post(
                "/v1/audio/diarization",
                data=data,
                files=files,
                headers=headers,
            )

        logger.info("LocalAI response status: %d", resp.status_code)

        if resp.status_code >= 400:
            body = resp.text
            logger.error("LocalAI error response: %s", body)
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )

        try:
            raw_result = resp.json()
        except ValueError as err:
            # json.JSONDecodeError is a ValueError subclass; catching the base
            # class also covers undecodable bodies.
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            ) from err

        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")

        parsed = self._parse_diarization_response(raw_result)

        if return_raw:
            parsed["raw_result"] = raw_result

        return parsed

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI verbose_json response into the internal format used by Scraibe:
        {
            "segments": [ [start, end], ... ],
            "speakers": [ "SPEAKER_00", ... ],
            "transcripts": [ "text for segment", ... ]
        }

        The three lists are index-aligned: entry i of each list describes the
        same segment. A response without segments yields three empty lists.

        Raises:
            LocalAIError: If the response is not a JSON object, "segments" is
                not a list of objects, or a segment has non-numeric times.
        """
        if not isinstance(result, dict):
            raise LocalAIError(
                "Unexpected LocalAI response: expected a JSON object, got "
                f"{type(result).__name__}."
            )

        segments = result.get("segments")

        if not segments:
            logger.warning("LocalAI returned no segments.")
            return {
                "segments": [],
                "speakers": [],
                "transcripts": [],
            }

        if not isinstance(segments, list):
            raise LocalAIError(
                "Unexpected LocalAI response: 'segments' is not a list."
            )

        out_segments = []
        out_speakers = []
        out_transcripts = []

        for index, seg in enumerate(segments):
            if not isinstance(seg, dict):
                raise LocalAIError(
                    f"Unexpected LocalAI response: segment {index} is not an object."
                )
            try:
                span, speaker, text = self._normalize_segment(seg)
            except (TypeError, ValueError) as err:
                raise LocalAIError(
                    f"Unexpected LocalAI response: segment {index} has invalid "
                    "start/end times."
                ) from err

            out_segments.append(span)
            out_speakers.append(speaker)
            out_transcripts.append(text)

        logger.info(
            "Parsed %d segments from LocalAI.",
            len(out_segments),
        )

        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }
|