Initial commit: LocalAI-backed ScrAIbe with summarization
This commit is contained in:
@@ -0,0 +1,237 @@
|
||||
"""
|
||||
LocalAI Client Module
|
||||
---------------------
|
||||
|
||||
This module provides a client for communicating with a LocalAI server
|
||||
running vibevoice.cpp for transcription and speaker diarization.
|
||||
|
||||
It replaces the previous local Whisper + Pyannote pipeline by sending
|
||||
audio files to the /v1/audio/diarization endpoint and mapping the
|
||||
response into the same Transcript format used by the UI.
|
||||
|
||||
Environment Variables:
|
||||
LOCALAI_API_URL: (required) Base URL of the LocalAI server
|
||||
(e.g., http://localhost:8080)
|
||||
LOCALAI_API_KEY: (optional) API key, if configured
|
||||
LOCALAI_MODEL: (optional) Model name to use (default: vibevoice-diarize)
|
||||
"""
|
||||
|
||||
import os
|
||||
import io
|
||||
import json
|
||||
from typing import Dict, List, Any, Optional
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
class LocalAIError(Exception):
    """Raised when the LocalAI API returns an error or unexpected response."""
|
||||
|
||||
|
||||
class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.

    Responsibilities:
    - Read configuration from environment.
    - Upload audio file as multipart/form-data.
    - Parse diarization + transcription response.
    - Map response into the same structure expected by Scraibe's Transcript.

    The client owns an ``httpx.Client``; release it with ``close()`` or by
    using the instance as a context manager.
    """

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 600.0,
    ):
        """
        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                     Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model: Model name (e.g., vibevoice-diarize).
                   Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.

        Raises:
            LocalAIError: If no server URL is given and LOCALAI_API_URL is
                unset or blank.
        """
        # Fall back to "" so a missing URL reaches the explicit check below
        # instead of failing with AttributeError on None.strip().
        raw_url = api_url or os.getenv("LOCALAI_API_URL") or ""
        self.api_url = raw_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout

        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. "
                "Provide the LocalAI server URL via environment or constructor."
            )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def __enter__(self):
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb):
        """Close the HTTP client on leaving the ``with`` block."""
        self.close()

    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()

    def __del__(self):
        # ``_client`` is missing when __init__ raised before creating it.
        client = getattr(self, "_client", None)
        if client is None:
            return
        try:
            client.close()
        except Exception:
            # Deliberate best-effort: a finalizer must never raise.
            pass

    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return a dict
        in the same style as the previous internal diarization output:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Extra kwargs that the old UI used (e.g., whisper-specific) are
        accepted but ignored.

        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                             Defaults to "verbose_json" if not set.
                             The response body must be JSON to be parsed.
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.

        Returns:
            Dict with parallel "segments", "speakers" and "transcripts" lists.

        Raises:
            LocalAIError: If the audio file does not exist, the server
                answers with status >= 400, or the body is not valid JSON
                of the expected shape. Exceptions raised by the httpx
                client itself are not caught here.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        # Defaults: use verbose_json + include_text to get both diarization
        # and transcription.
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
            include_text = True

        # Required form fields; the boolean is sent as "true"/"false".
        data = {
            "model": self.model,
            "response_format": response_format,
            "include_text": str(include_text).lower(),
        }

        if language is not None:
            data["language"] = language

        # Optional tuning parameters are forwarded as strings only when set.
        optional_fields = {
            "num_speakers": num_speakers,
            "min_speakers": min_speakers,
            "max_speakers": max_speakers,
            "clustering_threshold": clustering_threshold,
            "min_duration_on": min_duration_on,
            "min_duration_off": min_duration_off,
        }
        for field_name, field_value in optional_fields.items():
            if field_value is not None:
                data[field_name] = str(field_value)

        # isfile() also rejects directories, which open() could not read.
        if not os.path.isfile(audio_path):
            raise LocalAIError(f"Audio file not found: {audio_path}")

        headers = {}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        # The upload must happen while the file handle is still open, so the
        # POST stays inside the ``with`` block.
        with open(audio_path, "rb") as audio_file:
            files = {
                "file": (
                    os.path.basename(audio_path),
                    audio_file,
                    "application/octet-stream",
                )
            }
            # POST /v1/audio/diarization
            resp = self._client.post(
                "/v1/audio/diarization",
                data=data,
                files=files,
                headers=headers,
            )

        if resp.status_code >= 400:
            body = resp.text
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )

        try:
            result = resp.json()
        except ValueError as exc:
            # json.JSONDecodeError and UnicodeDecodeError both derive from
            # ValueError; keep the original cause attached.
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            ) from exc

        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")

        return self._parse_diarization_response(result)

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI response into the internal format used by Scraibe:

            {
                "segments": [ [start, end], ... ],
                "speakers": [ "SPEAKER_00", ... ],
                "transcripts": [ "text for segment", ... ]
            }

        Missing or null fields fall back to 0.0 for "start"/"end",
        "SPEAKER_00" for "speaker" and "" for "text".

        Args:
            result: Decoded JSON body of the diarization response.

        Returns:
            Dict with three parallel lists; all empty when the response
            carries no segments.

        Raises:
            LocalAIError: If the payload or one of its segments is not a
                JSON object.
        """
        if not isinstance(result, dict):
            raise LocalAIError(
                "Unexpected LocalAI response: expected a JSON object."
            )

        # ``or []`` also covers an explicit null "segments" value.
        segments = result.get("segments") or []

        out_segments: List[List[float]] = []
        out_speakers: List[Any] = []
        out_transcripts: List[str] = []

        for seg in segments:
            if not isinstance(seg, dict):
                raise LocalAIError(
                    "Unexpected LocalAI response: segment is not a JSON object."
                )

            start = seg.get("start")
            end = seg.get("end")
            speaker = seg.get("speaker")
            text = seg.get("text")

            # Explicit None checks so that JSON nulls get the same defaults
            # as absent keys, without discarding falsy-but-valid values.
            out_segments.append([
                float(0.0 if start is None else start),
                float(0.0 if end is None else end),
            ])
            out_speakers.append("SPEAKER_00" if speaker is None else speaker)
            out_transcripts.append("" if text is None else str(text).strip())

        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }
|
||||
Reference in New Issue
Block a user