Initial commit: LocalAI-backed ScrAIbe with summarization

2026-06-13 16:38:59 +00:00
parent 46d119b63b
commit 574124558b
10 changed files with 992 additions and 594 deletions
@@ -1,44 +1,43 @@
-#pytorch Image
+# Lightweight Python base image (no GPU/PyTorch needed)
-FROM pytorch/pytorch:2.3.1-cuda12.1-cudnn8-runtime
+FROM python:3.11-slim
 # Labels
 LABEL maintainer="Jacob Schmieder"
 LABEL email="Jacob.Schmieder@dbfz.de"
 LABEL version="0.1.1.dev"
-LABEL description="Scraibe is a tool for automatic speech recognition and speaker diarization. \
+LABEL description="Scraibe: LocalAI-backed transcription and diarization client with summarization. \
-                    It is based on the Hugging Face Transformers library and the Pyannote library. \
+                    Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
                    It is designed to be used with the Whisper model, a lightweight model for automatic \
                    speech recognition and speaker diarization."
 LABEL url="https://github.com/JSchmie/ScrAIbe"
-# Install dependencies
+# Install system dependencies (ffmpeg required)
-WORKDIR /app
+RUN apt update -y && \
-#Enviorment dependencies
+    apt install -y --no-install-recommends ffmpeg && \
 ENV TRANSFORMERS_CACHE=/app/models
 ENV HF_HOME=/app/models
 ENV AUTOT_CACHE=/app/models
 ENV PYANNOTE_CACHE=/app/models/pyannote
 #Copy all necessary files 
 COPY requirements.txt /app/requirements.txt
 COPY README.md /app/README.md
 COPY scraibe /app/scraibe
 #Installing all necessary dependencies and running the application with a personalised Hugging-Face-Token
 RUN apt update -y && apt upgrade -y && \
    apt install -y libsm6 libxrender1 libfontconfig1 && \
    apt clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
-RUN conda update --all && \
+# Working directory
-    # conda install -y pip ffmpeg && \
+WORKDIR /app
-    conda install -c conda-forge libsndfile && \
+
-    conda clean --all -y
+# Environment variables for LocalAI (transcription/diarization)
-# RUN pip install torchaudio==0.11.0+cu113 -f https://download.pytorch.org/whl/torch_stable.html
+# Set these via docker run -e or docker-compose
 ENV LOCALAI_API_URL=http://localhost:8080
 ENV LOCALAI_API_KEY=
 ENV LOCALAI_MODEL=vibevoice-diarize
 # Environment variables for Summarizer LLM
 ENV SUMMARIZER_API_URL=http://localhost:8080
 ENV SUMMARIZER_API_KEY=
 ENV SUMMARIZER_MODEL=llama-3.1-8b-instruct
 # Copy and install Python dependencies
 COPY requirements.txt /app/requirements.txt
 RUN pip install --no-cache-dir -r requirements.txt
-# Expose port
+# Copy application code
-EXPOSE 7860
+COPY scraibe /app/scraibe
 # Run the application
 # Expose port (if UI is served)
 EXPOSE 7860
 # Run the application
 ENTRYPOINT ["python3", "-m", "scraibe.cli"]
@@ -5,38 +5,42 @@ build-backend = "poetry_dynamic_versioning.backend"
 [tool.poetry]
 name = "scraibe"
 version = "0.0.0"
-description = "Transcription tool for audio files based on Whisper and Pyannote"
+description = "LocalAI-backed transcription and diarization client using vibevoice.cpp"
 authors = ["Schmieder, Jacob <jacob.schmieder@dbfz.de>"]
 license = "GPL-3.0-or-later"
 readme = ["README.md", "LICENSE"]
 repository = "https://github.com/JSchmie/ScAIbe"
 documentation = "https://jschmie.github.io/ScrAIbe/"
-keywords = ["transcription", "audio", "whisper", "pyannote", "speech-to-text", "speech-recognition"]
+keywords = [
    "transcription",
    "audio",
    "diarization",
    "localai",
    "vibevoice",
    "speech-to-text",
 ]
 classifiers = [
-                'Development Status :: 4 - Beta',
+    "Development Status :: 4 - Beta",
-                'Intended Audience :: Developers',
+    "Intended Audience :: Developers",
-                'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
+    "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
-                'Programming Language :: Python :: 3.8',
+    "Programming Language :: Python :: 3.9",
-                'Programming Language :: Python :: 3.9',
+    "Programming Language :: Python :: 3.10",
-                'Programming Language :: Python :: 3.10',
+    "Programming Language :: Python :: 3.11",
-                'Programming Language :: Python :: 3.11',
+    "Programming Language :: Python :: 3.12",
-                'Environment :: GPU :: NVIDIA CUDA :: 12 :: 12.1',
+    "Topic :: Scientific/Engineering :: Artificial Intelligence",
                'Topic :: Scientific/Engineering :: Artificial Intelligence'
 ]
 packages = [{include = "scraibe"}]
 exclude = [
    "__pycache__",
    "*.pyc",
-        "test"
+    "test",
 ]
 [tool.poetry.dependencies]
 python = "^3.9"
 tqdm = "^4.66.5"
 numpy = "^1.26.4"
-openai-whisper = ">=20231117,<20240931"
+httpx = ">=0.28.0"
 faster-whisper = "^1.0.3"
 "pyannote.audio" = "^3.3.1"
 torch = "^2.1.2"
 [tool.poetry.group.dev.dependencies]
 pytest = "^8.1.1"
@@ -69,5 +73,5 @@ scraibe = "scraibe.cli:cli"
 app = ["scraibe-webui"]
 [tool.ruff.lint.extend-per-file-ignores]
-"__init__.py" = ["E402","F403",'F401']
+"__init__.py" = ["E402", "F403", "F401"]
 "scraibe/misc.py" = ["E722"]
@@ -1,14 +1,3 @@
 tqdm>=4.66.5
 numpy>=1.26.4
-
+httpx>=0.28.0
 openai-whisper==20231117
 faster-whisper~=1.0.3
 pyannote.audio~=3.3.1
 pyannote.core~=5.0.0
 pyannote.database~=5.0.1 
 pyannote.metrics~=3.2.1
 pyannote.pipeline~=3.0.1
 torchaudio>=2.1.2
@@ -1,11 +1,10 @@
-from .autotranscript import *
+from .autotranscript import Scraibe
-from .transcriber import *
+from .localai_client import LocalAIClient, LocalAIError
-from .audio import *
+from .summarizer import SummarizerClient, SummarizerError
-from .transcript_exporter import *
+from .audio import AudioProcessor
-from .diarisation import *
+from .transcript_exporter import Transcript
 from .misc import set_threads, ParseKwargs
-from .misc import *
+from .cli import cli
 from .cli import *
 from ._version import __version__
@@ -2,28 +2,15 @@
 Audio Processor Module
 =======================
-This module provides the AudioProcessor class, utilizing PyTorchaudio for handling audio files.
+Simplified audio processor for ScrAIbe.
 It includes functionalities to load, cut, and manage audio waveforms, offering efficient and
 flexible audio processing.
-Available Classes:
+Previously this used torch and pyannote-style processing. In the LocalAI-backed
- AudioProcessor: Processes audio waveforms and provides methods for loading, 
+version, we primarily pass files to the API, but we keep a lightweight helper
-                    cutting, and handling audio.
+for backward compatibility.
 Usage:
    from .audio_import AudioProcessor
    processor = AudioProcessor.from_file("path/to/audiofile.wav")
    cut_waveform = processor.cut(start=1.0, end=5.0)
 Constants:
 - SAMPLE_RATE (int): Default sample rate for processing.
 - NORMALIZATION_FACTOR (float): Normalization factor for audio waveform.
 """
 from subprocess import CalledProcessError, run
 import numpy as np
 import torch
 SAMPLE_RATE = 16000
 NORMALIZATION_FACTOR = 32768.0
@@ -31,38 +18,25 @@ NORMALIZATION_FACTOR = 32768.0
 class AudioProcessor:
    """
-    Audio Processor class that leverages PyTorchaudio to provide functionalities
+    Lightweight audio processor for loading and cutting audio.
    for loading, cutting, and handling audio waveforms.
    Attributes:
-        waveform: torch.Tensor
+        waveform (np.ndarray): The audio waveform as float32.
-            The audio waveform tensor.
+        sr (int): The sample rate of the audio.
        sr: int
            The sample rate of the audio.
    """
    def __init__(self, waveform: torch.Tensor,
                 sr: int = SAMPLE_RATE) -> None:
        """
        Initialize the AudioProcessor object.
        Args:
            waveform (torch.Tensor): The audio waveform tensor.
            sr (int, optional): The sample rate of the audio. Defaults to SAMPLE_RATE.
        Raises:
            ValueError: If the provided sample rate is not of type int.
    """
    def __init__(self, waveform: np.ndarray, sr: int = SAMPLE_RATE):
        self.waveform = waveform
        self.sr = sr
        if not isinstance(self.sr, int):
-            raise ValueError("Sample rate should be a single value of type int,"
+            raise ValueError(
-                             f"not {len(self.sr)} and type {type(self.sr)}")
+                "Sample rate should be a single value of type int, "
                f"not {len(self.sr)} and type {type(self.sr)}"
            )
    @classmethod
-    def from_file(cls, file: str, *args, **kwargs) -> 'AudioProcessor':
+    def from_file(cls, file: str, *args, **kwargs):
        """
        Create an AudioProcessor instance from an audio file.
@@ -70,55 +44,42 @@ class AudioProcessor:
            file (str): The audio file path.
        Returns:
-            AudioProcessor: An instance of the AudioProcessor class containing the loaded audio.
+            AudioProcessor: Instance with loaded audio.
        """
        audio, sr = cls.load_audio(file, *args, **kwargs)
        audio = torch.from_numpy(audio)
        return cls(audio, sr)
-    def cut(self, start: float, end: float) -> torch.Tensor:
+    def cut(self, start: float, end: float) -> np.ndarray:
        """
-        Cut a segment from the audio waveform between the specified start and end times.
+        Cut a segment from the audio waveform.
        Args:
            start (float): Start time in seconds.
            end (float): End time in seconds.
        Returns:
-            torch.Tensor: The cut waveform segment.
+            np.ndarray: The cut waveform segment.
        """
-
+        start_idx = int(start * self.sr)
-        start = int(start * self.sr)
+        end_idx = int(np.ceil(end * self.sr))
-        if (isinstance(end, float) or isinstance(end, int)) and isinstance(self.sr, int):
+        return self.waveform[start_idx:end_idx]
            end = int(np.ceil(end * self.sr))
        else:
            end = int(torch.ceil(end * self.sr))
        return self.waveform[start:end]
    @staticmethod
    def load_audio(file: str, sr: int = SAMPLE_RATE):
        """
-        Open an audio file and read it as a mono waveform, resampling if necessary.
+        Load an audio file as a mono waveform, resampling if necessary.
-        This method ensures compatibility with pyannote.audio
+        Requires ffmpeg in PATH.
        and requires the ffmpeg CLI in PATH.
        Args:
            file (str): The audio file to open.
-            sr (int, optional): The desired sample rate. Defaults to SAMPLE_RATE.
+            sr (int, optional): The desired sample rate.
        Returns:
-            tuple: A NumPy array containing the audio waveform in float32 dtype
+            tuple: (waveform as np.ndarray[float32], sample rate)
                    and the sample rate.
        Raises:
            RuntimeError: If failed to load audio.
        """
        # This launches a subprocess to decode audio while down-mixing
        # and resampling as necessary.  Requires the ffmpeg CLI in PATH.
        # fmt: off
        cmd = [
            "ffmpeg",
            "-nostdin",
@@ -128,19 +89,20 @@ class AudioProcessor:
            "-ac", "1",
            "-acodec", "pcm_s16le",
            "-ar", str(sr),
-            "-"
+            "-",
        ]
        # fmt: on
        try:
            out = run(cmd, capture_output=True, check=True).stdout
        except CalledProcessError as e:
            raise RuntimeError(
-                f"Failed to load audio: {e.stderr.decode()}") from e
+                f"Failed to load audio: {e.stderr.decode()}"
            ) from e
-        out = np.frombuffer(out, np.int16).flatten().astype(
+        waveform = np.frombuffer(out, np.int16).flatten().astype(
-            np.float32) / NORMALIZATION_FACTOR
+            np.float32
        ) / NORMALIZATION_FACTOR
-        return out, sr
+        return waveform, sr
    def __repr__(self) -> str:
-        return f'TorchAudioProcessor(waveform={len(self.waveform)}, sr={int(self.sr)})'
+        return f"AudioProcessor(waveform_len={len(self.waveform)}, sr={self.sr})"
@@ -1,358 +1,281 @@
 """
-Scraibe Class
+Scraibe Class (LocalAI-backed)
--------------------
+------------------------------
-This class serves as the core of the transcription system, responsible for handling
+Core class for transcription and (optionally) summarization.
 transcription and diarization of audio files. It leverages pretrained models for
 speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
 providing an accessible interface for audio processing tasks such as transcription,
 speaker separation, and timestamping.
-By encapsulating the complexities of underlying models, it allows for straightforward
+- Transcription and diarization are delegated to LocalAI (vibevoice.cpp).
-integration into various applications, ranging from transcription services to voice assistants.
+- Summarization is delegated to a separate LLM via /v1/chat/completions.
-Available Classes:
+Public tasks:
- Scraibe: Main class for performing transcription and diarization.
+- transcribe
-                  Includes methods for loading models, processing audio files,
+- transcript_and_summarize (transcribe + generate a detailed summary)
                  and formatting the transcription output.
-Usage:
+Previous task/whisper/pyannote-specific settings are kept for compatibility
-    from scraibe import Scraibe
+but ignored when not relevant.
    model = Scraibe()
    transcript = model.autotranscribe("path/to/audiofile.wav")
 """
 # Standard Library Imports
 import os
-from glob import iglob
+from typing import Union, Optional
 from subprocess import run
 from typing import TypeVar, Union
 from warnings import warn
-# Third-Party Imports
+from .localai_client import LocalAIClient, LocalAIError
-import torch
+from .summarizer import SummarizerClient, SummarizerError
 from numpy import ndarray
 from tqdm import trange
 # Application-Specific Imports
 from .audio import AudioProcessor
 from .diarisation import Diariser
 from .transcriber import Transcriber, load_transcriber, whisper
 from .transcript_exporter import Transcript
 from .misc import SCRAIBE_TORCH_DEVICE
 DiarisationType = TypeVar('DiarisationType')
 class Scraibe:
    """
-    Scraibe is a class responsible for managing the transcription and diarization of audio files.
+    Scraibe now:
-    It serves as the core of the transcription system, incorporating pretrained models
+    - Uses LocalAI for transcription + diarization.
-    for speech-to-text (such as Whisper) and speaker diarization (such as pyannote.audio),
+    - Uses a separate LLM for summarization (when requested).
    allowing for comprehensive audio processing.
-    Attributes:
+    Public methods:
-        transcriber (Transcriber): The transcriber object to handle transcription.
+    - transcribe(audio_file, ...)
-        diariser (Diariser): The diariser object to handle diarization.
+    - transcript_and_summarize(audio_file, ...)
    Methods:
        __init__: Initializes the Scraibe class with appropriate models.
        transcribe: Transcribes an audio file using the whisper model and pyannote diarization model.
        remove_audio_file: Removes the original audio file to avoid disk space issues or ensure data privacy.
        get_audio_file: Gets an audio file as an AudioProcessor object.
    """
-    def __init__(self,
+    def __init__(
-                 whisper_model: Union[bool, str, whisper] = None,
+        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        whisper_model: Union[bool, str] = None,
        whisper_type: str = "whisper",
-                 dia_model: Union[bool, str, DiarisationType] = None,
+        dia_model: Union[bool, str] = None,
-                 **kwargs) -> None:
+        use_auth_token: str = None,
-        """Initializes the Scraibe class.
+        verbose: bool = False,
        **kwargs,
    ) -> None:
        """
        Initialize Scraibe with LocalAI client and summarizer client.
        Args:
-            whisper_model (Union[bool, str, whisper], optional): 
+            api_url: LocalAI server URL for transcription/diarization.
-                                Path to whisper model or whisper model itself.
+                     Falls back to LOCALAI_API_URL env var.
-            whisper_type (str):
+            api_key: API key for LocalAI. Falls back to LOCALAI_API_KEY.
-                                Type of whisper model to load. "whisper" or "faster-whisper".
+            model: Model name for LocalAI (e.g., vibevoice-diarize).
-            diarisation_model (Union[bool, str, DiarisationType], optional): 
+                   Falls back to LOCALAI_MODEL env var.
                                Path to pyannote diarization model or model itself.
            **kwargs: Additional keyword arguments for whisper
                        and pyannote diarization models.
                    e.g.:
-                    - verbose: If True, the class will print additional information.
+            Summarizer uses:
-                    - save_kwargs: If True, the keyword arguments will be saved
+            - SUMMARIZER_API_URL
-                                    for autotranscribe. So you can unload the class and reload it again.
+            - SUMMARIZER_API_KEY
            - SUMMARIZER_MODEL
            These can be overridden via environment or via the transcript_and_summarize
            method if needed.
            Backward-compat (ignored):
            - whisper_model, whisper_type, dia_model, use_auth_token, etc.
        """
        self.verbose = verbose or kwargs.get("verbose", False)
-        if whisper_model is None:
+        try:
-            self.transcriber = load_transcriber(
+            self.client = LocalAIClient(
-                "medium", whisper_type, **kwargs)
+                api_url=api_url,
-        elif isinstance(whisper_model, str):
+                api_key=api_key,
-            self.transcriber = load_transcriber(
+                model=model,
-                whisper_model, whisper_type, **kwargs)
+            )
-        else:
+        except LocalAIError as e:
-            self.transcriber = whisper_model
+            raise LocalAIError(f"Failed to initialize LocalAI client: {e}")
-        if dia_model is None:
+        # Summarizer is lazy-initialized if needed
-            self.diariser = Diariser.load_model(**kwargs)
+        self._summarizer: Optional[SummarizerClient] = None
        elif isinstance(dia_model, str):
            self.diariser = Diariser.load_model(dia_model, **kwargs)
        else:
            self.diariser: Diariser = dia_model
        if kwargs.get("verbose"):
            print("Scraibe initialized all models successfully loaded.")
            self.verbose = True
        else:
            self.verbose = False
        # Save kwargs for autotranscribe if you want to unload the class and load it again.
        if kwargs.get('save_setup'):
            self.params = dict(whisper_model=whisper_model,
                               dia_model=dia_model,
                               **kwargs)
        else:
            self.params = {}
        self.device = kwargs.get(
            "device", SCRAIBE_TORCH_DEVICE)
    def autotranscribe(self, audio_file: Union[str, torch.Tensor, ndarray],
                       remove_original: bool = False,
                       **kwargs) -> Transcript:
        """
        Transcribes an audio file using the whisper model and pyannote diarization model.
        Args:
            audio_file (Union[str, torch.Tensor, ndarray]): 
                            Path to audio file or a tensor representing the audio.
            remove_original (bool, optional): If True, the original audio file will
                                                be removed after transcription.
            *args: Additional positional arguments for diarization and transcription.
            **kwargs: Additional keyword arguments for diarization and transcription.
        Returns:
            Transcript: A Transcript object containing the transcription,
                        which can be exported to different formats.
        """
        if kwargs.get("verbose"):
            self.verbose = kwargs.get("verbose")
        # Get audio file as an AudioProcessor object
        audio_file: AudioProcessor = self.get_audio_file(audio_file)
        # Prepare waveform and sample rate for diarization
        dia_audio = {
            "waveform": audio_file.waveform.reshape(1, len(audio_file.waveform)).to(self.device),
            "sample_rate": audio_file.sr
        }
        if self.verbose:
-            print("Starting diarisation.")
+            print("Scraibe initialized. Using LocalAI for transcription and diarization.")
-        diarisation = self.diariser.diarization(dia_audio, **kwargs)
+    def _ensure_summarizer(
-
+        self,
-        if not diarisation["segments"]:
+        api_url: Optional[str] = None,
-            print("No segments found. Try to run transcription without diarisation.")
+        api_key: Optional[str] = None,
-
+        model: Optional[str] = None,
-            transcript = self.transcriber.transcribe(
+    ) -> SummarizerClient:
                audio_file.waveform, **kwargs)
            final_transcript = {0: {"speakers": 'SPEAKER_01',
                                    "segments": [0, len(audio_file.waveform)],
                                    "text": transcript}}
            return Transcript(final_transcript)
        if self.verbose:
            print("Diarisation finished. Starting transcription.")
        # Transcribe each segment and store the results
        final_transcript = dict()
        for i in trange(len(diarisation["segments"]), desc="Transcribing", disable=not self.verbose):
            seg = diarisation["segments"][i]
            audio = audio_file.cut(seg[0], seg[1])
            transcript = self.transcriber.transcribe(audio, **kwargs)
            final_transcript[i] = {"speakers": diarisation["speakers"][i],
                                   "segments": seg,
                                   "text": transcript}
        # Remove original file if needed
        if remove_original:
            if kwargs.get("shred") is True:
                self.remove_audio_file(audio_file, shred=True)
            else:
                self.remove_audio_file(audio_file, shred=False)
        return Transcript(final_transcript)
    def diarization(self, audio_file: Union[str, torch.Tensor, ndarray],
                    **kwargs) -> dict:
        """
-        Perform diarization on an audio file using the pyannote diarization model.
+        Lazy-init summarizer client.
        """
        if self._summarizer is not None:
            return self._summarizer
        try:
            self._summarizer = SummarizerClient(
                api_url=api_url,
                api_key=api_key,
                model=model,
            )
        except SummarizerError as e:
            raise SummarizerError(f"Failed to initialize Summarizer client: {e}")
        return self._summarizer
    # -----------------
    # Primary public API
    # -----------------
    def transcribe(
        self,
        audio_file: Union[str],
        **kwargs,
    ) -> str:
        """
        Transcribe the provided audio file using LocalAI.
        Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
        all segment texts.
        Args:
-            audio_file (Union[str, torch.Tensor, ndarray]):
+            audio_file (str): Path to the audio file.
-                The audio source which can either be a path to the audio file or a tensor representation.
+            **kwargs: Additional keyword arguments (some forwarded, others ignored).
            **kwargs: 
                Additional keyword arguments for diarization.
        Returns:
-            dict: 
+            str: The concatenated transcribed text.
                A dictionary containing the results of the diarization process.
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
                raise FileNotFoundError(f"Audio file not found: {audio_file}")
        else:
            raise TypeError(
                "In LocalAI mode, audio_file must be a file path (str)."
            )
-        # Get audio file as an AudioProcessor object
+        verbose = kwargs.get("verbose", self.verbose)
        audio_file: AudioProcessor = self.get_audio_file(audio_file)
-        # Prepare waveform and sample rate for diarization
+        try:
-        dia_audio = {
+            result = self.client.diarize_and_transcribe(
-            "waveform": audio_file.waveform.reshape(1, len(audio_file.waveform)).to(self.device),
+                audio_path=audio_file,
-            "sample_rate": audio_file.sr
+                include_text=True,
                verbose=verbose,
                **kwargs,
            )
        except LocalAIError as e:
            raise LocalAIError(f"Error during LocalAI transcription: {e}")
        transcripts = result.get("transcripts", [])
        return " ".join(t.strip() for t in transcripts if t.strip())
    def transcript_and_summarize(
        self,
        audio_file: Union[str],
        *,
        summarizer_api_url: Optional[str] = None,
        summarizer_api_key: Optional[str] = None,
        summarizer_model: Optional[str] = None,
        **kwargs,
    ) -> dict:
        """
        Transcribe the audio file and generate a detailed summary.
        Steps:
        - Transcribe via LocalAI.
        - Build a plain-text transcript (with speaker labels).
        - Summarize the transcript using the configured LLM.
        Returns:
            dict with:
              - transcript: full transcript text (with speaker labels)
              - summary: final detailed summary (markdown-ready)
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
                raise FileNotFoundError(f"Audio file not found: {audio_file}")
        else:
            raise TypeError(
                "In LocalAI mode, audio_file must be a file path (str)."
            )
        verbose = kwargs.get("verbose", self.verbose)
        # 1) Get diarized + transcribed result
        try:
            result = self.client.diarize_and_transcribe(
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
                **kwargs,
            )
        except LocalAIError as e:
            raise LocalAIError(f"Error during LocalAI transcription: {e}")
        segments = result.get("segments", [])
        speakers = result.get("speakers", [])
        transcripts = result.get("transcripts", [])
        if not segments:
            return {
                "transcript": "",
                "summary": "No transcript content to summarize.",
            }
-        print("Starting diarisation.")
+        # 2) Build full transcript text with speaker labels
        lines = []
        for seg, speaker, text in zip(segments, speakers, transcripts):
            start, end = seg
            ts = self._format_timestamp(start)
            line = f"[{ts}] {speaker}: {text.strip()}"
            lines.append(line)
-        diarisation = self.diariser.diarization(dia_audio, **kwargs)
+        full_transcript = "\n\n".join(lines)
-        return diarisation
+        # 3) Summarize
        try:
            summarizer = self._ensure_summarizer(
                api_url=summarizer_api_url,
                api_key=summarizer_api_key,
                model=summarizer_model,
            )
        except SummarizerError as e:
            raise SummarizerError(f"Failed to initialize summarizer: {e}")
-    def transcribe(self, audio_file: Union[str, torch.Tensor, ndarray],
+        try:
-                   **kwargs):
+            summary = summarizer.summarize_transcript(full_transcript)
-        """
+        except SummarizerError as e:
-            Transcribe the provided audio file.
+            raise SummarizerError(f"Error during summarization: {e}")
-            Args:
+        return {
-                audio_file (Union[str, torch.Tensor, ndarray]):
+            "transcript": full_transcript,
-                    The audio source, which can either be a path or a tensor representation.
+            "summary": summary,
-                **kwargs: 
+        }
                    Additional keyword arguments for transcription.
-            Returns:
+    # -----------------
-                str:
+    # Helpers
-                    The transcribed text from the audio source.
+    # -----------------
        """
        audio_file: AudioProcessor = self.get_audio_file(audio_file)
        return self.transcriber.transcribe(audio_file.waveform, **kwargs)
    def update_transcriber(self, whisper_model: Union[str, whisper], **kwargs) -> None:
        """
        Update the transcriber model.
        Args:
            whisper_model (Union[str, whisper]):
                The new whisper model to use for transcription.
            **kwargs:
                Additional keyword arguments for the transcriber model.
            Returns:
                None
        """
        _old_model = self.transcriber.model_name
        if isinstance(whisper_model, str):
            self.transcriber = load_transcriber(whisper_model, **kwargs)
        elif isinstance(whisper_model, Transcriber):
            self.transcriber = whisper_model
        else:
            warn(
                f"Invalid model type. Please provide a valid model. Fallback to old {_old_model} Model.", RuntimeWarning)
        return None
    def update_diariser(self, dia_model: Union[str, DiarisationType], **kwargs) -> None:
        """
        Update the diariser model.
        Args:
            dia_model (Union[str, DiarisationType]):
                The new diariser model to use for diarization.
            **kwargs:
                Additional keyword arguments for the diariser model.
            Returns:
                None
        """
        if isinstance(dia_model, str):
            self.diariser = Diariser.load_model(dia_model, **kwargs)
        elif isinstance(dia_model, Diariser):
            self.diariser = dia_model
        else:
            warn("Invalid model type. Please provide a valid model. Fallback to old Model.", RuntimeWarning)
        return None
    @staticmethod
-    def remove_audio_file(audio_file: str,
+    def _format_timestamp(seconds: float) -> str:
                          shred: bool = False) -> None:
        """
-        Removes the original audio file to avoid disk space issues or ensure data privacy.
+        Format seconds into MM:SS or HH:MM:SS.
        """
        m, s = divmod(int(seconds), 60)
        h, m = divmod(m, 60)
        if h > 0:
            return f"{h:02d}:{m:02d}:{s:02d}"
        return f"{m:02d}:{s:02d}"
-        Args:
+    @staticmethod
-            audio_file_path (str): Path to the audio file.
+    def remove_audio_file(audio_file: str, shred: bool = False) -> None:
-            shred (bool, optional): If True, the audio file will be shredded,
+        """
-                                    not just removed.
+        Remove the original audio file.
        """
        if not os.path.exists(audio_file):
            raise ValueError(f"Audiofile {audio_file} does not exist.")
        if shred:
            import subprocess
            import warnings
            from glob import iglob
-            warn("Shredding audiofile can take a long time.", RuntimeWarning)
+            warnings.warn("Shredding audiofile can take a long time.", RuntimeWarning)
-            gen = iglob(f'{audio_file}', recursive=True)
+            gen = iglob(f"{audio_file}", recursive=True)
-            cmd = ['shred', '-zvu', '-n', '10', f'{audio_file}']
+            cmd = ["shred", "-zvu", "-n", "10", f"{audio_file}"]
            if os.path.isdir(audio_file):
                raise ValueError(f"Audiofile {audio_file} is a directory.")
            for file in gen:
-                print(f'shredding {file} now\n')
+                print(f"shredding {file} now\n")
-
+                subprocess.run(cmd, check=True)
                run(cmd, check=True)
        else:
            os.remove(audio_file)
            print(f"Audiofile {audio_file} removed.")
    @staticmethod
    def get_audio_file(audio_file: Union[str, torch.Tensor, ndarray]) -> AudioProcessor:
        """Gets an audio file as TorchAudioProcessor.
        Args:
            audio_file (Union[str, torch.Tensor, ndarray]): Path to the audio file or 
                                                        a tensor representing the audio.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.
        Returns:
            AudioProcessor: An object containing the waveform and sample rate in
                            torch.Tensor format.
        """
        if isinstance(audio_file, str):
            audio_file = AudioProcessor.from_file(audio_file)
        elif isinstance(audio_file, torch.Tensor):
            audio_file = AudioProcessor(audio_file[0], audio_file[1])
        elif isinstance(audio_file, ndarray):
            audio_file = AudioProcessor(torch.Tensor(audio_file[0]),
                                        audio_file[1])
        if not isinstance(audio_file, AudioProcessor):
            raise ValueError(f'Audiofile must be of type AudioProcessor,'
                             f'not {type(audio_file)}')
        return audio_file
    def __repr__(self):
-        return f"Scraibe(transcriber={self.transcriber}, diariser={self.diariser})"
+        return "Scraibe(LocalAI-backed)"
@@ -3,23 +3,21 @@ Command-Line Interface (CLI) for the Scraibe class,
 allowing for user interaction to transcribe and diarize audio files.
 The function includes arguments for specifying the audio files, model paths,
 output formats, and other options necessary for transcription.
 This version is adapted for LocalAI-based transcription and diarization.
 """
 import os
 import json
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
 from whisper.tokenizer import LANGUAGES, TO_LANGUAGE_CODE
 from torch.cuda import is_available
 from .autotranscript import Scraibe
 from .misc import set_threads
 def cli():
    """
    Command-Line Interface (CLI) for the Scraibe class, allowing for user interaction to transcribe
-    and diarize audio files. The function includes arguments for specifying the audio files, model paths, 
+    and diarize audio files via a LocalAI server.
    output formats, and other options necessary for transcription.
    This function can be executed from the command line to perform transcription tasks, providing a 
    user-friendly way to access the Scraibe class functionalities.
    """
    def str2bool(string):
@@ -28,59 +26,160 @@ def cli():
            return str2val[string]
        else:
            raise ValueError(
-                f"Expected one of {set(str2val.keys())}, got {string}")
+                f"Expected one of {set(str2val.keys())}, got {string}"
            )
    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
-    parser.add_argument("-f", "--audio-files", nargs="+", type=str, default=None,
+    parser.add_argument(
-                        help="List of audio files to transcribe.")
+        "-f",
        "--audio-files",
        nargs="+",
        type=str,
        default=None,
        help="List of audio files to transcribe.",
    )
-    parser.add_argument("--whisper-type", type=str, default="whisper",
+    # LocalAI connection (env vars preferred, but CLI overrides allowed)
    parser.add_argument(
        "--localai-api-url",
        type=str,
        default=None,
        help="LocalAI server URL (e.g., http://localhost:8080). "
             "Overrides LOCALAI_API_URL env var if provided.",
    )
    parser.add_argument(
        "--localai-api-key",
        type=str,
        default=None,
        help="LocalAI API key. Overrides LOCALAI_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--localai-model",
        type=str,
        default=None,
        help="Model name to use on LocalAI (e.g., vibevoice-diarize). "
             "Overrides LOCALAI_MODEL env var if provided.",
    )
    # Summarizer overrides (env vars are primary)
    parser.add_argument(
        "--summarizer-api-url",
        type=str,
        default=None,
        help="Summarization LLM API URL (e.g., http://localhost:8080). "
             "Overrides SUMMARIZER_API_URL env var if provided.",
    )
    parser.add_argument(
        "--summarizer-api-key",
        type=str,
        default=None,
        help="Summarization LLM API key. Overrides SUMMARIZER_API_KEY env var if provided.",
    )
    parser.add_argument(
        "--summarizer-model",
        type=str,
        default=None,
        help="Model name for summarization. Overrides SUMMARIZER_MODEL env var if provided.",
    )
    # Kept for backward compatibility with UI / existing scripts; ignored by LocalAI client.
    parser.add_argument(
        "--whisper-type",
        type=str,
        default="whisper",
        choices=["whisper", "faster-whisper"],
-                        help="Type of Whisper model to use ('whisper' or 'faster-whisper').")
+        help="[Backward compatibility] Type of Whisper model. Ignored when using LocalAI.",
    )
-    parser.add_argument("--whisper-model-name", default="medium",
+    parser.add_argument(
-                        help="Name of the Whisper model to use.")
+        "--whisper-model-name",
        default="medium",
        help="[Backward compatibility] Whisper model name. Ignored when using LocalAI.",
    )
-    parser.add_argument("--whisper-model-directory", type=str, default=None,
+    parser.add_argument(
-                        help="Path to save Whisper model files; defaults to ./models/whisper.")
+        "--whisper-model-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Whisper model directory. Ignored when using LocalAI.",
    )
-    parser.add_argument("--diarization-directory", type=str, default=None,
+    parser.add_argument(
-                        help="Path to the diarization model directory.")
+        "--diarization-directory",
        type=str,
        default=None,
        help="[Backward compatibility] Diarization model directory. Ignored when using LocalAI.",
    )
-    parser.add_argument("--hf-token", default=None, type=str,
+    parser.add_argument(
-                        help="HuggingFace token for private model download.")
+        "--hf-token",
        default=None,
        type=str,
        help="[Backward compatibility] HuggingFace token. Ignored when using LocalAI.",
    )
-    parser.add_argument("--inference-device",
+    parser.add_argument(
-                        default="cuda" if is_available() else "cpu",
+        "--inference-device",
-                        help="Device to use for PyTorch inference.")
+        default="cpu",
        help="[Backward compatibility] Device for inference. Ignored when using LocalAI.",
    )
-    parser.add_argument("--num-threads", type=int, default=None,
+    parser.add_argument(
-                        help="Number of threads used by torch for CPU inference; '\
+        "--num-threads",
-                            'overrides MKL_NUM_THREADS/OMP_NUM_THREADS.")
+        type=int,
        default=None,
        help="Number of threads used for CPU operations; overrides MKL_NUM_THREADS/OMP_NUM_THREADS.",
    )
-    parser.add_argument("--output-directory", "-o", type=str, default=".",
+    parser.add_argument(
-                        help="Directory to save the transcription outputs.")
+        "--output-directory",
        "-o",
        type=str,
        default=".",
        help="Directory to save the transcription outputs.",
    )
-    parser.add_argument("--output-format", "-of", type=str, default="txt",
+    parser.add_argument(
        "--output-format",
        "-of",
        type=str,
        default="txt",
        choices=["txt", "json", "md", "html"],
-                        help="Format of the output file; defaults to txt.")
+        help="Format of the output file; defaults to txt.",
    )
-    parser.add_argument("--verbose-output", type=str2bool, default=True,
+    parser.add_argument(
-                        help="Enable or disable progress and debug messages.")
+        "--verbose-output",
        type=str2bool,
        default=True,
        help="Enable or disable progress and debug messages.",
    )
-    parser.add_argument("--task", type=str, default='autotranscribe',
+    parser.add_argument(
-                        choices=["autotranscribe", "diarization",
+        "--task",
-                                 "autotranscribe+translate", "translate", 'transcribe'],
+        type=str,
-                        help="Choose to perform transcription, diarization, or translation. \
+        default="transcribe",
-                        If set to translate, the output will be translated to English.")
+        choices=[
            "transcribe",
            "transcript_and_summarize",
        ],
        help="Task to perform: 'transcribe' or 'transcript_and_summarize'.",
    )
-    parser.add_argument("--language", type=str, default=None,
+    parser.add_argument(
-                        choices=sorted(
+        "--language",
-                            LANGUAGES.keys()) + sorted([k.title() for k in TO_LANGUAGE_CODE.keys()]),
+        type=str,
-                        help="Language spoken in the audio. Specify None to perform language detection.")
+        default=None,
-    parser.add_argument("--num-speakers", type=int, default=2,
+        help="Language spoken in the audio. Specify None to perform language detection.",
-                        help="Number of speakers in the audio.")
+    )
    parser.add_argument(
        "--num-speakers",
        type=int,
        default=None,
        help="Number of speakers in the audio.",
    )
    args = parser.parse_args()
@@ -96,65 +195,64 @@ def cli():
    set_threads(arg_dict.pop("num_threads"))
-    class_kwargs = {'whisper_model': arg_dict.pop("whisper_model_name"),
+    # Build kwargs for Scraibe (LocalAI-backed)
-                    'whisper_type':arg_dict.pop("whisper_type"),
+    class_kwargs = {
-                    'dia_model': arg_dict.pop("diarization_directory"),
+        "api_url": arg_dict.pop("localai_api_url"),
-                    'use_auth_token': arg_dict.pop("hf_token"),
+        "api_key": arg_dict.pop("localai_api_key"),
        "model": arg_dict.pop("localai_model"),
        # kept for backward compatibility, but ignored:
        "whisper_model": arg_dict.pop("whisper_model_name"),
        "whisper_type": arg_dict.pop("whisper_type"),
        "dia_model": arg_dict.pop("diarization_directory"),
        "use_auth_token": arg_dict.pop("hf_token"),
        "verbose": arg_dict.pop("verbose_output"),
    }
    if arg_dict["whisper_model_directory"]:
        class_kwargs["download_root"] = arg_dict.pop("whisper_model_directory")
    model = Scraibe(**class_kwargs)
    if arg_dict["audio_files"]:
        audio_files = arg_dict.pop("audio_files")
-        if task == "autotranscribe" or task == "autotranscribe+translate":
+        if task == "transcribe":
            for audio in audio_files:
-                if task == "autotranscribe+translate":
+                out = model.transcribe(
                    task = "translate"
                else:
                    task = "transcribe"
                out = model.autotranscribe(
                    audio,
                        task=task, 
                    language=arg_dict.pop("language"),
                    verbose=arg_dict.pop("verbose_output"),
-                        num_speakers=arg_dict.pop("num_speakers")
+                    num_speakers=arg_dict.pop("num_speakers"),
                )
                basename = audio.split("/")[-1].split(".")[0]
                print(f'Saving {basename}.{out_format} to {out_folder}')
                out.save(os.path.join(
                    out_folder, f"{basename}.{out_format}"))
        elif task == "diarization":
            for audio in audio_files:
                if arg_dict.pop("verbose_output"):
                    print("Verbose not implemented for diarization.")
                out = model.diarization(audio)
                basename = audio.split("/")[-1].split(".")[0]
                path = os.path.join(out_folder, f"{basename}.{out_format}")
-
+                print(f"Saving {basename}.{out_format} to {out_folder}")
-                print(f'Saving {basename}.{out_format} to {out_folder}')
+                with open(path, "w", encoding="utf-8") as f:
                with open(path, "w") as f:
                    json.dump(json.dumps(out, indent=1), f)
        elif task == "transcribe" or task == "translate":
            for audio in audio_files:
                out = model.transcribe(audio, task=task,
                                        language=arg_dict.pop("language"),
                                        verbose=arg_dict.pop("verbose_output"))
                basename = audio.split("/")[-1].split(".")[0]
                path = os.path.join(out_folder, f"{basename}.{out_format}")
                with open(path, "w") as f:
                    f.write(out)
        elif task == "transcript_and_summarize":
            for audio in audio_files:
                result = model.transcript_and_summarize(
                    audio,
                    summarizer_api_url=arg_dict.pop("summarizer_api_url"),
                    summarizer_api_key=arg_dict.pop("summarizer_api_key"),
                    summarizer_model=arg_dict.pop("summarizer_model"),
                    language=arg_dict.pop("language"),
                    verbose=arg_dict.pop("verbose_output"),
                    num_speakers=arg_dict.pop("num_speakers"),
                )
                transcript_text = result.get("transcript", "")
                summary_text = result.get("summary", "")
                basename = audio.split("/")[-1].split(".")[0]
                # Always use .md for transcript_and_summarize
                md_path = os.path.join(out_folder, f"{basename}.md")
                print(f"Saving {basename}.md (transcript + summary) to {out_folder}")
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write("# Transcript\n\n")
                    f.write(transcript_text)
                    f.write("\n\n# Summary\n\n")
                    f.write(summary_text)
 if __name__ == "__main__":
    cli()
@@ -0,0 +1,237 @@
 """
 LocalAI Client Module
 ---------------------
 This module provides a client for communicating with a LocalAI server
 running vibevoice.cpp for transcription and speaker diarization.
 It replaces the previous local Whisper + Pyannote pipeline by sending
 audio files to the /v1/audio/diarization endpoint and mapping the
 response into the same Transcript format used by the UI.
 Environment Variables:
    LOCALAI_API_URL: (required) Base URL of the LocalAI server
                     (e.g., http://localhost:8080)
    LOCALAI_API_KEY: (optional) API key, if configured
    LOCALAI_MODEL:   (optional) Model name to use (default: vibevoice-diarize)
 """
 import os
 import io
 import json
 from typing import Dict, List, Any, Optional
 import httpx
 class LocalAIError(Exception):
    """Raised when the LocalAI API returns an error or unexpected response."""
    pass
 class LocalAIClient:
    """
    Thin HTTP client for LocalAI /v1/audio/diarization with vibevoice.cpp.
    Responsibilities:
      - Read configuration from environment.
      - Upload audio file as multipart/form-data.
      - Parse diarization + transcription response.
      - Map response into the same structure expected by Scraibe's Transcript.
    """
    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 600.0,
    ):
        """
        Args:
            api_url: LocalAI server URL (e.g., http://localhost:8080).
                     Falls back to LOCALAI_API_URL env var.
            api_key: API key, if required. Falls back to LOCALAI_API_KEY.
            model:   Model name (e.g., vibevoice-diarize).
                     Falls back to LOCALAI_MODEL or default.
            timeout: Request timeout in seconds.
        """
        self.api_url = (api_url or os.getenv("LOCALAI_API_URL")).strip().rstrip("/")
        self.api_key = api_key or os.getenv("LOCALAI_API_KEY") or None
        self.model = model or os.getenv("LOCALAI_MODEL") or "vibevoice-diarize"
        self.timeout = timeout
        if not self.api_url:
            raise LocalAIError(
                "LOCALAI_API_URL is not set. "
                "Provide the LocalAI server URL via environment or constructor."
            )
        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )
    def close(self):
        """Close the underlying HTTP client."""
        self._client.close()
    def __del__(self):
        try:
            self._client.close()
        except Exception:
            pass
    def diarize_and_transcribe(
        self,
        audio_path: str,
        *,
        language: Optional[str] = None,
        num_speakers: Optional[int] = None,
        min_speakers: Optional[int] = None,
        max_speakers: Optional[int] = None,
        clustering_threshold: Optional[float] = None,
        min_duration_on: Optional[float] = None,
        min_duration_off: Optional[float] = None,
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
        Send audio to LocalAI /v1/audio/diarization and return a dict
        in the same style as the previous internal diarization output:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
          "transcripts": [ "text for segment", ... ]
        }
        Extra kwargs that the old UI used (e.g., whisper-specific) are
        accepted but ignored.
        Args:
            audio_path: Path to the audio file.
            language: Language hint, forwarded if set.
            num_speakers: Optional exact speaker count.
            min_speakers: Optional hint.
            max_speakers: Optional hint.
            clustering_threshold: Optional clustering threshold.
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
                             Defaults to "verbose_json" if not set.
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")
        # Defaults: use verbose_json + include_text to get both diarization and transcription.
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
            include_text = True
        # Prepare form data
        data = {
            "model": self.model,
            "response_format": response_format,
            "include_text": str(include_text).lower(),
        }
        if language is not None:
            data["language"] = language
        if num_speakers is not None:
            data["num_speakers"] = str(num_speakers)
        if min_speakers is not None:
            data["min_speakers"] = str(min_speakers)
        if max_speakers is not None:
            data["max_speakers"] = str(max_speakers)
        if clustering_threshold is not None:
            data["clustering_threshold"] = str(clustering_threshold)
        if min_duration_on is not None:
            data["min_duration_on"] = str(min_duration_on)
        if min_duration_off is not None:
            data["min_duration_off"] = str(min_duration_off)
        # Open file
        if not os.path.exists(audio_path):
            raise LocalAIError(f"Audio file not found: {audio_path}")
        with open(audio_path, "rb") as f:
            files = {
                "file": (os.path.basename(audio_path), f, "application/octet-stream")
            }
            headers = {}
            if self.api_key:
                headers["Authorization"] = f"Bearer {self.api_key}"
            # POST /v1/audio/diarization
            resp = self._client.post(
                "/v1/audio/diarization",
                data=data,
                files=files,
                headers=headers,
            )
        if resp.status_code >= 400:
            body = resp.text
            raise LocalAIError(
                f"LocalAI request failed with status {resp.status_code}: {body}"
            )
        try:
            result = resp.json()
        except json.JSONDecodeError:
            raise LocalAIError(
                "Failed to parse LocalAI response as JSON."
            )
        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")
        return self._parse_diarization_response(result)
    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert LocalAI response into the internal format used by Scraibe:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
          "transcripts": [ "text for segment", ... ]
        }
        """
        segments = result.get("segments", [])
        if not segments:
            # If no segments, return empty but valid structure
            return {
                "segments": [],
                "speakers": [],
                "transcripts": [],
            }
        out_segments = []
        out_speakers = []
        out_transcripts = []
        for seg in segments:
            start = float(seg.get("start", 0.0))
            end = float(seg.get("end", 0.0))
            speaker = seg.get("speaker", "SPEAKER_00")
            text = seg.get("text", "").strip()
            out_segments.append([start, end])
            out_speakers.append(speaker)
            out_transcripts.append(text)
        return {
            "segments": out_segments,
            "speakers": out_speakers,
            "transcripts": out_transcripts,
        }
@@ -1,77 +1,52 @@
 import os
 import yaml
 from argparse import Action
 from ast import literal_eval
 from torch.cuda import is_available
 from torch import get_num_threads, set_num_threads
 CACHE_DIR = os.getenv(
    "AUTOT_CACHE",
    os.path.expanduser("~/.cache/torch/models"),
 )
 os.environ["PYANNOTE_CACHE"] = os.getenv(
    "PYANNOTE_CACHE",
    os.path.join(CACHE_DIR, "pyannote"),
 )
 # Legacy paths kept for backward compatibility (ignored by LocalAI client)
 WHISPER_DEFAULT_PATH = os.path.join(CACHE_DIR, "whisper")
 PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
-PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") \
+PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
    if os.path.exists(os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")) \
    else ('Jaikinator/ScrAIbe', 'pyannote/speaker-diarization-3.1')
 SCRAIBE_TORCH_DEVICE =  os.getenv("SCRAIBE_TORCH_DEVICE", "cuda" if is_available() else "cpu")
-SCRAIBE_NUM_THREADS = os.getenv("SCRAIBE_NUM_THREADS", min(8, get_num_threads()))
+def set_threads(parse_threads=None, yaml_threads=None):
 def config_diarization_yaml(file_path: str, path_to_segmentation: str = None) -> None:
    """Configure diarization pipeline from a YAML file.
    This function updates the YAML file to use the given segmentation model
    offline, and avoids manual file manipulation.
    Args:
        file_path (str): Path to the YAML file.
        path_to_segmentation (str, optional): Optional path to the segmentation model.
    Raises:
        FileNotFoundError: If the segmentation model file is not found.
    """
-    with open(file_path, "r") as stream:
+    Configure number of threads.
        yml = yaml.safe_load(stream)
-    segmentation_path = path_to_segmentation or os.path.join(
+    In LocalAI mode, this is mainly kept for backward compatibility.
-        PYANNOTE_DEFAULT_PATH, "pytorch_model.bin")
+    """
-    yml["pipeline"]["params"]["segmentation"] = segmentation_path
+    chosen = None
    if not os.path.exists(segmentation_path):
        raise FileNotFoundError(
            f"Segmentation model not found at {segmentation_path}")
    with open(file_path, "w") as stream:
        yaml.dump(yml, stream)
 def set_threads(parse_threads=None,
                yaml_threads=None):
    global SCRAIBE_NUM_THREADS
    if parse_threads is not None:
        if not isinstance(parse_threads, int):
-            # probably covered with int type of parser arg
+            raise ValueError(
-            raise ValueError(f"Type of --num-threads must be int, but the type is {type(parse_threads)}")
+                f"Type of --num-threads must be int, but the type is {type(parse_threads)}"
            )
        elif parse_threads < 1:
-            raise ValueError(f"Number of threads must be a positive integer, {parse_threads} was given")
+            raise ValueError(
                f"Number of threads must be a positive integer, {parse_threads} was given"
            )
        else:
-            set_num_threads(parse_threads)
+            chosen = parse_threads
            SCRAIBE_NUM_THREADS = parse_threads
    elif yaml_threads is not None:
        if not isinstance(yaml_threads, int):
-            raise ValueError(f"Type of num_threads must be int, but the type is {type(yaml_threads)}")
+            raise ValueError(
                f"Type of num_threads must be int, but the type is {type(yaml_threads)}"
            )
        elif yaml_threads < 1:
-            raise ValueError(f"Number of threads must be a positive integer, {yaml_threads} was given")
+            raise ValueError(
                f"Number of threads must be a positive integer, {yaml_threads} was given"
            )
        else:
-            set_num_threads(yaml_threads)
+            chosen = yaml_threads
-            SCRAIBE_NUM_THREADS = yaml_threads
+
    if chosen is not None:
        os.environ["OMP_NUM_THREADS"] = str(chosen)
        os.environ["MKL_NUM_THREADS"] = str(chosen)
 class ParseKwargs(Action):
    """
@@ -81,7 +56,7 @@ class ParseKwargs(Action):
    def __call__(self, parser, namespace, values, option_string=None):
        setattr(namespace, self.dest, dict())
        for value in values:
-            key, value = value.split('=')
+            key, value = value.split("=")
            try:
                value = literal_eval(value)
            except:
@@ -0,0 +1,212 @@
 """
 Summarizer Module
 -----------------
 Provides a client to summarize long transcripts via an LLM endpoint.
 Behavior:
 - Chunks transcript into 10,240-character segments.
 - Generates a summary for each chunk.
 - Combines all chunk summaries and produces a final, detailed summary.
 Environment Variables:
 - SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
 - SUMMARIZER_API_KEY: (optional) API key, if required
 - SUMMARIZER_MODEL:   (optional) Model name (e.g., llama-3.1-8b-instruct)
 """
 import os
 import json
 from typing import Optional
 import httpx
 class SummarizerError(Exception):
    """Raised when the summarization API call fails."""
    pass
 class SummarizerClient:
    """
    HTTP client for an OpenAI-compatible chat completions endpoint.
    Used to summarize long transcripts in chunks.
    """
    CHUNK_SIZE = 10_240  # characters per chunk
    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 600.0,
    ):
        self.api_url = (api_url or os.getenv("SUMMARIZER_API_URL")).strip().rstrip("/")
        self.api_key = api_key or os.getenv("SUMMARIZER_API_KEY") or None
        self.model = model or os.getenv("SUMMARIZER_MODEL") or "llama-3.1-8b-instruct"
        self.timeout = timeout
        if not self.api_url:
            raise SummarizerError(
                "SUMMARIZER_API_URL is not set. "
                "Provide the summarization LLM URL via environment or constructor."
            )
        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )
    def close(self):
        self._client.close()
    def __del__(self):
        try:
            self._client.close()
        except Exception:
            pass
    def summarize_transcript(self, transcript: str) -> str:
        """
        Summarize a (possibly very long) transcript.
        Strategy:
        - Split transcript into chunks of CHUNK_SIZE characters.
        - Generate a detailed summary for each chunk.
        - Combine all chunk summaries and generate a final, concise but thorough summary.
        The final summary should make it clear:
        - What was discussed
        - Main issues
        - Outcomes / decisions
        - Next steps / action items
        """
        if not transcript.strip():
            return "No transcript provided to summarize."
        # 1) Chunk the transcript
        chunks = self._chunk_text(transcript)
        # 2) Summarize each chunk
        chunk_summaries = []
        for i, chunk in enumerate(chunks):
            summary = self._summarize_chunk(chunk, i, len(chunks))
            chunk_summaries.append(summary)
        # 3) Combine and summarize summaries
        combined = "\n\n".join(chunk_summaries)
        final_summary = self._summarize_combined(combined)
        return final_summary
    def _chunk_text(self, text: str) -> list[str]:
        """Split text into chunks of CHUNK_SIZE characters."""
        chunks = []
        start = 0
        while start < len(text):
            end = start + self.CHUNK_SIZE
            if end >= len(text):
                chunks.append(text[start:])
                break
            # Try to break at a reasonable boundary (newline or space)
            break_pos = text.rfind("\n", start, end)
            if break_pos == -1:
                break_pos = text.rfind(" ", start, end)
            if break_pos == -1 or break_pos <= start:
                break_pos = end
            chunks.append(text[start:break_pos].strip())
            start = break_pos
        return chunks
    def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
        system_prompt = (
            "You are an expert legal and business meeting summarizer. "
            "You will receive a segment of a longer transcript. "
            "Provide a detailed, structured summary of this segment, focusing on: "
            "- Topics discussed\n"
            "- Key points and arguments\n"
            "- Decisions and agreements\n"
            "- Action items and responsibilities\n"
            "- Any risks, conflicts, or open issues\n\n"
            "Be concise but complete. Use bullet points when helpful. "
            "Do not add information that is not present in the transcript."
        )
        user_prompt = (
            f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
            f"{chunk}"
        )
        return self._chat_completion(system_prompt, user_prompt)
    def _summarize_combined(self, combined_summaries: str) -> str:
        system_prompt = (
            "You are an expert legal and business meeting summarizer. "
            "You will receive several intermediate summaries of a longer conversation. "
            "Produce a single, comprehensive summary that makes it clear: "
            "- The overall purpose and context of the discussion\n"
            "- The main issues and topics addressed\n"
            "- Key arguments and positions (briefly)\n"
            "- Decisions and outcomes\n"
            "- Action items, responsibilities, and next steps\n"
            "- Any unresolved issues or risks\n\n"
            "The summary should be detailed enough that a reader who was not present "
            "can understand what happened and what is expected going forward. "
            "Use clear, concise language and bullet points where appropriate."
        )
        user_prompt = (
            "Here are the intermediate summaries from different parts of the same conversation:\n\n"
            f"{combined_summaries}"
        )
        return self._chat_completion(system_prompt, user_prompt)
    def _chat_completion(self, system_prompt: str, user_prompt: str) -> str:
        """
        Call OpenAI-compatible /v1/chat/completions endpoint.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.3,
        }
        headers = {
            "Content-Type": "application/json",
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        resp = self._client.post(
            "/v1/chat/completions",
            json=payload,
            headers=headers,
        )
        if resp.status_code >= 400:
            raise SummarizerError(
                f"Summarizer API error {resp.status_code}: {resp.text}"
            )
        try:
            data = resp.json()
        except json.JSONDecodeError:
            raise SummarizerError(
                "Failed to parse summarizer response as JSON."
            )
        # Extract assistant message
        try:
            content = data["choices"][0]["message"]["content"]
            return content.strip()
        except (KeyError, IndexError, TypeError):
            raise SummarizerError(
                "Unexpected summarizer response format: "
                f"{json.dumps(data, indent=2)}"
            )