Use verbose_json diarization, add JSON+TXT email feature

2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
@@ -17,7 +17,7 @@ but ignored when not relevant.

 import os
 import logging
-from typing import Union, Optional
+from typing import Union, Optional, Dict, Any

 from .localai_client import LocalAIClient, LocalAIError
 from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:

    def transcribe(
        self,
-        audio_file: Union[str],
+        audio_file: str,
+        *,
+        for_export: bool = False,
        **kwargs,
-    ) -> str:
+    ) -> Union[str, Dict[str, Any]]:
        """
        Transcribe the provided audio file using LocalAI.

-        Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
-        all segment texts.
-
-        Args:
-            audio_file (str): Path to the audio file.
-            **kwargs: Additional keyword arguments (some forwarded, others ignored).
-
+        Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
        Returns:
-            str: The concatenated transcribed text.
+          - If for_export=False: plain transcript text (str).
+          - If for_export=True: dict with:
+              - transcript: speaker-labeled text, one "[timestamp] speaker: text" entry per segment, separated by blank lines
+              - segments: list[segment] with speaker labels
+              - raw_result: full verbose_json from LocalAI (None if absent)
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
+                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
            logger.error("Error during LocalAI transcription: %s", e)
            raise LocalAIError(f"Error during LocalAI transcription: {e}")

+        segments = result.get("segments", [])
+        speakers = result.get("speakers", [])
        transcripts = result.get("transcripts", [])
-        text = " ".join(t.strip() for t in transcripts if t.strip())
-        logger.info("transcribe completed, length=%d chars", len(text))
-        return text
+
+        # Build simple transcript text
+        if for_export:
+            # Include speaker-labeled transcript
+            lines = []
+            for seg, speaker, text in zip(segments, speakers, transcripts):
+                start, end = seg
+                ts = self._format_timestamp(start)
+                line = f"[{ts}] {speaker}: {text.strip()}"
+                lines.append(line)
+            full_text = "\n\n".join(lines)
+        else:
+            # Legacy: space-joined text
+            full_text = " ".join(t.strip() for t in transcripts if t.strip())
+
+        logger.info("transcribe completed, length=%d chars", len(full_text))
+
+        if for_export:
+            # Return richer structure for JSON export
+            raw_result = result.get("raw_result")
+            return {
+                "transcript": full_text,
+                "segments": [
+                    {
+                        "id": i,
+                        "speaker": sp,
+                        "start": seg[0],
+                        "end": seg[1],
+                        "text": txt,
+                    }
+                    for i, (seg, sp, txt) in enumerate(
+                        zip(segments, speakers, transcripts)
+                    )
+                ],
+                "raw_result": raw_result if raw_result is not None else None,
+            }
+
+        return full_text

    def transcript_and_summarize(
        self,
-        audio_file: Union[str],
+        audio_file: str,
        *,
        summarizer_api_url: Optional[str] = None,
        summarizer_api_key: Optional[str] = None,
        summarizer_model: Optional[str] = None,
+        for_export: bool = False,
        **kwargs,
    ) -> dict:
        """
        Transcribe the audio file and generate a detailed summary.

        Steps:
-        - Transcribe via LocalAI.
+        - Transcribe via LocalAI (verbose_json).
        - Build a plain-text transcript (with speaker labels).
        - Summarize the transcript using the configured LLM.

@@ -184,6 +223,8 @@ class Scraibe:
            dict with:
              - transcript: full transcript text (with speaker labels)
              - summary: final detailed summary (markdown-ready)
+              - segments: (if for_export) list[segment] with speaker labels
+              - raw_result: (if for_export) full verbose_json from LocalAI
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
+                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:

        logger.info("transcript_and_summarize completed.")

-        return {
+        out = {
            "transcript": full_transcript,
            "summary": summary,
        }

+        if for_export:
+            # Add segments and raw_result for JSON export
+            raw_result = result.get("raw_result")
+            out["segments"] = [
+                {
+                    "id": i,
+                    "speaker": sp,
+                    "start": seg[0],
+                    "end": seg[1],
+                    "text": txt,
+                }
+                for i, (seg, sp, txt) in enumerate(
+                    zip(segments, speakers, transcripts)
+                )
+            ]
+            out["raw_result"] = raw_result if raw_result is not None else None
+
+        return out
+
    # -----------------
    # Helpers
    # -----------------