Use verbose_json diarization, add JSON+TXT email feature

2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
@@ -39,7 +39,7 @@ class LocalAIClient:
    Responsibilities:
      - Read configuration from environment.
      - Upload audio file as multipart/form-data.
-      - Parse diarization + transcription response.
+      - Parse diarization + transcription response (verbose_json).
      - Map response into the same structure expected by Scraibe's Transcript.
    """

@@ -106,20 +106,13 @@ class LocalAIClient:
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
+        return_raw: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
-        Send audio to LocalAI /v1/audio/diarization and return a dict
-        in the same style as the previous internal diarization output:
-
-        {
-          "segments": [ [start, end], ... ],
-          "speakers": [ "SPEAKER_00", ... ],
-          "transcripts": [ "text for segment", ... ]
-        }
-
-        Extra kwargs that the old UI used (e.g., whisper-specific) are
-        accepted but ignored.
+        Send audio to LocalAI /v1/audio/diarization and return a single dict:
+        - Always: the normalized "segments", "speakers", and "transcripts" lists.
+        - If return_raw is True: the raw API response under "raw_result" (for JSON export).

        Args:
            audio_path: Path to the audio file.
@@ -131,16 +124,18 @@ class LocalAIClient:
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
-                             Defaults to "verbose_json" if not set.
+                             Defaults to "verbose_json".
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
+            return_raw: If True, also return the raw API response in 'raw_result'.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        logger.info("diarize_and_transcribe requested for: %s", audio_path)

+        # Default to verbose_json (diarization + speaker info) unless the caller sets a format
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
@@ -202,7 +197,7 @@ class LocalAIClient:
            )

        try:
-            result = resp.json()
+            raw_result = resp.json()
        except json.JSONDecodeError:
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
@@ -212,11 +207,16 @@ class LocalAIClient:
        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")

-        return self._parse_diarization_response(result)
+        parsed = self._parse_diarization_response(raw_result)
+
+        if return_raw:
+            parsed["raw_result"] = raw_result
+
+        return parsed

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
-        Convert LocalAI response into the internal format used by Scraibe:
+        Convert LocalAI verbose_json response into the internal format used by Scraibe:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],