Use verbose_json diarization, add JSON+TXT email feature

2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
@@ -17,7 +17,7 @@ but ignored when not relevant.

 import os
 import logging
-from typing import Union, Optional
+from typing import Union, Optional, Dict, Any

 from .localai_client import LocalAIClient, LocalAIError
 from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:

    def transcribe(
        self,
-        audio_file: Union[str],
+        audio_file: str,
+        *,
+        for_export: bool = False,
        **kwargs,
-    ) -> str:
+    ) -> Union[str, Dict[str, Any]]:
        """
        Transcribe the provided audio file using LocalAI.

-        Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
-        all segment texts.
-
-        Args:
-            audio_file (str): Path to the audio file.
-            **kwargs: Additional keyword arguments (some forwarded, others ignored).
-
+        Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
        Returns:
-            str: The concatenated transcribed text.
+          - If for_export=False: plain transcript text (str).
+          - If for_export=True: dict with:
+              - transcript: plain text
+              - segments: list[segment] with speaker labels
+              - raw_result: full verbose_json from LocalAI (if present)
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
+                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
            logger.error("Error during LocalAI transcription: %s", e)
            raise LocalAIError(f"Error during LocalAI transcription: {e}")

+        segments = result.get("segments", [])
+        speakers = result.get("speakers", [])
        transcripts = result.get("transcripts", [])
-        text = " ".join(t.strip() for t in transcripts if t.strip())
-        logger.info("transcribe completed, length=%d chars", len(text))
-        return text
+
+        # Build simple transcript text
+        if for_export:
+            # Include speaker-labeled transcript
+            lines = []
+            for seg, speaker, text in zip(segments, speakers, transcripts):
+                start, end = seg
+                ts = self._format_timestamp(start)
+                line = f"[{ts}] {speaker}: {text.strip()}"
+                lines.append(line)
+            full_text = "\n\n".join(lines)
+        else:
+            # Legacy: space-joined text
+            full_text = " ".join(t.strip() for t in transcripts if t.strip())
+
+        logger.info("transcribe completed, length=%d chars", len(full_text))
+
+        if for_export:
+            # Return richer structure for JSON export
+            raw_result = result.get("raw_result")
+            return {
+                "transcript": full_text,
+                "segments": [
+                    {
+                        "id": i,
+                        "speaker": sp,
+                        "start": seg[0],
+                        "end": seg[1],
+                        "text": txt,
+                    }
+                    for i, (seg, sp, txt) in enumerate(
+                        zip(segments, speakers, transcripts)
+                    )
+                ],
+                "raw_result": raw_result if raw_result is not None else None,
+            }
+
+        return full_text

    def transcript_and_summarize(
        self,
-        audio_file: Union[str],
+        audio_file: str,
        *,
        summarizer_api_url: Optional[str] = None,
        summarizer_api_key: Optional[str] = None,
        summarizer_model: Optional[str] = None,
+        for_export: bool = False,
        **kwargs,
    ) -> dict:
        """
        Transcribe the audio file and generate a detailed summary.

        Steps:
-        - Transcribe via LocalAI.
+        - Transcribe via LocalAI (verbose_json).
        - Build a plain-text transcript (with speaker labels).
        - Summarize the transcript using the configured LLM.

@@ -184,6 +223,8 @@ class Scraibe:
            dict with:
              - transcript: full transcript text (with speaker labels)
              - summary: final detailed summary (markdown-ready)
+              - segments: (if for_export) list[segment] with speaker labels
+              - raw_result: (if for_export) full verbose_json from LocalAI
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
+                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:

        logger.info("transcript_and_summarize completed.")

-        return {
+        out = {
            "transcript": full_transcript,
            "summary": summary,
        }

+        if for_export:
+            # Add segments and raw_result for JSON export
+            raw_result = result.get("raw_result")
+            out["segments"] = [
+                {
+                    "id": i,
+                    "speaker": sp,
+                    "start": seg[0],
+                    "end": seg[1],
+                    "text": txt,
+                }
+                for i, (seg, sp, txt) in enumerate(
+                    zip(segments, speakers, transcripts)
+                )
+            ]
+            out["raw_result"] = raw_result if raw_result is not None else None
+
+        return out
+
    # -----------------
    # Helpers
    # -----------------
@@ -0,0 +1,147 @@
+"""
+Email sender module for ScrAIbe.
+
+Sends transcription outputs (TXT, JSON, etc.) via SMTP.
+All credentials are configured via environment variables.
+"""
+
+import os
+import smtplib
+import logging
+from email import encoders
+from email.mime.base import MIMEBase
+from email.mime.multipart import MIMEMultipart
+from email.mime.text import MIMEText
+from typing import List, Optional
+
+logger = logging.getLogger("scraibe.email_sender")
+
+
+class EmailError(Exception):
+    """Raised when email configuration is incomplete or sending fails."""
+
+
+def get_email_config():
+    """
+    Read email configuration from environment variables.
+
+    Returns:
+        dict with smtp_host, smtp_port (int), smtp_user, smtp_password,
+        from_address, and use_tls (True unless EMAIL_SMTP_USE_TLS is
+        "false", "0" or "no", case-insensitively).
+
+    Raises:
+        EmailError: If a required variable is unset/empty or
+            EMAIL_SMTP_PORT is not an integer.
+    """
+    smtp_host = os.getenv("EMAIL_SMTP_HOST")
+    smtp_port = os.getenv("EMAIL_SMTP_PORT")
+    smtp_user = os.getenv("EMAIL_SMTP_USER")
+    smtp_password = os.getenv("EMAIL_SMTP_PASSWORD")
+    from_address = os.getenv("EMAIL_FROM_ADDRESS")
+    use_tls_str = os.getenv("EMAIL_SMTP_USE_TLS", "true").strip().lower()
+    use_tls = use_tls_str not in ("false", "0", "no")
+
+    if not all([smtp_host, smtp_port, smtp_user, smtp_password, from_address]):
+        raise EmailError(
+            "Email configuration incomplete. "
+            "Ensure EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER, "
+            "EMAIL_SMTP_PASSWORD, and EMAIL_FROM_ADDRESS are set."
+        )
+
+    try:
+        port = int(smtp_port)
+    except ValueError as exc:
+        # Callers only handle EmailError for configuration problems, so a
+        # malformed port must not escape as a bare ValueError.
+        raise EmailError(
+            f"EMAIL_SMTP_PORT must be an integer, got {smtp_port!r}."
+        ) from exc
+
+    return {
+        "smtp_host": smtp_host,
+        "smtp_port": port,
+        "smtp_user": smtp_user,
+        "smtp_password": smtp_password,
+        "from_address": from_address,
+        "use_tls": use_tls,
+    }
+
+
+def send_email(
+    to: str,
+    subject: str,
+    body: str,
+    attachments: Optional[List[str]] = None,
+    cc: Optional[str] = None,
+) -> bool:
+    """
+    Send a plain-text email with optional file attachments over SMTP.
+
+    Args:
+        to: Comma-separated list of recipient email addresses.
+        subject: Email subject.
+        body: Email body (plain text).
+        attachments: List of file paths to attach (optional). Paths that are
+            not files, or that cannot be read, are skipped with a warning
+            instead of aborting the send.
+        cc: Comma-separated list of CC email addresses (optional).
+
+    Returns:
+        True if sent successfully.
+
+    Raises:
+        EmailError: If the configuration cannot be loaded, no usable 'To'
+            address is given, or connecting/authenticating/sending fails.
+    """
+    try:
+        cfg = get_email_config()
+    except EmailError as e:
+        logger.error("Email configuration error: %s", e)
+        raise
+
+    # Parse recipients
+    to_list = [addr.strip() for addr in to.split(",") if addr.strip()]
+    cc_list = [addr.strip() for addr in cc.split(",") if addr.strip()] if cc else []
+
+    if not to_list:
+        raise EmailError("No valid 'To' email addresses provided.")
+
+    # Build message
+    msg = MIMEMultipart()
+    msg["From"] = cfg["from_address"]
+    msg["To"] = ", ".join(to_list)
+    if cc_list:
+        msg["Cc"] = ", ".join(cc_list)
+    msg["Subject"] = subject
+
+    msg.attach(MIMEText(body, "plain"))
+
+    # Attach files (best effort: a bad attachment never blocks the email)
+    for file_path in attachments or []:
+        if not os.path.isfile(file_path):
+            logger.warning("Attachment file not found, skipping: %s", file_path)
+            continue
+
+        try:
+            with open(file_path, "rb") as f:
+                payload = f.read()
+        except OSError as e:
+            logger.warning("Failed to attach file %s: %s", file_path, e)
+            continue
+
+        part = MIMEBase("application", "octet-stream")
+        part.set_payload(payload)
+        encoders.encode_base64(part)
+        part.add_header(
+            "Content-Disposition",
+            "attachment",
+            filename=os.path.basename(file_path),
+        )
+        msg.attach(part)
+
+    # Connect and send
+    try:
+        # The context manager issues QUIT and closes the socket on exit, so
+        # the connection is released even when starttls/login/sendmail raises.
+        with smtplib.SMTP(cfg["smtp_host"], cfg["smtp_port"], timeout=30) as server:
+            server.ehlo()
+            if cfg["use_tls"]:
+                server.starttls()
+                # Re-identify after the TLS upgrade, as before.
+                server.ehlo()
+
+            server.login(cfg["smtp_user"], cfg["smtp_password"])
+            server.sendmail(
+                cfg["from_address"],
+                to_list + cc_list,
+                msg.as_string(),
+            )
+    except Exception as e:
+        logger.error("Failed to send email: %s", e)
+        raise EmailError(f"Failed to send email: {e}") from e
+
+    logger.info(
+        "Email sent to %s (CC: %s)",
+        to_list,
+        cc_list or "None",
+    )
+    return True
@@ -39,7 +39,7 @@ class LocalAIClient:
    Responsibilities:
      - Read configuration from environment.
      - Upload audio file as multipart/form-data.
-      - Parse diarization + transcription response.
+      - Parse diarization + transcription response (verbose_json).
      - Map response into the same structure expected by Scraibe's Transcript.
    """

@@ -106,20 +106,13 @@ class LocalAIClient:
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
+        return_raw: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
-        Send audio to LocalAI /v1/audio/diarization and return a dict
-        in the same style as the previous internal diarization output:
-
-        {
-          "segments": [ [start, end], ... ],
-          "speakers": [ "SPEAKER_00", ... ],
-          "transcripts": [ "text for segment", ... ]
-        }
-
-        Extra kwargs that the old UI used (e.g., whisper-specific) are
-        accepted but ignored.
+        Send audio to LocalAI /v1/audio/diarization and return:
+        - A normalized dict with segments, speakers, transcripts.
+        - Optionally, the raw verbose_json response (for JSON export).

        Args:
            audio_path: Path to the audio file.
@@ -131,16 +124,18 @@ class LocalAIClient:
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
-                             Defaults to "verbose_json" if not set.
+                             Defaults to "verbose_json".
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
+            return_raw: If True, also return the raw API response in 'raw_result'.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")

        logger.info("diarize_and_transcribe requested for: %s", audio_path)

+        # Always use verbose_json for diarization + speaker info
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
@@ -202,7 +197,7 @@ class LocalAIClient:
            )

        try:
-            result = resp.json()
+            raw_result = resp.json()
        except json.JSONDecodeError:
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
@@ -212,11 +207,16 @@ class LocalAIClient:
        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")

-        return self._parse_diarization_response(result)
+        parsed = self._parse_diarization_response(raw_result)
+
+        if return_raw:
+            parsed["raw_result"] = raw_result
+
+        return parsed

    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
-        Convert LocalAI response into the internal format used by Scraibe:
+        Convert LocalAI verbose_json response into the internal format used by Scraibe:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
@@ -7,13 +7,16 @@ Runs the Web GUI that:
 - Sends audio to LocalAI for transcription + diarization
 - Optionally sends transcript to a second LLM for summarization
 - Returns transcript (and summary) in the browser
+- Optionally emails transcript files (TXT + JSON)

 This is the default entrypoint when running in Docker.
 """

 import os
+import json
 import logging
 import tempfile
+from datetime import datetime

 import gradio as gr

@@ -70,10 +73,23 @@ def create_app():
        )

    # Helper: run transcription via LocalAI API
-    def run_transcribe(audio_path, task, language, num_speakers):
+    def run_transcribe(
+        audio_path,
+        task,
+        language,
+        num_speakers,
+        send_email_flag,
+        email_to,
+        email_cc,
+        email_subject,
+    ):
        if not audio_path:
            raise ValueError("No audio file provided.")

+        email_status = ""
+        attachments = []
+
+        # Ensure we use rich export mode (for JSON with diarization)
        try:
            if task == "transcript_and_summarize":
                result = scraibe.transcript_and_summarize(
@@ -81,11 +97,14 @@ def create_app():
                    language=language or None,
                    num_speakers=int(num_speakers) if num_speakers else None,
                    verbose=True,
+                    for_export=True,
                )
                transcript_text = result.get("transcript", "")
                summary_text = result.get("summary", "")
+                segments = result.get("segments", [])
+                raw_result = result.get("raw_result")

-                # Save as .md
+                # Save as .md (transcript + summary)
                md_path = tempfile.mktemp(suffix=".md")
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write("# Transcript\n\n")
@@ -93,32 +112,74 @@ def create_app():
                    f.write("\n\n# Summary\n\n")
                    f.write(summary_text)

-                return (
-                    transcript_text,
-                    summary_text,
-                    md_path,
-                    "Transcription and summarization completed.",
-                )
+                # Save as .txt (plain transcript)
+                txt_path = tempfile.mktemp(suffix=".txt")
+                with open(txt_path, "w", encoding="utf-8") as f:
+                    f.write(transcript_text)
+
+                # Save as .json (diarization + transcript + summary)
+                json_data = {
+                    "task": "transcript_and_summarize",
+                    "transcript": transcript_text,
+                    "summary": summary_text,
+                    "segments": segments,
+                    "metadata": {
+                        "timestamp": datetime.utcnow().isoformat()
+                    },
+                }
+                if raw_result is not None:
+                    json_data["raw_result"] = raw_result
+
+                json_path = tempfile.mktemp(suffix=".json")
+                with open(json_path, "w", encoding="utf-8") as f:
+                    json.dump(json_data, f, indent=2, ensure_ascii=False)
+
+                # Prepare attachments for email
+                if send_email_flag:
+                    attachments = [txt_path, json_path]
+
+                status_msg = "Transcription and summarization completed."
+
            else:
-                # Default: transcribe only
-                text = scraibe.transcribe(
+                # transcribe only (with diarization)
+                result = scraibe.transcribe(
                    audio_file=audio_path,
                    language=language or None,
                    num_speakers=int(num_speakers) if num_speakers else None,
                    verbose=True,
+                    for_export=True,
                )
+                transcript_text = result.get("transcript", "")
+                segments = result.get("segments", [])
+                raw_result = result.get("raw_result")

-                # Save as .txt
+                # Save as .txt (plain transcript)
                txt_path = tempfile.mktemp(suffix=".txt")
                with open(txt_path, "w", encoding="utf-8") as f:
-                    f.write(text)
+                    f.write(transcript_text)
+
+                # Save as .json (diarization + transcript)
+                json_data = {
+                    "task": "transcribe",
+                    "transcript": transcript_text,
+                    "segments": segments,
+                    "metadata": {
+                        "timestamp": datetime.utcnow().isoformat()
+                    },
+                }
+                if raw_result is not None:
+                    json_data["raw_result"] = raw_result
+
+                json_path = tempfile.mktemp(suffix=".json")
+                with open(json_path, "w", encoding="utf-8") as f:
+                    json.dump(json_data, f, indent=2, ensure_ascii=False)
+
+                # Prepare attachments for email
+                if send_email_flag:
+                    attachments = [txt_path, json_path]
+
+                status_msg = "Transcription completed."

-                return (
-                    text,
-                    "",
-                    txt_path,
-                    "Transcription completed.",
-                )
        except Exception as e:
            logger.error("Error during transcription: %s", e)
            return (
@@ -126,6 +187,54 @@ def create_app():
                "",
                None,
                f"Error: {e}",
+                "",
+            )
+
+        # Handle email after successful transcription
+        if send_email_flag and attachments:
+            try:
+                from .email_sender import send_email, EmailError
+            except ImportError:
+                email_status = "Email feature unavailable (email_sender not found)."
+            else:
+                to = (email_to or "").strip()
+                cc = (email_cc or "").strip()
+                subject = (email_subject or "").strip()
+
+                if not to:
+                    email_status = "Email not sent: 'To' address is empty."
+                else:
+                    if not subject:
+                        subject = f"ScrAIbe Transcript - {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}"
+
+                    body = (
+                        "Please find the transcription files attached.\n"
+                        "This message was generated by ScrAIbe.\n"
+                    )
+
+                    try:
+                        send_email(
+                            to=to,
+                            cc=cc or None,
+                            subject=subject,
+                            body=body,
+                            attachments=attachments,
+                        )
+                        email_status = "Transcript files sent via email."
+                    except EmailError as e:
+                        email_status = f"Email failed: {e}"
+                    except Exception as e:
+                        email_status = f"Email failed: {e}"
+
+        # Use md_path for file_output in transcript_and_summarize, else txt_path
+        file_path = md_path if task == "transcript_and_summarize" else txt_path
+
+        return (
+            transcript_text,
+            summary_text if task == "transcript_and_summarize" else "",
+            file_path,
+            status_msg,
+            email_status,
        )

    # Load header/footer HTML if present
@@ -180,6 +289,31 @@ def create_app():
                        precision=0,
                    )

+                # Email options
+                send_email_checkbox = gr.Checkbox(
+                    label="Send transcript files via email"
+                )
+
+                with gr.Group(visible=False) as email_group:
+                    email_to = gr.Textbox(
+                        label="To (comma-separated)",
+                        placeholder="e.g. name@example.com",
+                    )
+                    email_cc = gr.Textbox(
+                        label="CC (optional, comma-separated)",
+                        placeholder="e.g. manager@example.com",
+                    )
+                    email_subject = gr.Textbox(
+                        label="Subject (optional)",
+                        placeholder="Default: ScrAIbe Transcript - <date>",
+                    )
+
+                send_email_checkbox.change(
+                    fn=lambda v: gr.update(visible=v),
+                    inputs=[send_email_checkbox],
+                    outputs=[email_group],
+                )
+
                transcribe_btn = gr.Button("Start", variant="primary")

            with gr.Column(scale=3):
@@ -201,6 +335,11 @@ def create_app():
                    label="Status",
                    interactive=False,
                )
+                email_status_text = gr.Textbox(
+                    label="Email status",
+                    interactive=False,
+                    visible=True,
+                )

        # Footer
        if footer_html:
@@ -218,20 +357,34 @@ def create_app():
            outputs=[summary_text],
        )

-        def on_transcribe(audio, task, language, num_speakers):
+        def on_transcribe(
+            audio,
+            task,
+            language,
+            num_speakers,
+            send_email_flag,
+            email_to_val,
+            email_cc_val,
+            email_subject_val,
+        ):
            if not audio:
                return (
                    "",
                    "",
                    None,
                    "Please upload or record audio.",
+                    "",
                )

-            transcript, summary, file_path, msg = run_transcribe(
+            transcript, summary, file_path, status_msg, email_status = run_transcribe(
                audio_path=audio,
                task=task,
                language=language,
                num_speakers=num_speakers,
+                send_email_flag=bool(send_email_flag),
+                email_to=email_to_val,
+                email_cc=email_cc_val,
+                email_subject=email_subject_val,
            )

            show_summary = bool(summary)
@@ -239,7 +392,8 @@ def create_app():
                transcript,
                summary,
                file_path if file_path else None,
-                msg,
+                status_msg,
+                email_status,
            )

        transcribe_btn.click(
@@ -249,12 +403,17 @@ def create_app():
                task_choice,
                language_input,
                num_speakers_input,
+                send_email_checkbox,
+                email_to,
+                email_cc,
+                email_subject,
            ],
            outputs=[
                output_text,
                summary_text,
                file_output,
                status_text,
+                email_status_text,
            ],
        )