Use verbose_json diarization, add JSON+TXT email feature

2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
@@ -17,7 +17,7 @@ but ignored when not relevant.
 import os
 import logging
-from typing import Union, Optional
+from typing import Union, Optional, Dict, Any
 from .localai_client import LocalAIClient, LocalAIError
 from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:
    def transcribe(
        self,
-        audio_file: Union[str],
+        audio_file: str,
        *,
        for_export: bool = False,
        **kwargs,
-    ) -> str:
+    ) -> Union[str, Dict[str, Any]]:
        """
        Transcribe the provided audio file using LocalAI.
-        Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
+        Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
        all segment texts.
        Args:
            audio_file (str): Path to the audio file.
            **kwargs: Additional keyword arguments (some forwarded, others ignored).
        Returns:
-            str: The concatenated transcribed text.
+          - If for_export=False: plain transcript text (str).
          - If for_export=True: dict with:
              - transcript: plain text
              - segments: list[segment] with speaker labels
              - raw_result: full verbose_json from LocalAI (if present)
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
            logger.error("Error during LocalAI transcription: %s", e)
            raise LocalAIError(f"Error during LocalAI transcription: {e}")
        segments = result.get("segments", [])
        speakers = result.get("speakers", [])
        transcripts = result.get("transcripts", [])
-        text = " ".join(t.strip() for t in transcripts if t.strip())
+
-        logger.info("transcribe completed, length=%d chars", len(text))
+        # Build simple transcript text
-        return text
+        if for_export:
            # Include speaker-labeled transcript
            lines = []
            for seg, speaker, text in zip(segments, speakers, transcripts):
                start, end = seg
                ts = self._format_timestamp(start)
                line = f"[{ts}] {speaker}: {text.strip()}"
                lines.append(line)
            full_text = "\n\n".join(lines)
        else:
            # Legacy: space-joined text
            full_text = " ".join(t.strip() for t in transcripts if t.strip())
        logger.info("transcribe completed, length=%d chars", len(full_text))
        if for_export:
            # Return richer structure for JSON export
            raw_result = result.get("raw_result")
            return {
                "transcript": full_text,
                "segments": [
                    {
                        "id": i,
                        "speaker": sp,
                        "start": seg[0],
                        "end": seg[1],
                        "text": txt,
                    }
                    for i, (seg, sp, txt) in enumerate(
                        zip(segments, speakers, transcripts)
                    )
                ],
                "raw_result": raw_result if raw_result is not None else None,
            }
        return full_text
    def transcript_and_summarize(
        self,
-        audio_file: Union[str],
+        audio_file: str,
        *,
        summarizer_api_url: Optional[str] = None,
        summarizer_api_key: Optional[str] = None,
        summarizer_model: Optional[str] = None,
        for_export: bool = False,
        **kwargs,
    ) -> dict:
        """
        Transcribe the audio file and generate a detailed summary.
        Steps:
-        - Transcribe via LocalAI.
+        - Transcribe via LocalAI (verbose_json).
        - Build a plain-text transcript (with speaker labels).
        - Summarize the transcript using the configured LLM.
@@ -184,6 +223,8 @@ class Scraibe:
            dict with:
              - transcript: full transcript text (with speaker labels)
              - summary: final detailed summary (markdown-ready)
              - segments: (if for_export) list[segment] with speaker labels
              - raw_result: (if for_export) full verbose_json from LocalAI
        """
        if isinstance(audio_file, str):
            if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
                audio_path=audio_file,
                include_text=True,
                verbose=verbose,
                return_raw=True,
                **kwargs,
            )
        except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:
        logger.info("transcript_and_summarize completed.")
-        return {
+        out = {
            "transcript": full_transcript,
            "summary": summary,
        }
        if for_export:
            # Add segments and raw_result for JSON export
            raw_result = result.get("raw_result")
            out["segments"] = [
                {
                    "id": i,
                    "speaker": sp,
                    "start": seg[0],
                    "end": seg[1],
                    "text": txt,
                }
                for i, (seg, sp, txt) in enumerate(
                    zip(segments, speakers, transcripts)
                )
            ]
            out["raw_result"] = raw_result if raw_result is not None else None
        return out
    # -----------------
    # Helpers
    # -----------------
@@ -0,0 +1,147 @@
 """
 Email sender module for ScrAIbe.
 Sends transcription outputs (TXT, JSON, etc.) via SMTP.
 All credentials are configured via environment variables.
 """
 import os
 import smtplib
 import logging
 from email import encoders
 from email.mime.base import MIMEBase
 from email.mime.multipart import MIMEMultipart
 from email.mime.text import MIMEText
 from typing import List, Optional
 logger = logging.getLogger("scraibe.email_sender")
 class EmailError(Exception):
    """Error raised when email configuration is incomplete or sending fails."""
 def get_email_config():
    """
    Read email (SMTP) configuration from environment variables.
    Required: EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER,
    EMAIL_SMTP_PASSWORD, EMAIL_FROM_ADDRESS.
    Optional: EMAIL_SMTP_USE_TLS (default "true"); TLS is disabled only
    when the value is "false", "0" or "no" (case-insensitive, surrounding
    whitespace ignored).
    Returns:
        dict with keys smtp_host, smtp_port (int), smtp_user,
        smtp_password, from_address and use_tls (bool).
    Raises:
        EmailError: if a required variable is missing or empty, or if
            EMAIL_SMTP_PORT is not a valid integer.
    """
    smtp_host = os.getenv("EMAIL_SMTP_HOST")
    smtp_port = os.getenv("EMAIL_SMTP_PORT")
    smtp_user = os.getenv("EMAIL_SMTP_USER")
    smtp_password = os.getenv("EMAIL_SMTP_PASSWORD")
    from_address = os.getenv("EMAIL_FROM_ADDRESS")
    use_tls_str = os.getenv("EMAIL_SMTP_USE_TLS", "true").strip().lower()
    # Anything other than an explicit "off" value keeps TLS enabled.
    use_tls = use_tls_str not in ("false", "0", "no")
    if not all([smtp_host, smtp_port, smtp_user, smtp_password, from_address]):
        raise EmailError(
            "Email configuration incomplete. "
            "Ensure EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER, "
            "EMAIL_SMTP_PASSWORD, and EMAIL_FROM_ADDRESS are set."
        )
    # A malformed port is a configuration problem like any other: report it
    # as EmailError so callers that catch EmailError also see this case
    # instead of an unrelated ValueError.
    try:
        port = int(smtp_port)
    except ValueError as err:
        raise EmailError(
            f"EMAIL_SMTP_PORT must be an integer, got {smtp_port!r}."
        ) from err
    return {
        "smtp_host": smtp_host,
        "smtp_port": port,
        "smtp_user": smtp_user,
        "smtp_password": smtp_password,
        "from_address": from_address,
        "use_tls": use_tls,
    }
 def send_email(
    to: str,
    subject: str,
    body: str,
    attachments: Optional[List[str]] = None,
    cc: Optional[str] = None,
 ) -> bool:
    """
    Send a plain-text email with optional file attachments via SMTP.
    SMTP settings are read through get_email_config().
    Args:
        to: Comma-separated list of recipient email addresses.
        subject: Email subject.
        body: Email body (plain text).
        attachments: List of file paths to attach. Paths that are not
            regular files, or that fail to be attached, are skipped with a
            warning and the email is still sent. None means no attachments.
        cc: Comma-separated list of CC email addresses (optional).
    Returns:
        True if sent successfully.
    Raises:
        EmailError: if the configuration is incomplete, no 'To' address
            remains after parsing, or connecting/logging in/sending fails.
    """
    try:
        cfg = get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise
    # Parse recipients (blank entries between commas are dropped)
    to_list = [addr.strip() for addr in to.split(",") if addr.strip()]
    cc_list = [addr.strip() for addr in cc.split(",") if addr.strip()] if cc else []
    if not to_list:
        raise EmailError("No valid 'To' email addresses provided.")
    # Build message
    msg = MIMEMultipart()
    msg["From"] = cfg["from_address"]
    msg["To"] = ", ".join(to_list)
    if cc_list:
        msg["Cc"] = ", ".join(cc_list)
    msg["Subject"] = subject
    msg.attach(MIMEText(body, "plain"))
    # Attach files (best effort: a bad attachment must not block the email)
    for file_path in attachments or []:
        if not os.path.isfile(file_path):
            logger.warning("Attachment file not found, skipping: %s", file_path)
            continue
        try:
            with open(file_path, "rb") as f:
                payload = f.read()
            part = MIMEBase("application", "octet-stream")
            part.set_payload(payload)
            encoders.encode_base64(part)
            part.add_header(
                "Content-Disposition",
                "attachment",
                filename=os.path.basename(file_path),
            )
            msg.attach(part)
        except Exception as e:
            logger.warning("Failed to attach file %s: %s", file_path, e)
    # Connect and send
    try:
        # The context manager issues QUIT and closes the socket on exit, so
        # the connection is released even when STARTTLS, login or sendmail
        # raises.
        with smtplib.SMTP(cfg["smtp_host"], cfg["smtp_port"], timeout=30) as server:
            server.ehlo()
            if cfg["use_tls"]:
                server.starttls()
                # Re-identify over the now-encrypted channel.
                server.ehlo()
            server.login(cfg["smtp_user"], cfg["smtp_password"])
            # Cc recipients are part of the envelope as well as the header.
            server.sendmail(
                cfg["from_address"],
                to_list + cc_list,
                msg.as_string(),
            )
    except Exception as e:
        logger.error("Failed to send email: %s", e)
        raise EmailError(f"Failed to send email: {e}") from e
    logger.info(
        "Email sent to %s (CC: %s)",
        to_list,
        cc_list or "None",
    )
    return True
@@ -39,7 +39,7 @@ class LocalAIClient:
    Responsibilities:
      - Read configuration from environment.
      - Upload audio file as multipart/form-data.
-      - Parse diarization + transcription response.
+      - Parse diarization + transcription response (verbose_json).
      - Map response into the same structure expected by Scraibe's Transcript.
    """
@@ -106,20 +106,13 @@ class LocalAIClient:
        response_format: Optional[str] = None,
        include_text: Optional[bool] = None,
        verbose: bool = False,
        return_raw: bool = False,
        **_ignored,
    ) -> Dict[str, Any]:
        """
-        Send audio to LocalAI /v1/audio/diarization and return a dict
+        Send audio to LocalAI /v1/audio/diarization and return:
-        in the same style as the previous internal diarization output:
+        - A normalized dict with segments, speakers, transcripts.
-
+        - Optionally, the raw verbose_json response (for JSON export).
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
          "transcripts": [ "text for segment", ... ]
        }
        Extra kwargs that the old UI used (e.g., whisper-specific) are
        accepted but ignored.
        Args:
            audio_path: Path to the audio file.
@@ -131,16 +124,18 @@ class LocalAIClient:
            min_duration_on: Optional min segment duration.
            min_duration_off: Optional min gap duration.
            response_format: "json", "verbose_json", or "rttm".
-                             Defaults to "verbose_json" if not set.
+                             Defaults to "verbose_json".
            include_text: Whether to request per-segment text.
                          Defaults to True.
            verbose: If True, prints progress messages.
            return_raw: If True, also return the raw API response in 'raw_result'.
        """
        if verbose:
            print("Starting diarization and transcription via LocalAI.")
        logger.info("diarize_and_transcribe requested for: %s", audio_path)
        # Always use verbose_json for diarization + speaker info
        if response_format is None:
            response_format = "verbose_json"
        if include_text is None:
@@ -202,7 +197,7 @@ class LocalAIClient:
            )
        try:
-            result = resp.json()
+            raw_result = resp.json()
        except json.JSONDecodeError:
            logger.error("Failed to parse LocalAI response as JSON.")
            raise LocalAIError(
@@ -212,11 +207,16 @@ class LocalAIClient:
        if verbose:
            print("Diarization and transcription finished. Starting post-processing.")
-        return self._parse_diarization_response(result)
+        parsed = self._parse_diarization_response(raw_result)
        if return_raw:
            parsed["raw_result"] = raw_result
        return parsed
    def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
        """
-        Convert LocalAI response into the internal format used by Scraibe:
+        Convert LocalAI verbose_json response into the internal format used by Scraibe:
        {
          "segments": [ [start, end], ... ],
          "speakers": [ "SPEAKER_00", ... ],
@@ -7,13 +7,16 @@ Runs the Web GUI that:
 - Sends audio to LocalAI for transcription + diarization
 - Optionally sends transcript to a second LLM for summarization
 - Returns transcript (and summary) in the browser
 - Optionally emails transcript files (TXT + JSON)
 This is the default entrypoint when running in Docker.
 """
 import os
 import json
 import logging
 import tempfile
 from datetime import datetime
 import gradio as gr
@@ -70,10 +73,23 @@ def create_app():
        )
    # Helper: run transcription via LocalAI API
-    def run_transcribe(audio_path, task, language, num_speakers):
+    def run_transcribe(
        audio_path,
        task,
        language,
        num_speakers,
        send_email_flag,
        email_to,
        email_cc,
        email_subject,
    ):
        if not audio_path:
            raise ValueError("No audio file provided.")
        email_status = ""
        attachments = []
        # Ensure we use rich export mode (for JSON with diarization)
        try:
            if task == "transcript_and_summarize":
                result = scraibe.transcript_and_summarize(
@@ -81,11 +97,14 @@ def create_app():
                    language=language or None,
                    num_speakers=int(num_speakers) if num_speakers else None,
                    verbose=True,
                    for_export=True,
                )
                transcript_text = result.get("transcript", "")
                summary_text = result.get("summary", "")
                segments = result.get("segments", [])
                raw_result = result.get("raw_result")
-                # Save as .md
+                # Save as .md (transcript + summary)
                md_path = tempfile.mktemp(suffix=".md")
                with open(md_path, "w", encoding="utf-8") as f:
                    f.write("# Transcript\n\n")
@@ -93,32 +112,74 @@ def create_app():
                    f.write("\n\n# Summary\n\n")
                    f.write(summary_text)
-                return (
+                # Save as .txt (plain transcript)
-                    transcript_text,
+                txt_path = tempfile.mktemp(suffix=".txt")
-                    summary_text,
+                with open(txt_path, "w", encoding="utf-8") as f:
-                    md_path,
+                    f.write(transcript_text)
-                    "Transcription and summarization completed.",
+
-                )
+                # Save as .json (diarization + transcript + summary)
                json_data = {
                    "task": "transcript_and_summarize",
                    "transcript": transcript_text,
                    "summary": summary_text,
                    "segments": segments,
                    "metadata": {
                        "timestamp": datetime.utcnow().isoformat()
                    },
                }
                if raw_result is not None:
                    json_data["raw_result"] = raw_result
                json_path = tempfile.mktemp(suffix=".json")
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, indent=2, ensure_ascii=False)
                # Prepare attachments for email
                if send_email_flag:
                    attachments = [txt_path, json_path]
                status_msg = "Transcription and summarization completed."
            else:
-                # Default: transcribe only
+                # transcribe only (with diarization)
-                text = scraibe.transcribe(
+                result = scraibe.transcribe(
                    audio_file=audio_path,
                    language=language or None,
                    num_speakers=int(num_speakers) if num_speakers else None,
                    verbose=True,
                    for_export=True,
                )
                transcript_text = result.get("transcript", "")
                segments = result.get("segments", [])
                raw_result = result.get("raw_result")
-                # Save as .txt
+                # Save as .txt (plain transcript)
                txt_path = tempfile.mktemp(suffix=".txt")
                with open(txt_path, "w", encoding="utf-8") as f:
-                    f.write(text)
+                    f.write(transcript_text)
                # Save as .json (diarization + transcript)
                json_data = {
                    "task": "transcribe",
                    "transcript": transcript_text,
                    "segments": segments,
                    "metadata": {
                        "timestamp": datetime.utcnow().isoformat()
                    },
                }
                if raw_result is not None:
                    json_data["raw_result"] = raw_result
                json_path = tempfile.mktemp(suffix=".json")
                with open(json_path, "w", encoding="utf-8") as f:
                    json.dump(json_data, f, indent=2, ensure_ascii=False)
                # Prepare attachments for email
                if send_email_flag:
                    attachments = [txt_path, json_path]
                status_msg = "Transcription completed."
                return (
                    text,
                    "",
                    txt_path,
                    "Transcription completed.",
                )
        except Exception as e:
            logger.error("Error during transcription: %s", e)
            return (
@@ -126,6 +187,54 @@ def create_app():
                "",
                None,
                f"Error: {e}",
                "",
            )
        # Handle email after successful transcription
        if send_email_flag and attachments:
            try:
                from .email_sender import send_email, EmailError
            except ImportError:
                email_status = "Email feature unavailable (email_sender not found)."
            else:
                to = (email_to or "").strip()
                cc = (email_cc or "").strip()
                subject = (email_subject or "").strip()
                if not to:
                    email_status = "Email not sent: 'To' address is empty."
                else:
                    if not subject:
                        subject = f"ScrAIbe Transcript - {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}"
                    body = (
                        "Please find the transcription files attached.\n"
                        "This message was generated by ScrAIbe.\n"
                    )
                    try:
                        send_email(
                            to=to,
                            cc=cc or None,
                            subject=subject,
                            body=body,
                            attachments=attachments,
                        )
                        email_status = "Transcript files sent via email."
                    except EmailError as e:
                        email_status = f"Email failed: {e}"
                    except Exception as e:
                        email_status = f"Email failed: {e}"
        # Use md_path for file_output in transcript_and_summarize, else txt_path
        file_path = md_path if task == "transcript_and_summarize" else txt_path
        return (
            transcript_text,
            summary_text if task == "transcript_and_summarize" else "",
            file_path,
            status_msg,
            email_status,
        )
    # Load header/footer HTML if present
@@ -180,6 +289,31 @@ def create_app():
                        precision=0,
                    )
                # Email options
                send_email_checkbox = gr.Checkbox(
                    label="Send transcript files via email"
                )
                with gr.Group(visible=False) as email_group:
                    email_to = gr.Textbox(
                        label="To (comma-separated)",
                        placeholder="e.g. name@example.com",
                    )
                    email_cc = gr.Textbox(
                        label="CC (optional, comma-separated)",
                        placeholder="e.g. manager@example.com",
                    )
                    email_subject = gr.Textbox(
                        label="Subject (optional)",
                        placeholder="Default: ScrAIbe Transcript - <date>",
                    )
                send_email_checkbox.change(
                    fn=lambda v: gr.update(visible=v),
                    inputs=[send_email_checkbox],
                    outputs=[email_group],
                )
                transcribe_btn = gr.Button("Start", variant="primary")
            with gr.Column(scale=3):
@@ -201,6 +335,11 @@ def create_app():
                    label="Status",
                    interactive=False,
                )
                email_status_text = gr.Textbox(
                    label="Email status",
                    interactive=False,
                    visible=True,
                )
        # Footer
        if footer_html:
@@ -218,20 +357,34 @@ def create_app():
            outputs=[summary_text],
        )
-        def on_transcribe(audio, task, language, num_speakers):
+        def on_transcribe(
            audio,
            task,
            language,
            num_speakers,
            send_email_flag,
            email_to_val,
            email_cc_val,
            email_subject_val,
        ):
            if not audio:
                return (
                    "",
                    "",
                    None,
                    "Please upload or record audio.",
                    "",
                )
-            transcript, summary, file_path, msg = run_transcribe(
+            transcript, summary, file_path, status_msg, email_status = run_transcribe(
                audio_path=audio,
                task=task,
                language=language,
                num_speakers=num_speakers,
                send_email_flag=bool(send_email_flag),
                email_to=email_to_val,
                email_cc=email_cc_val,
                email_subject=email_subject_val,
            )
            show_summary = bool(summary)
@@ -239,7 +392,8 @@ def create_app():
                transcript,
                summary,
                file_path if file_path else None,
-                msg,
+                status_msg,
                email_status,
            )
        transcribe_btn.click(
@@ -249,12 +403,17 @@ def create_app():
                task_choice,
                language_input,
                num_speakers_input,
                send_email_checkbox,
                email_to,
                email_cc,
                email_subject,
            ],
            outputs=[
                output_text,
                summary_text,
                file_output,
                status_text,
                email_status_text,
            ],
        )