Use verbose_json diarization, add JSON+TXT email feature
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
+78 -17
View File
@@ -17,7 +17,7 @@ but ignored when not relevant.
import os import os
import logging import logging
from typing import Union, Optional from typing import Union, Optional, Dict, Any
from .localai_client import LocalAIClient, LocalAIError from .localai_client import LocalAIClient, LocalAIError
from .summarizer import SummarizerClient, SummarizerError from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:
def transcribe( def transcribe(
self, self,
audio_file: Union[str], audio_file: str,
*,
for_export: bool = False,
**kwargs, **kwargs,
) -> str: ) -> Union[str, Dict[str, Any]]:
""" """
Transcribe the provided audio file using LocalAI. Transcribe the provided audio file using LocalAI.
Uses /v1/audio/diarization with vibevoice.cpp, then concatenates Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
all segment texts.
Args:
audio_file (str): Path to the audio file.
**kwargs: Additional keyword arguments (some forwarded, others ignored).
Returns: Returns:
str: The concatenated transcribed text. - If for_export=False: plain transcript text (str).
- If for_export=True: dict with:
- transcript: plain text
- segments: list[segment] with speaker labels
- raw_result: full verbose_json from LocalAI (if present)
""" """
if isinstance(audio_file, str): if isinstance(audio_file, str):
if not os.path.exists(audio_file): if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
audio_path=audio_file, audio_path=audio_file,
include_text=True, include_text=True,
verbose=verbose, verbose=verbose,
return_raw=True,
**kwargs, **kwargs,
) )
except LocalAIError as e: except LocalAIError as e:
logger.error("Error during LocalAI transcription: %s", e) logger.error("Error during LocalAI transcription: %s", e)
raise LocalAIError(f"Error during LocalAI transcription: {e}") raise LocalAIError(f"Error during LocalAI transcription: {e}")
segments = result.get("segments", [])
speakers = result.get("speakers", [])
transcripts = result.get("transcripts", []) transcripts = result.get("transcripts", [])
text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(text)) # Build simple transcript text
return text if for_export:
# Include speaker-labeled transcript
lines = []
for seg, speaker, text in zip(segments, speakers, transcripts):
start, end = seg
ts = self._format_timestamp(start)
line = f"[{ts}] {speaker}: {text.strip()}"
lines.append(line)
full_text = "\n\n".join(lines)
else:
# Legacy: space-joined text
full_text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(full_text))
if for_export:
# Return richer structure for JSON export
raw_result = result.get("raw_result")
return {
"transcript": full_text,
"segments": [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
],
"raw_result": raw_result if raw_result is not None else None,
}
return full_text
def transcript_and_summarize( def transcript_and_summarize(
self, self,
audio_file: Union[str], audio_file: str,
*, *,
summarizer_api_url: Optional[str] = None, summarizer_api_url: Optional[str] = None,
summarizer_api_key: Optional[str] = None, summarizer_api_key: Optional[str] = None,
summarizer_model: Optional[str] = None, summarizer_model: Optional[str] = None,
for_export: bool = False,
**kwargs, **kwargs,
) -> dict: ) -> dict:
""" """
Transcribe the audio file and generate a detailed summary. Transcribe the audio file and generate a detailed summary.
Steps: Steps:
- Transcribe via LocalAI. - Transcribe via LocalAI (verbose_json).
- Build a plain-text transcript (with speaker labels). - Build a plain-text transcript (with speaker labels).
- Summarize the transcript using the configured LLM. - Summarize the transcript using the configured LLM.
@@ -184,6 +223,8 @@ class Scraibe:
dict with: dict with:
- transcript: full transcript text (with speaker labels) - transcript: full transcript text (with speaker labels)
- summary: final detailed summary (markdown-ready) - summary: final detailed summary (markdown-ready)
- segments: (if for_export) list[segment] with speaker labels
- raw_result: (if for_export) full verbose_json from LocalAI
""" """
if isinstance(audio_file, str): if isinstance(audio_file, str):
if not os.path.exists(audio_file): if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
audio_path=audio_file, audio_path=audio_file,
include_text=True, include_text=True,
verbose=verbose, verbose=verbose,
return_raw=True,
**kwargs, **kwargs,
) )
except LocalAIError as e: except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:
logger.info("transcript_and_summarize completed.") logger.info("transcript_and_summarize completed.")
return { out = {
"transcript": full_transcript, "transcript": full_transcript,
"summary": summary, "summary": summary,
} }
if for_export:
# Add segments and raw_result for JSON export
raw_result = result.get("raw_result")
out["segments"] = [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
]
out["raw_result"] = raw_result if raw_result is not None else None
return out
# ----------------- # -----------------
# Helpers # Helpers
# ----------------- # -----------------
+147
View File
@@ -0,0 +1,147 @@
"""
Email sender module for ScrAIbe.
Sends transcription outputs (TXT, JSON, etc.) via SMTP.
All credentials are configured via environment variables.
"""
import os
import smtplib
import logging
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import List, Optional
logger = logging.getLogger("scraibe.email_sender")
class EmailError(Exception):
    """Raised when the email configuration is invalid or sending fails."""


def get_email_config() -> dict:
    """
    Read email configuration from environment variables.

    Variables read:
        EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER,
        EMAIL_SMTP_PASSWORD, EMAIL_FROM_ADDRESS (all required), and
        EMAIL_SMTP_USE_TLS (optional, defaults to "true"; the values
        "false", "0" and "no" disable TLS, case-insensitively).

    Returns:
        dict with keys smtp_host, smtp_port (int), smtp_user,
        smtp_password, from_address and use_tls (bool).

    Raises:
        EmailError: if a required variable is missing/empty, or if
            EMAIL_SMTP_PORT is not an integer.
    """
    smtp_host = os.getenv("EMAIL_SMTP_HOST")
    smtp_port = os.getenv("EMAIL_SMTP_PORT")
    smtp_user = os.getenv("EMAIL_SMTP_USER")
    smtp_password = os.getenv("EMAIL_SMTP_PASSWORD")
    from_address = os.getenv("EMAIL_FROM_ADDRESS")

    # Anything other than an explicit "off" value keeps TLS enabled.
    use_tls_str = os.getenv("EMAIL_SMTP_USE_TLS", "true").strip().lower()
    use_tls = use_tls_str not in ("false", "0", "no")

    if not all([smtp_host, smtp_port, smtp_user, smtp_password, from_address]):
        raise EmailError(
            "Email configuration incomplete. "
            "Ensure EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER, "
            "EMAIL_SMTP_PASSWORD, and EMAIL_FROM_ADDRESS are set."
        )

    # A malformed port is a configuration problem: surface it as EmailError
    # (the type callers catch) instead of leaking a bare ValueError.
    try:
        port = int(smtp_port)
    except ValueError as exc:
        raise EmailError(
            f"EMAIL_SMTP_PORT must be an integer, got {smtp_port!r}."
        ) from exc

    return {
        "smtp_host": smtp_host,
        "smtp_port": port,
        "smtp_user": smtp_user,
        "smtp_password": smtp_password,
        "from_address": from_address,
        "use_tls": use_tls,
    }
def send_email(
    to: str,
    subject: str,
    body: str,
    attachments: List[str],
    cc: Optional[str] = None,
) -> bool:
    """
    Send an email with optional file attachments.

    Args:
        to: Comma-separated list of recipient email addresses.
        subject: Email subject.
        body: Email body (plain text).
        attachments: List of file paths to attach. Missing or unreadable
            files are logged and skipped; None is treated as no attachments.
        cc: Comma-separated list of CC email addresses (optional).

    Returns:
        True if the message was accepted for at least one recipient.

    Raises:
        EmailError: if the configuration is invalid, no 'To' address is
            given, or connecting/authenticating/sending fails.
    """
    try:
        cfg = get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise

    # Parse recipients
    to_list = [addr.strip() for addr in to.split(",") if addr.strip()]
    cc_list = [addr.strip() for addr in cc.split(",") if addr.strip()] if cc else []

    if not to_list:
        raise EmailError("No valid 'To' email addresses provided.")

    # Build message
    msg = MIMEMultipart()
    msg["From"] = cfg["from_address"]
    msg["To"] = ", ".join(to_list)
    if cc_list:
        msg["Cc"] = ", ".join(cc_list)
    msg["Subject"] = subject
    msg.attach(MIMEText(body, "plain"))

    # Attach files (best effort: a bad attachment must not block the email)
    for file_path in attachments or []:
        if not os.path.isfile(file_path):
            logger.warning("Attachment file not found, skipping: %s", file_path)
            continue

        try:
            with open(file_path, "rb") as f:
                part = MIMEBase("application", "octet-stream")
                part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header(
                "Content-Disposition",
                "attachment",
                filename=os.path.basename(file_path),
            )
            msg.attach(part)
        except Exception as e:
            logger.warning("Failed to attach file %s: %s", file_path, e)

    # Connect and send. The context manager issues QUIT and closes the
    # socket even when starttls/login/sendmail raises, so a failed attempt
    # no longer leaks the connection.
    try:
        with smtplib.SMTP(cfg["smtp_host"], cfg["smtp_port"], timeout=30) as server:
            server.ehlo()
            if cfg["use_tls"]:
                # NOTE(review): starttls() without an explicit SSL context does
                # not verify the server certificate; consider passing
                # context=ssl.create_default_context() once all deployed SMTP
                # relays present valid certificates.
                server.starttls()
                # Re-identify over the now-encrypted channel.
                server.ehlo()

            server.login(cfg["smtp_user"], cfg["smtp_password"])
            # sendmail() only raises when *every* recipient is refused; partial
            # refusals come back as a dict keyed by address.
            refused = server.sendmail(
                cfg["from_address"],
                to_list + cc_list,
                msg.as_string(),
            )
    except Exception as e:
        logger.error("Failed to send email: %s", e)
        raise EmailError(f"Failed to send email: {e}") from e

    if refused:
        logger.warning("Some recipients were refused: %s", refused)

    logger.info(
        "Email sent to %s (CC: %s)",
        to_list,
        cc_list or "None",
    )
    return True
+16 -16
View File
@@ -39,7 +39,7 @@ class LocalAIClient:
Responsibilities: Responsibilities:
- Read configuration from environment. - Read configuration from environment.
- Upload audio file as multipart/form-data. - Upload audio file as multipart/form-data.
- Parse diarization + transcription response. - Parse diarization + transcription response (verbose_json).
- Map response into the same structure expected by Scraibe's Transcript. - Map response into the same structure expected by Scraibe's Transcript.
""" """
@@ -106,20 +106,13 @@ class LocalAIClient:
response_format: Optional[str] = None, response_format: Optional[str] = None,
include_text: Optional[bool] = None, include_text: Optional[bool] = None,
verbose: bool = False, verbose: bool = False,
return_raw: bool = False,
**_ignored, **_ignored,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
Send audio to LocalAI /v1/audio/diarization and return a dict Send audio to LocalAI /v1/audio/diarization and return:
in the same style as the previous internal diarization output: - A normalized dict with segments, speakers, transcripts.
- Optionally, the raw verbose_json response (for JSON export).
{
"segments": [ [start, end], ... ],
"speakers": [ "SPEAKER_00", ... ],
"transcripts": [ "text for segment", ... ]
}
Extra kwargs that the old UI used (e.g., whisper-specific) are
accepted but ignored.
Args: Args:
audio_path: Path to the audio file. audio_path: Path to the audio file.
@@ -131,16 +124,18 @@ class LocalAIClient:
min_duration_on: Optional min segment duration. min_duration_on: Optional min segment duration.
min_duration_off: Optional min gap duration. min_duration_off: Optional min gap duration.
response_format: "json", "verbose_json", or "rttm". response_format: "json", "verbose_json", or "rttm".
Defaults to "verbose_json" if not set. Defaults to "verbose_json".
include_text: Whether to request per-segment text. include_text: Whether to request per-segment text.
Defaults to True. Defaults to True.
verbose: If True, prints progress messages. verbose: If True, prints progress messages.
return_raw: If True, also return the raw API response in 'raw_result'.
""" """
if verbose: if verbose:
print("Starting diarization and transcription via LocalAI.") print("Starting diarization and transcription via LocalAI.")
logger.info("diarize_and_transcribe requested for: %s", audio_path) logger.info("diarize_and_transcribe requested for: %s", audio_path)
# Always use verbose_json for diarization + speaker info
if response_format is None: if response_format is None:
response_format = "verbose_json" response_format = "verbose_json"
if include_text is None: if include_text is None:
@@ -202,7 +197,7 @@ class LocalAIClient:
) )
try: try:
result = resp.json() raw_result = resp.json()
except json.JSONDecodeError: except json.JSONDecodeError:
logger.error("Failed to parse LocalAI response as JSON.") logger.error("Failed to parse LocalAI response as JSON.")
raise LocalAIError( raise LocalAIError(
@@ -212,11 +207,16 @@ class LocalAIClient:
if verbose: if verbose:
print("Diarization and transcription finished. Starting post-processing.") print("Diarization and transcription finished. Starting post-processing.")
return self._parse_diarization_response(result) parsed = self._parse_diarization_response(raw_result)
if return_raw:
parsed["raw_result"] = raw_result
return parsed
def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]: def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
""" """
Convert LocalAI response into the internal format used by Scraibe: Convert LocalAI verbose_json response into the internal format used by Scraibe:
{ {
"segments": [ [start, end], ... ], "segments": [ [start, end], ... ],
"speakers": [ "SPEAKER_00", ... ], "speakers": [ "SPEAKER_00", ... ],
+180 -21
View File
@@ -7,13 +7,16 @@ Runs the Web GUI that:
- Sends audio to LocalAI for transcription + diarization - Sends audio to LocalAI for transcription + diarization
- Optionally sends transcript to a second LLM for summarization - Optionally sends transcript to a second LLM for summarization
- Returns transcript (and summary) in the browser - Returns transcript (and summary) in the browser
- Optionally emails transcript files (TXT + JSON)
This is the default entrypoint when running in Docker. This is the default entrypoint when running in Docker.
""" """
import os import os
import json
import logging import logging
import tempfile import tempfile
from datetime import datetime
import gradio as gr import gradio as gr
@@ -70,10 +73,23 @@ def create_app():
) )
# Helper: run transcription via LocalAI API # Helper: run transcription via LocalAI API
def run_transcribe(audio_path, task, language, num_speakers): def run_transcribe(
audio_path,
task,
language,
num_speakers,
send_email_flag,
email_to,
email_cc,
email_subject,
):
if not audio_path: if not audio_path:
raise ValueError("No audio file provided.") raise ValueError("No audio file provided.")
email_status = ""
attachments = []
# Ensure we use rich export mode (for JSON with diarization)
try: try:
if task == "transcript_and_summarize": if task == "transcript_and_summarize":
result = scraibe.transcript_and_summarize( result = scraibe.transcript_and_summarize(
@@ -81,11 +97,14 @@ def create_app():
language=language or None, language=language or None,
num_speakers=int(num_speakers) if num_speakers else None, num_speakers=int(num_speakers) if num_speakers else None,
verbose=True, verbose=True,
for_export=True,
) )
transcript_text = result.get("transcript", "") transcript_text = result.get("transcript", "")
summary_text = result.get("summary", "") summary_text = result.get("summary", "")
segments = result.get("segments", [])
raw_result = result.get("raw_result")
# Save as .md # Save as .md (transcript + summary)
md_path = tempfile.mktemp(suffix=".md") md_path = tempfile.mktemp(suffix=".md")
with open(md_path, "w", encoding="utf-8") as f: with open(md_path, "w", encoding="utf-8") as f:
f.write("# Transcript\n\n") f.write("# Transcript\n\n")
@@ -93,32 +112,74 @@ def create_app():
f.write("\n\n# Summary\n\n") f.write("\n\n# Summary\n\n")
f.write(summary_text) f.write(summary_text)
return ( # Save as .txt (plain transcript)
transcript_text, txt_path = tempfile.mktemp(suffix=".txt")
summary_text, with open(txt_path, "w", encoding="utf-8") as f:
md_path, f.write(transcript_text)
"Transcription and summarization completed.",
) # Save as .json (diarization + transcript + summary)
json_data = {
"task": "transcript_and_summarize",
"transcript": transcript_text,
"summary": summary_text,
"segments": segments,
"metadata": {
"timestamp": datetime.utcnow().isoformat()
},
}
if raw_result is not None:
json_data["raw_result"] = raw_result
json_path = tempfile.mktemp(suffix=".json")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=2, ensure_ascii=False)
# Prepare attachments for email
if send_email_flag:
attachments = [txt_path, json_path]
status_msg = "Transcription and summarization completed."
else: else:
# Default: transcribe only # transcribe only (with diarization)
text = scraibe.transcribe( result = scraibe.transcribe(
audio_file=audio_path, audio_file=audio_path,
language=language or None, language=language or None,
num_speakers=int(num_speakers) if num_speakers else None, num_speakers=int(num_speakers) if num_speakers else None,
verbose=True, verbose=True,
for_export=True,
) )
transcript_text = result.get("transcript", "")
segments = result.get("segments", [])
raw_result = result.get("raw_result")
# Save as .txt # Save as .txt (plain transcript)
txt_path = tempfile.mktemp(suffix=".txt") txt_path = tempfile.mktemp(suffix=".txt")
with open(txt_path, "w", encoding="utf-8") as f: with open(txt_path, "w", encoding="utf-8") as f:
f.write(text) f.write(transcript_text)
# Save as .json (diarization + transcript)
json_data = {
"task": "transcribe",
"transcript": transcript_text,
"segments": segments,
"metadata": {
"timestamp": datetime.utcnow().isoformat()
},
}
if raw_result is not None:
json_data["raw_result"] = raw_result
json_path = tempfile.mktemp(suffix=".json")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=2, ensure_ascii=False)
# Prepare attachments for email
if send_email_flag:
attachments = [txt_path, json_path]
status_msg = "Transcription completed."
return (
text,
"",
txt_path,
"Transcription completed.",
)
except Exception as e: except Exception as e:
logger.error("Error during transcription: %s", e) logger.error("Error during transcription: %s", e)
return ( return (
@@ -126,6 +187,54 @@ def create_app():
"", "",
None, None,
f"Error: {e}", f"Error: {e}",
"",
)
# Handle email after successful transcription
if send_email_flag and attachments:
try:
from .email_sender import send_email, EmailError
except ImportError:
email_status = "Email feature unavailable (email_sender not found)."
else:
to = (email_to or "").strip()
cc = (email_cc or "").strip()
subject = (email_subject or "").strip()
if not to:
email_status = "Email not sent: 'To' address is empty."
else:
if not subject:
subject = f"ScrAIbe Transcript - {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}"
body = (
"Please find the transcription files attached.\n"
"This message was generated by ScrAIbe.\n"
)
try:
send_email(
to=to,
cc=cc or None,
subject=subject,
body=body,
attachments=attachments,
)
email_status = "Transcript files sent via email."
except EmailError as e:
email_status = f"Email failed: {e}"
except Exception as e:
email_status = f"Email failed: {e}"
# Use md_path for file_output in transcript_and_summarize, else txt_path
file_path = md_path if task == "transcript_and_summarize" else txt_path
return (
transcript_text,
summary_text if task == "transcript_and_summarize" else "",
file_path,
status_msg,
email_status,
) )
# Load header/footer HTML if present # Load header/footer HTML if present
@@ -180,6 +289,31 @@ def create_app():
precision=0, precision=0,
) )
# Email options
send_email_checkbox = gr.Checkbox(
label="Send transcript files via email"
)
with gr.Group(visible=False) as email_group:
email_to = gr.Textbox(
label="To (comma-separated)",
placeholder="e.g. name@example.com",
)
email_cc = gr.Textbox(
label="CC (optional, comma-separated)",
placeholder="e.g. manager@example.com",
)
email_subject = gr.Textbox(
label="Subject (optional)",
placeholder="Default: ScrAIbe Transcript - <date>",
)
send_email_checkbox.change(
fn=lambda v: gr.update(visible=v),
inputs=[send_email_checkbox],
outputs=[email_group],
)
transcribe_btn = gr.Button("Start", variant="primary") transcribe_btn = gr.Button("Start", variant="primary")
with gr.Column(scale=3): with gr.Column(scale=3):
@@ -201,6 +335,11 @@ def create_app():
label="Status", label="Status",
interactive=False, interactive=False,
) )
email_status_text = gr.Textbox(
label="Email status",
interactive=False,
visible=True,
)
# Footer # Footer
if footer_html: if footer_html:
@@ -218,20 +357,34 @@ def create_app():
outputs=[summary_text], outputs=[summary_text],
) )
def on_transcribe(audio, task, language, num_speakers): def on_transcribe(
audio,
task,
language,
num_speakers,
send_email_flag,
email_to_val,
email_cc_val,
email_subject_val,
):
if not audio: if not audio:
return ( return (
"", "",
"", "",
None, None,
"Please upload or record audio.", "Please upload or record audio.",
"",
) )
transcript, summary, file_path, msg = run_transcribe( transcript, summary, file_path, status_msg, email_status = run_transcribe(
audio_path=audio, audio_path=audio,
task=task, task=task,
language=language, language=language,
num_speakers=num_speakers, num_speakers=num_speakers,
send_email_flag=bool(send_email_flag),
email_to=email_to_val,
email_cc=email_cc_val,
email_subject=email_subject_val,
) )
show_summary = bool(summary) show_summary = bool(summary)
@@ -239,7 +392,8 @@ def create_app():
transcript, transcript,
summary, summary,
file_path if file_path else None, file_path if file_path else None,
msg, status_msg,
email_status,
) )
transcribe_btn.click( transcribe_btn.click(
@@ -249,12 +403,17 @@ def create_app():
task_choice, task_choice,
language_input, language_input,
num_speakers_input, num_speakers_input,
send_email_checkbox,
email_to,
email_cc,
email_subject,
], ],
outputs=[ outputs=[
output_text, output_text,
summary_text, summary_text,
file_output, file_output,
status_text, status_text,
email_status_text,
], ],
) )