Use verbose_json diarization, add JSON+TXT email feature
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
+78 -17
View File
@@ -17,7 +17,7 @@ but ignored when not relevant.
import os
import logging
from typing import Union, Optional
from typing import Union, Optional, Dict, Any
from .localai_client import LocalAIClient, LocalAIError
from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:
def transcribe(
self,
audio_file: Union[str],
audio_file: str,
*,
for_export: bool = False,
**kwargs,
) -> str:
) -> Union[str, Dict[str, Any]]:
"""
Transcribe the provided audio file using LocalAI.
Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
all segment texts.
Args:
audio_file (str): Path to the audio file.
**kwargs: Additional keyword arguments (some forwarded, others ignored).
Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
Returns:
str: The concatenated transcribed text.
- If for_export=False: plain transcript text (str).
- If for_export=True: dict with:
- transcript: plain text
- segments: list[segment] with speaker labels
- raw_result: full verbose_json from LocalAI (if present)
"""
if isinstance(audio_file, str):
if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
audio_path=audio_file,
include_text=True,
verbose=verbose,
return_raw=True,
**kwargs,
)
except LocalAIError as e:
logger.error("Error during LocalAI transcription: %s", e)
raise LocalAIError(f"Error during LocalAI transcription: {e}")
segments = result.get("segments", [])
speakers = result.get("speakers", [])
transcripts = result.get("transcripts", [])
text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(text))
return text
# Build simple transcript text
if for_export:
# Include speaker-labeled transcript
lines = []
for seg, speaker, text in zip(segments, speakers, transcripts):
start, end = seg
ts = self._format_timestamp(start)
line = f"[{ts}] {speaker}: {text.strip()}"
lines.append(line)
full_text = "\n\n".join(lines)
else:
# Legacy: space-joined text
full_text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(full_text))
if for_export:
# Return richer structure for JSON export
raw_result = result.get("raw_result")
return {
"transcript": full_text,
"segments": [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
],
"raw_result": raw_result if raw_result is not None else None,
}
return full_text
def transcript_and_summarize(
self,
audio_file: Union[str],
audio_file: str,
*,
summarizer_api_url: Optional[str] = None,
summarizer_api_key: Optional[str] = None,
summarizer_model: Optional[str] = None,
for_export: bool = False,
**kwargs,
) -> dict:
"""
Transcribe the audio file and generate a detailed summary.
Steps:
- Transcribe via LocalAI.
- Transcribe via LocalAI (verbose_json).
- Build a plain-text transcript (with speaker labels).
- Summarize the transcript using the configured LLM.
@@ -184,6 +223,8 @@ class Scraibe:
dict with:
- transcript: full transcript text (with speaker labels)
- summary: final detailed summary (markdown-ready)
- segments: (if for_export) list[segment] with speaker labels
- raw_result: (if for_export) full verbose_json from LocalAI
"""
if isinstance(audio_file, str):
if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
audio_path=audio_file,
include_text=True,
verbose=verbose,
return_raw=True,
**kwargs,
)
except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:
logger.info("transcript_and_summarize completed.")
return {
out = {
"transcript": full_transcript,
"summary": summary,
}
if for_export:
# Add segments and raw_result for JSON export
raw_result = result.get("raw_result")
out["segments"] = [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
]
out["raw_result"] = raw_result if raw_result is not None else None
return out
# -----------------
# Helpers
# -----------------
+147
View File
@@ -0,0 +1,147 @@
"""
Email sender module for ScrAIbe.
Sends transcription outputs (TXT, JSON, etc.) via SMTP.
All credentials are configured via environment variables.
"""
import os
import smtplib
import logging
from email import encoders
from email.mime.base import MIMEBase
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from typing import List, Optional
logger = logging.getLogger("scraibe.email_sender")
class EmailError(Exception):
    """Raised when the email configuration is incomplete or sending fails."""
def get_email_config():
    """
    Read the SMTP configuration from environment variables.

    Variables read:
        EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER,
        EMAIL_SMTP_PASSWORD and EMAIL_FROM_ADDRESS are required.
        EMAIL_SMTP_USE_TLS is optional and defaults to "true".

    Returns:
        dict with the keys smtp_host, smtp_port (int), smtp_user,
        smtp_password, from_address and use_tls (bool).

    Raises:
        EmailError: if a required variable is missing or empty, or if
            EMAIL_SMTP_PORT is not a valid integer.
    """
    smtp_host = os.getenv("EMAIL_SMTP_HOST")
    smtp_port = os.getenv("EMAIL_SMTP_PORT")
    smtp_user = os.getenv("EMAIL_SMTP_USER")
    smtp_password = os.getenv("EMAIL_SMTP_PASSWORD")
    from_address = os.getenv("EMAIL_FROM_ADDRESS")

    # TLS stays enabled unless the variable is explicitly "false", "0" or "no"
    # (case-insensitive, surrounding whitespace ignored).
    use_tls_str = os.getenv("EMAIL_SMTP_USE_TLS", "true").strip().lower()
    use_tls = use_tls_str not in ("false", "0", "no")

    if not all([smtp_host, smtp_port, smtp_user, smtp_password, from_address]):
        raise EmailError(
            "Email configuration incomplete. "
            "Ensure EMAIL_SMTP_HOST, EMAIL_SMTP_PORT, EMAIL_SMTP_USER, "
            "EMAIL_SMTP_PASSWORD, and EMAIL_FROM_ADDRESS are set."
        )

    # A malformed port is a configuration problem: report it as EmailError so
    # callers that only handle EmailError do not see a bare ValueError.
    try:
        port = int(smtp_port)
    except ValueError as e:
        raise EmailError(
            f"Email configuration invalid. EMAIL_SMTP_PORT must be an integer, "
            f"got {smtp_port!r}."
        ) from e

    return {
        "smtp_host": smtp_host,
        "smtp_port": port,
        "smtp_user": smtp_user,
        "smtp_password": smtp_password,
        "from_address": from_address,
        "use_tls": use_tls,
    }
def send_email(
    to: str,
    subject: str,
    body: str,
    attachments: List[str],
    cc: Optional[str] = None,
) -> bool:
    """
    Send a plain-text email with optional file attachments over SMTP.

    The SMTP settings come from get_email_config() (environment variables).

    Args:
        to: Comma-separated list of recipient email addresses.
        subject: Email subject.
        body: Email body (plain text).
        attachments: List of file paths to attach. Paths that are not
            regular files, or that cannot be attached, are skipped with a
            warning instead of aborting the send. None is treated as empty.
        cc: Comma-separated list of CC email addresses (optional).

    Returns:
        True if sent successfully.

    Raises:
        EmailError: if the configuration is invalid, no 'To' address is
            given, or connecting/authenticating/sending fails.
    """
    try:
        cfg = get_email_config()
    except EmailError as e:
        logger.error("Email configuration error: %s", e)
        raise

    # Parse recipients, dropping empty entries such as a trailing comma.
    to_list = [addr.strip() for addr in to.split(",") if addr.strip()]
    cc_list = [addr.strip() for addr in cc.split(",") if addr.strip()] if cc else []

    if not to_list:
        raise EmailError("No valid 'To' email addresses provided.")

    # Build message
    msg = MIMEMultipart()
    msg["From"] = cfg["from_address"]
    msg["To"] = ", ".join(to_list)
    if cc_list:
        msg["Cc"] = ", ".join(cc_list)
    msg["Subject"] = subject
    msg.attach(MIMEText(body, "plain"))

    # Attach files (best effort: a bad attachment must not block the email).
    for file_path in attachments or []:
        if not os.path.isfile(file_path):
            logger.warning("Attachment file not found, skipping: %s", file_path)
            continue
        try:
            with open(file_path, "rb") as f:
                part = MIMEBase("application", "octet-stream")
                part.set_payload(f.read())
            encoders.encode_base64(part)
            part.add_header(
                "Content-Disposition",
                "attachment",
                filename=os.path.basename(file_path),
            )
            msg.attach(part)
        except Exception as e:
            logger.warning("Failed to attach file %s: %s", file_path, e)

    # Connect and send. The context manager closes the connection even when
    # starttls/login/sendmail raises, so a failed send does not leak a socket.
    try:
        with smtplib.SMTP(cfg["smtp_host"], cfg["smtp_port"], timeout=30) as server:
            server.ehlo()
            if cfg["use_tls"]:
                # NOTE(review): starttls() is called without an explicit SSL
                # context; confirm whether certificate verification is
                # required for this deployment.
                server.starttls()
                # Re-identify after the TLS upgrade.
                server.ehlo()
            server.login(cfg["smtp_user"], cfg["smtp_password"])
            # CC addresses go in the envelope as well as in the Cc header.
            server.sendmail(
                cfg["from_address"],
                to_list + cc_list,
                msg.as_string(),
            )
    except Exception as e:
        logger.error("Failed to send email: %s", e)
        raise EmailError(f"Failed to send email: {e}") from e

    logger.info(
        "Email sent to %s (CC: %s)",
        to_list,
        cc_list or "None",
    )
    return True
+16 -16
View File
@@ -39,7 +39,7 @@ class LocalAIClient:
Responsibilities:
- Read configuration from environment.
- Upload audio file as multipart/form-data.
- Parse diarization + transcription response.
- Parse diarization + transcription response (verbose_json).
- Map response into the same structure expected by Scraibe's Transcript.
"""
@@ -106,20 +106,13 @@ class LocalAIClient:
response_format: Optional[str] = None,
include_text: Optional[bool] = None,
verbose: bool = False,
return_raw: bool = False,
**_ignored,
) -> Dict[str, Any]:
"""
Send audio to LocalAI /v1/audio/diarization and return a dict
in the same style as the previous internal diarization output:
{
"segments": [ [start, end], ... ],
"speakers": [ "SPEAKER_00", ... ],
"transcripts": [ "text for segment", ... ]
}
Extra kwargs that the old UI used (e.g., whisper-specific) are
accepted but ignored.
Send audio to LocalAI /v1/audio/diarization and return:
- A normalized dict with segments, speakers, transcripts.
- Optionally, the raw verbose_json response (for JSON export).
Args:
audio_path: Path to the audio file.
@@ -131,16 +124,18 @@ class LocalAIClient:
min_duration_on: Optional min segment duration.
min_duration_off: Optional min gap duration.
response_format: "json", "verbose_json", or "rttm".
Defaults to "verbose_json" if not set.
Defaults to "verbose_json".
include_text: Whether to request per-segment text.
Defaults to True.
verbose: If True, prints progress messages.
return_raw: If True, also return the raw API response in 'raw_result'.
"""
if verbose:
print("Starting diarization and transcription via LocalAI.")
logger.info("diarize_and_transcribe requested for: %s", audio_path)
# Always use verbose_json for diarization + speaker info
if response_format is None:
response_format = "verbose_json"
if include_text is None:
@@ -202,7 +197,7 @@ class LocalAIClient:
)
try:
result = resp.json()
raw_result = resp.json()
except json.JSONDecodeError:
logger.error("Failed to parse LocalAI response as JSON.")
raise LocalAIError(
@@ -212,11 +207,16 @@ class LocalAIClient:
if verbose:
print("Diarization and transcription finished. Starting post-processing.")
return self._parse_diarization_response(result)
parsed = self._parse_diarization_response(raw_result)
if return_raw:
parsed["raw_result"] = raw_result
return parsed
def _parse_diarization_response(self, result: Dict[str, Any]) -> Dict[str, Any]:
"""
Convert LocalAI response into the internal format used by Scraibe:
Convert LocalAI verbose_json response into the internal format used by Scraibe:
{
"segments": [ [start, end], ... ],
"speakers": [ "SPEAKER_00", ... ],
+180 -21
View File
@@ -7,13 +7,16 @@ Runs the Web GUI that:
- Sends audio to LocalAI for transcription + diarization
- Optionally sends transcript to a second LLM for summarization
- Returns transcript (and summary) in the browser
- Optionally emails transcript files (TXT + JSON)
This is the default entrypoint when running in Docker.
"""
import os
import json
import logging
import tempfile
from datetime import datetime
import gradio as gr
@@ -70,10 +73,23 @@ def create_app():
)
# Helper: run transcription via LocalAI API
def run_transcribe(audio_path, task, language, num_speakers):
def run_transcribe(
audio_path,
task,
language,
num_speakers,
send_email_flag,
email_to,
email_cc,
email_subject,
):
if not audio_path:
raise ValueError("No audio file provided.")
email_status = ""
attachments = []
# Ensure we use rich export mode (for JSON with diarization)
try:
if task == "transcript_and_summarize":
result = scraibe.transcript_and_summarize(
@@ -81,11 +97,14 @@ def create_app():
language=language or None,
num_speakers=int(num_speakers) if num_speakers else None,
verbose=True,
for_export=True,
)
transcript_text = result.get("transcript", "")
summary_text = result.get("summary", "")
segments = result.get("segments", [])
raw_result = result.get("raw_result")
# Save as .md
# Save as .md (transcript + summary)
md_path = tempfile.mktemp(suffix=".md")
with open(md_path, "w", encoding="utf-8") as f:
f.write("# Transcript\n\n")
@@ -93,32 +112,74 @@ def create_app():
f.write("\n\n# Summary\n\n")
f.write(summary_text)
return (
transcript_text,
summary_text,
md_path,
"Transcription and summarization completed.",
)
# Save as .txt (plain transcript)
txt_path = tempfile.mktemp(suffix=".txt")
with open(txt_path, "w", encoding="utf-8") as f:
f.write(transcript_text)
# Save as .json (diarization + transcript + summary)
json_data = {
"task": "transcript_and_summarize",
"transcript": transcript_text,
"summary": summary_text,
"segments": segments,
"metadata": {
"timestamp": datetime.utcnow().isoformat()
},
}
if raw_result is not None:
json_data["raw_result"] = raw_result
json_path = tempfile.mktemp(suffix=".json")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=2, ensure_ascii=False)
# Prepare attachments for email
if send_email_flag:
attachments = [txt_path, json_path]
status_msg = "Transcription and summarization completed."
else:
# Default: transcribe only
text = scraibe.transcribe(
# transcribe only (with diarization)
result = scraibe.transcribe(
audio_file=audio_path,
language=language or None,
num_speakers=int(num_speakers) if num_speakers else None,
verbose=True,
for_export=True,
)
transcript_text = result.get("transcript", "")
segments = result.get("segments", [])
raw_result = result.get("raw_result")
# Save as .txt
# Save as .txt (plain transcript)
txt_path = tempfile.mktemp(suffix=".txt")
with open(txt_path, "w", encoding="utf-8") as f:
f.write(text)
f.write(transcript_text)
# Save as .json (diarization + transcript)
json_data = {
"task": "transcribe",
"transcript": transcript_text,
"segments": segments,
"metadata": {
"timestamp": datetime.utcnow().isoformat()
},
}
if raw_result is not None:
json_data["raw_result"] = raw_result
json_path = tempfile.mktemp(suffix=".json")
with open(json_path, "w", encoding="utf-8") as f:
json.dump(json_data, f, indent=2, ensure_ascii=False)
# Prepare attachments for email
if send_email_flag:
attachments = [txt_path, json_path]
status_msg = "Transcription completed."
return (
text,
"",
txt_path,
"Transcription completed.",
)
except Exception as e:
logger.error("Error during transcription: %s", e)
return (
@@ -126,6 +187,54 @@ def create_app():
"",
None,
f"Error: {e}",
"",
)
# Handle email after successful transcription
if send_email_flag and attachments:
try:
from .email_sender import send_email, EmailError
except ImportError:
email_status = "Email feature unavailable (email_sender not found)."
else:
to = (email_to or "").strip()
cc = (email_cc or "").strip()
subject = (email_subject or "").strip()
if not to:
email_status = "Email not sent: 'To' address is empty."
else:
if not subject:
subject = f"ScrAIbe Transcript - {datetime.utcnow().strftime('%Y-%m-%d %H:%M UTC')}"
body = (
"Please find the transcription files attached.\n"
"This message was generated by ScrAIbe.\n"
)
try:
send_email(
to=to,
cc=cc or None,
subject=subject,
body=body,
attachments=attachments,
)
email_status = "Transcript files sent via email."
except EmailError as e:
email_status = f"Email failed: {e}"
except Exception as e:
email_status = f"Email failed: {e}"
# Use md_path for file_output in transcript_and_summarize, else txt_path
file_path = md_path if task == "transcript_and_summarize" else txt_path
return (
transcript_text,
summary_text if task == "transcript_and_summarize" else "",
file_path,
status_msg,
email_status,
)
# Load header/footer HTML if present
@@ -180,6 +289,31 @@ def create_app():
precision=0,
)
# Email options
send_email_checkbox = gr.Checkbox(
label="Send transcript files via email"
)
with gr.Group(visible=False) as email_group:
email_to = gr.Textbox(
label="To (comma-separated)",
placeholder="e.g. name@example.com",
)
email_cc = gr.Textbox(
label="CC (optional, comma-separated)",
placeholder="e.g. manager@example.com",
)
email_subject = gr.Textbox(
label="Subject (optional)",
placeholder="Default: ScrAIbe Transcript - <date>",
)
send_email_checkbox.change(
fn=lambda v: gr.update(visible=v),
inputs=[send_email_checkbox],
outputs=[email_group],
)
transcribe_btn = gr.Button("Start", variant="primary")
with gr.Column(scale=3):
@@ -201,6 +335,11 @@ def create_app():
label="Status",
interactive=False,
)
email_status_text = gr.Textbox(
label="Email status",
interactive=False,
visible=True,
)
# Footer
if footer_html:
@@ -218,20 +357,34 @@ def create_app():
outputs=[summary_text],
)
def on_transcribe(audio, task, language, num_speakers):
def on_transcribe(
audio,
task,
language,
num_speakers,
send_email_flag,
email_to_val,
email_cc_val,
email_subject_val,
):
if not audio:
return (
"",
"",
None,
"Please upload or record audio.",
"",
)
transcript, summary, file_path, msg = run_transcribe(
transcript, summary, file_path, status_msg, email_status = run_transcribe(
audio_path=audio,
task=task,
language=language,
num_speakers=num_speakers,
send_email_flag=bool(send_email_flag),
email_to=email_to_val,
email_cc=email_cc_val,
email_subject=email_subject_val,
)
show_summary = bool(summary)
@@ -239,7 +392,8 @@ def create_app():
transcript,
summary,
file_path if file_path else None,
msg,
status_msg,
email_status,
)
transcribe_btn.click(
@@ -249,12 +403,17 @@ def create_app():
task_choice,
language_input,
num_speakers_input,
send_email_checkbox,
email_to,
email_cc,
email_subject,
],
outputs=[
output_text,
summary_text,
file_output,
status_text,
email_status_text,
],
)