From 49e999f0ee8a882fea3119b3b857659d608a241d Mon Sep 17 00:00:00 2001 From: ScrAIbe Admin Date: Sun, 14 Jun 2026 18:05:37 +0000 Subject: [PATCH] Add Identify speakers option: AI infers names and replaces Speaker IDs in transcript --- scraibe/tasks.py | 86 ++++++++++++++++++++++++++++++++++++++++++++++++ scraibe/webui.py | 9 +++++ 2 files changed, 95 insertions(+) diff --git a/scraibe/tasks.py b/scraibe/tasks.py index 26ffac9..74f099b 100644 --- a/scraibe/tasks.py +++ b/scraibe/tasks.py @@ -10,6 +10,7 @@ from datetime import datetime from .celery_app import celery_app from .autotranscript import Scraibe +from .summarizer import SummarizerClient, SummarizerError from .misc import setup_logging from .email_sender import send_email, EmailError, load_template from .email_sender import create_transcript_docx, create_summary_docx @@ -238,6 +239,7 @@ def process_transcription_task( email_to: str, email_cc: str, include_summary: bool, + identify_speakers: bool = False, ): """ Async task: transcribe audio, optionally summarize, then email results. @@ -294,6 +296,90 @@ def process_transcription_task( segments = result.get("segments", []) raw_result = result.get("raw_result") + # 3b) Optional speaker identification + speaker_map = {} # e.g. {"SPEAKER 1": "John", "SPEAKER 2": "Maria"} + if identify_speakers: + try: + # Use the same summarizer client as transcript_and_summarize + scraibe._ensure_summarizer() + summarizer = scraibe._summarizer + + prompt = ( + "Below is a transcript with speaker labels like 'SPEAKER 1', 'SPEAKER 2', etc. " + "Based on how they speak and the context, suggest realistic names for each speaker. " + "Do not add extra commentary. Output ONLY a mapping in this exact format, one per line: +SPEAKER 1: Suggested Name +SPEAKER 2: Suggested Name +SPEAKER 3: Suggested Name + +Transcript: +" + transcript_text + ) + + response = summarizer._chat_completion( + messages=[{"role": "user", "content": prompt}], + temperature=0.3, + max_tokens=300, + ) + reply = (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "") + + # Parse mapping + import re + for m in re.finditer( + r"SPEAKER\s+(\d+)\s*:\s*(.+)", + reply, + re.IGNORECASE, + ): + spk = f"SPEAKER {m.group(1).strip()}" + name = m.group(2).strip().rstrip(".") + if name: + speaker_map[spk] = name + + logger.info("Speaker identification mapping: %s", speaker_map) + + # Apply mapping to transcript text + if speaker_map: + def replace_speaker(m): + label = m.group(0).strip() + # normalize to "SPEAKER N" + normalized = re.sub( + r"\s+", + " ", + re.sub(r"[^A-Z0-9\s]", "", label.upper()), + ).strip() + return speaker_map.get(normalized, label) + + # Replace in lines like "[00:12] SPEAKER 1:" but preserve timestamp and colon + def replace_in_line(line: str) -> str: + # match after timestamp bracket and space: "SPEAKER N:" + return re.sub( + r"(\[\d+:\d+(?::\d+)?\]\s*)([A-Z\s]+?):\s*", + lambda m: m.group(1) + (speaker_map.get(m.group(2).strip(), m.group(2)) + ": "), + line, + ) + + transcript_lines = transcript_text.splitlines() + transcript_text = "\n".join( + replace_in_line(line) for line in transcript_lines + ) + + # Also update segments for JSON export + updated_segments = [] + for seg in segments: + sp = (seg.get("speaker") or "").strip() + sp_norm = re.sub(r"[^A-Z0-9\s]", "", sp.upper()).strip() + sp_new = speaker_map.get(sp_norm, sp) + seg = dict(seg) + seg["speaker"] = sp_new + updated_segments.append(seg) + segments = updated_segments + + except (SummarizerError, Exception) as e: + logger.warning( + "Speaker identification failed; falling back to Speaker IDs: %s", e + ) + speaker_map = {} + # 4) Prepare files # Transcript .md diff --git a/scraibe/webui.py b/scraibe/webui.py index 35fe90e..b9dad14 100644 --- a/scraibe/webui.py +++ b/scraibe/webui.py @@ -135,6 +135,12 @@ def create_app(): label="Task", ) + identify_speakers = gr.Checkbox( + label="Identify speakers (best effort using AI)", + value=False, + info="If enabled, AI will attempt to infer real names for speakers and replace Speaker 1/2/etc. in the transcript." + ) + with gr.Row(): language_input = gr.Textbox( label="Language (optional)", @@ -188,6 +194,7 @@ def create_app(): num_speakers, email_to_val, email_cc_val, + identify_speakers_val, ): if not audio: return "Please upload or record audio." @@ -225,6 +232,7 @@ def create_app(): email_to=email_to_val, email_cc=email_cc_val or None, include_summary=(task == "transcript_and_summarize"), + identify_speakers=bool(identify_speakers_val), ) except Exception as e: logger.error("Error enqueuing job: %s", e) @@ -247,6 +255,7 @@ def create_app(): num_speakers_input, email_to, email_cc, + identify_speakers, ], outputs=[status_text], )