Use verbose_json diarization, add JSON+TXT email feature
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-14 05:36:45 +00:00
parent f6db48b1d0
commit b9d25a39dd
4 changed files with 421 additions and 54 deletions
+78 -17
View File
@@ -17,7 +17,7 @@ but ignored when not relevant.
import os
import logging
from typing import Union, Optional
from typing import Union, Optional, Dict, Any
from .localai_client import LocalAIClient, LocalAIError
from .summarizer import SummarizerClient, SummarizerError
@@ -120,21 +120,21 @@ class Scraibe:
def transcribe(
self,
audio_file: Union[str],
audio_file: str,
*,
for_export: bool = False,
**kwargs,
) -> str:
) -> Union[str, Dict[str, Any]]:
"""
Transcribe the provided audio file using LocalAI.
Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
all segment texts.
Args:
audio_file (str): Path to the audio file.
**kwargs: Additional keyword arguments (some forwarded, others ignored).
Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
Returns:
str: The concatenated transcribed text.
- If for_export=False: plain transcript text (str).
- If for_export=True: dict with:
- transcript: plain text
- segments: list[segment] with speaker labels
- raw_result: full verbose_json from LocalAI (if present)
"""
if isinstance(audio_file, str):
if not os.path.exists(audio_file):
@@ -152,31 +152,70 @@ class Scraibe:
audio_path=audio_file,
include_text=True,
verbose=verbose,
return_raw=True,
**kwargs,
)
except LocalAIError as e:
logger.error("Error during LocalAI transcription: %s", e)
raise LocalAIError(f"Error during LocalAI transcription: {e}")
segments = result.get("segments", [])
speakers = result.get("speakers", [])
transcripts = result.get("transcripts", [])
text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(text))
return text
# Build simple transcript text
if for_export:
# Include speaker-labeled transcript
lines = []
for seg, speaker, text in zip(segments, speakers, transcripts):
start, end = seg
ts = self._format_timestamp(start)
line = f"[{ts}] {speaker}: {text.strip()}"
lines.append(line)
full_text = "\n\n".join(lines)
else:
# Legacy: space-joined text
full_text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(full_text))
if for_export:
# Return richer structure for JSON export
raw_result = result.get("raw_result")
return {
"transcript": full_text,
"segments": [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
],
"raw_result": raw_result if raw_result is not None else None,
}
return full_text
def transcript_and_summarize(
self,
audio_file: Union[str],
audio_file: str,
*,
summarizer_api_url: Optional[str] = None,
summarizer_api_key: Optional[str] = None,
summarizer_model: Optional[str] = None,
for_export: bool = False,
**kwargs,
) -> dict:
"""
Transcribe the audio file and generate a detailed summary.
Steps:
- Transcribe via LocalAI.
- Transcribe via LocalAI (verbose_json).
- Build a plain-text transcript (with speaker labels).
- Summarize the transcript using the configured LLM.
@@ -184,6 +223,8 @@ class Scraibe:
dict with:
- transcript: full transcript text (with speaker labels)
- summary: final detailed summary (markdown-ready)
- segments: (if for_export) list[segment] with speaker labels
- raw_result: (if for_export) full verbose_json from LocalAI
"""
if isinstance(audio_file, str):
if not os.path.exists(audio_file):
@@ -202,6 +243,7 @@ class Scraibe:
audio_path=audio_file,
include_text=True,
verbose=verbose,
return_raw=True,
**kwargs,
)
except LocalAIError as e:
@@ -249,11 +291,30 @@ class Scraibe:
logger.info("transcript_and_summarize completed.")
return {
out = {
"transcript": full_transcript,
"summary": summary,
}
if for_export:
# Add segments and raw_result for JSON export
raw_result = result.get("raw_result")
out["segments"] = [
{
"id": i,
"speaker": sp,
"start": seg[0],
"end": seg[1],
"text": txt,
}
for i, (seg, sp, txt) in enumerate(
zip(segments, speakers, transcripts)
)
]
out["raw_result"] = raw_result if raw_result is not None else None
return out
# -----------------
# Helpers
# -----------------