Use verbose_json diarization, add JSON+TXT email feature
This commit is contained in:
+78
-17
@@ -17,7 +17,7 @@ but ignored when not relevant.
|
||||
|
||||
import os
|
||||
import logging
|
||||
from typing import Union, Optional
|
||||
from typing import Union, Optional, Dict, Any
|
||||
|
||||
from .localai_client import LocalAIClient, LocalAIError
|
||||
from .summarizer import SummarizerClient, SummarizerError
|
||||
@@ -120,21 +120,21 @@ class Scraibe:
|
||||
|
||||
def transcribe(
|
||||
self,
|
||||
audio_file: Union[str],
|
||||
audio_file: str,
|
||||
*,
|
||||
for_export: bool = False,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
) -> Union[str, Dict[str, Any]]:
|
||||
"""
|
||||
Transcribe the provided audio file using LocalAI.
|
||||
|
||||
Uses /v1/audio/diarization with vibevoice.cpp, then concatenates
|
||||
all segment texts.
|
||||
|
||||
Args:
|
||||
audio_file (str): Path to the audio file.
|
||||
**kwargs: Additional keyword arguments (some forwarded, others ignored).
|
||||
|
||||
Uses /v1/audio/diarization with vibevoice.cpp (verbose_json).
|
||||
Returns:
|
||||
str: The concatenated transcribed text.
|
||||
- If for_export=False: plain transcript text (str).
|
||||
- If for_export=True: dict with:
|
||||
- transcript: plain text
|
||||
- segments: list[segment] with speaker labels
|
||||
- raw_result: full verbose_json from LocalAI (if present)
|
||||
"""
|
||||
if isinstance(audio_file, str):
|
||||
if not os.path.exists(audio_file):
|
||||
@@ -152,31 +152,70 @@ class Scraibe:
|
||||
audio_path=audio_file,
|
||||
include_text=True,
|
||||
verbose=verbose,
|
||||
return_raw=True,
|
||||
**kwargs,
|
||||
)
|
||||
except LocalAIError as e:
|
||||
logger.error("Error during LocalAI transcription: %s", e)
|
||||
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
||||
|
||||
segments = result.get("segments", [])
|
||||
speakers = result.get("speakers", [])
|
||||
transcripts = result.get("transcripts", [])
|
||||
text = " ".join(t.strip() for t in transcripts if t.strip())
|
||||
logger.info("transcribe completed, length=%d chars", len(text))
|
||||
return text
|
||||
|
||||
# Build simple transcript text
|
||||
if for_export:
|
||||
# Include speaker-labeled transcript
|
||||
lines = []
|
||||
for seg, speaker, text in zip(segments, speakers, transcripts):
|
||||
start, end = seg
|
||||
ts = self._format_timestamp(start)
|
||||
line = f"[{ts}] {speaker}: {text.strip()}"
|
||||
lines.append(line)
|
||||
full_text = "\n\n".join(lines)
|
||||
else:
|
||||
# Legacy: space-joined text
|
||||
full_text = " ".join(t.strip() for t in transcripts if t.strip())
|
||||
|
||||
logger.info("transcribe completed, length=%d chars", len(full_text))
|
||||
|
||||
if for_export:
|
||||
# Return richer structure for JSON export
|
||||
raw_result = result.get("raw_result")
|
||||
return {
|
||||
"transcript": full_text,
|
||||
"segments": [
|
||||
{
|
||||
"id": i,
|
||||
"speaker": sp,
|
||||
"start": seg[0],
|
||||
"end": seg[1],
|
||||
"text": txt,
|
||||
}
|
||||
for i, (seg, sp, txt) in enumerate(
|
||||
zip(segments, speakers, transcripts)
|
||||
)
|
||||
],
|
||||
"raw_result": raw_result if raw_result is not None else None,
|
||||
}
|
||||
|
||||
return full_text
|
||||
|
||||
def transcript_and_summarize(
|
||||
self,
|
||||
audio_file: Union[str],
|
||||
audio_file: str,
|
||||
*,
|
||||
summarizer_api_url: Optional[str] = None,
|
||||
summarizer_api_key: Optional[str] = None,
|
||||
summarizer_model: Optional[str] = None,
|
||||
for_export: bool = False,
|
||||
**kwargs,
|
||||
) -> dict:
|
||||
"""
|
||||
Transcribe the audio file and generate a detailed summary.
|
||||
|
||||
Steps:
|
||||
- Transcribe via LocalAI.
|
||||
- Transcribe via LocalAI (verbose_json).
|
||||
- Build a plain-text transcript (with speaker labels).
|
||||
- Summarize the transcript using the configured LLM.
|
||||
|
||||
@@ -184,6 +223,8 @@ class Scraibe:
|
||||
dict with:
|
||||
- transcript: full transcript text (with speaker labels)
|
||||
- summary: final detailed summary (markdown-ready)
|
||||
- segments: (if for_export) list[segment] with speaker labels
|
||||
- raw_result: (if for_export) full verbose_json from LocalAI
|
||||
"""
|
||||
if isinstance(audio_file, str):
|
||||
if not os.path.exists(audio_file):
|
||||
@@ -202,6 +243,7 @@ class Scraibe:
|
||||
audio_path=audio_file,
|
||||
include_text=True,
|
||||
verbose=verbose,
|
||||
return_raw=True,
|
||||
**kwargs,
|
||||
)
|
||||
except LocalAIError as e:
|
||||
@@ -249,11 +291,30 @@ class Scraibe:
|
||||
|
||||
logger.info("transcript_and_summarize completed.")
|
||||
|
||||
return {
|
||||
out = {
|
||||
"transcript": full_transcript,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
if for_export:
|
||||
# Add segments and raw_result for JSON export
|
||||
raw_result = result.get("raw_result")
|
||||
out["segments"] = [
|
||||
{
|
||||
"id": i,
|
||||
"speaker": sp,
|
||||
"start": seg[0],
|
||||
"end": seg[1],
|
||||
"text": txt,
|
||||
}
|
||||
for i, (seg, sp, txt) in enumerate(
|
||||
zip(segments, speakers, transcripts)
|
||||
)
|
||||
]
|
||||
out["raw_result"] = raw_result if raw_result is not None else None
|
||||
|
||||
return out
|
||||
|
||||
# -----------------
|
||||
# Helpers
|
||||
# -----------------
|
||||
|
||||
Reference in New Issue
Block a user