Add structured logging for Docker; support LOG_LEVEL env and --log-level
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-13 17:46:25 +00:00
parent 47b3304297
commit 2ea46ada42
5 changed files with 140 additions and 9 deletions
+21 -1
View File
@@ -16,12 +16,15 @@ but ignored when not relevant.
""" """
import os import os
import logging
from typing import Union, Optional from typing import Union, Optional
from .localai_client import LocalAIClient, LocalAIError from .localai_client import LocalAIClient, LocalAIError
from .summarizer import SummarizerClient, SummarizerError from .summarizer import SummarizerClient, SummarizerError
from .transcript_exporter import Transcript from .transcript_exporter import Transcript
logger = logging.getLogger("scraibe.autotranscript")
class Scraibe: class Scraibe:
""" """
@@ -68,6 +71,8 @@ class Scraibe:
""" """
self.verbose = verbose or kwargs.get("verbose", False) self.verbose = verbose or kwargs.get("verbose", False)
logger.info("Initializing Scraibe.")
try: try:
self.client = LocalAIClient( self.client = LocalAIClient(
api_url=api_url, api_url=api_url,
@@ -75,6 +80,7 @@ class Scraibe:
model=model, model=model,
) )
except LocalAIError as e: except LocalAIError as e:
logger.error("Failed to initialize LocalAI client: %s", e)
raise LocalAIError(f"Failed to initialize LocalAI client: {e}") raise LocalAIError(f"Failed to initialize LocalAI client: {e}")
# Summarizer is lazy-initialized if needed # Summarizer is lazy-initialized if needed
@@ -95,6 +101,7 @@ class Scraibe:
if self._summarizer is not None: if self._summarizer is not None:
return self._summarizer return self._summarizer
logger.info("Initializing SummarizerClient (lazy).")
try: try:
self._summarizer = SummarizerClient( self._summarizer = SummarizerClient(
api_url=api_url, api_url=api_url,
@@ -102,6 +109,7 @@ class Scraibe:
model=model, model=model,
) )
except SummarizerError as e: except SummarizerError as e:
logger.error("Failed to initialize Summarizer client: %s", e)
raise SummarizerError(f"Failed to initialize Summarizer client: {e}") raise SummarizerError(f"Failed to initialize Summarizer client: {e}")
return self._summarizer return self._summarizer
@@ -137,6 +145,7 @@ class Scraibe:
) )
verbose = kwargs.get("verbose", self.verbose) verbose = kwargs.get("verbose", self.verbose)
logger.info("transcribe called for: %s", audio_file)
try: try:
result = self.client.diarize_and_transcribe( result = self.client.diarize_and_transcribe(
@@ -146,10 +155,13 @@ class Scraibe:
**kwargs, **kwargs,
) )
except LocalAIError as e: except LocalAIError as e:
logger.error("Error during LocalAI transcription: %s", e)
raise LocalAIError(f"Error during LocalAI transcription: {e}") raise LocalAIError(f"Error during LocalAI transcription: {e}")
transcripts = result.get("transcripts", []) transcripts = result.get("transcripts", [])
return " ".join(t.strip() for t in transcripts if t.strip()) text = " ".join(t.strip() for t in transcripts if t.strip())
logger.info("transcribe completed, length=%d chars", len(text))
return text
def transcript_and_summarize( def transcript_and_summarize(
self, self,
@@ -182,6 +194,7 @@ class Scraibe:
) )
verbose = kwargs.get("verbose", self.verbose) verbose = kwargs.get("verbose", self.verbose)
logger.info("transcript_and_summarize called for: %s", audio_file)
# 1) Get diarized + transcribed result # 1) Get diarized + transcribed result
try: try:
@@ -192,6 +205,7 @@ class Scraibe:
**kwargs, **kwargs,
) )
except LocalAIError as e: except LocalAIError as e:
logger.error("Error during LocalAI transcription: %s", e)
raise LocalAIError(f"Error during LocalAI transcription: {e}") raise LocalAIError(f"Error during LocalAI transcription: {e}")
segments = result.get("segments", []) segments = result.get("segments", [])
@@ -199,6 +213,7 @@ class Scraibe:
transcripts = result.get("transcripts", []) transcripts = result.get("transcripts", [])
if not segments: if not segments:
logger.warning("No segments returned; returning empty transcript/summary.")
return { return {
"transcript": "", "transcript": "",
"summary": "No transcript content to summarize.", "summary": "No transcript content to summarize.",
@@ -213,6 +228,7 @@ class Scraibe:
lines.append(line) lines.append(line)
full_transcript = "\n\n".join(lines) full_transcript = "\n\n".join(lines)
logger.info("Built full transcript, length=%d chars", len(full_transcript))
# 3) Summarize # 3) Summarize
try: try:
@@ -222,13 +238,17 @@ class Scraibe:
model=summarizer_model, model=summarizer_model,
) )
except SummarizerError as e: except SummarizerError as e:
logger.error("Failed to initialize summarizer: %s", e)
raise SummarizerError(f"Failed to initialize summarizer: {e}") raise SummarizerError(f"Failed to initialize summarizer: {e}")
try: try:
summary = summarizer.summarize_transcript(full_transcript) summary = summarizer.summarize_transcript(full_transcript)
except SummarizerError as e: except SummarizerError as e:
logger.error("Error during summarization: %s", e)
raise SummarizerError(f"Error during summarization: {e}") raise SummarizerError(f"Error during summarization: {e}")
logger.info("transcript_and_summarize completed.")
return { return {
"transcript": full_transcript, "transcript": full_transcript,
"summary": summary, "summary": summary,
+35 -4
View File
@@ -9,9 +9,10 @@ This version is adapted for LocalAI-based transcription and diarization.
import os import os
import json import json
import logging
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from .autotranscript import Scraibe from .autotranscript import Scraibe
from .misc import set_threads from .misc import set_threads, setup_logging
def cli(): def cli():
@@ -20,6 +21,11 @@ def cli():
and diarize audio files via a LocalAI server. and diarize audio files via a LocalAI server.
""" """
# Initialize logging (can be overridden via --log-level)
setup_logging(level=os.getenv("LOG_LEVEL", "INFO"))
logger = logging.getLogger("scraibe.cli")
def str2bool(string): def str2bool(string):
str2val = {"True": True, "False": False} str2val = {"True": True, "False": False}
if string in str2val: if string in str2val:
@@ -181,18 +187,34 @@ def cli():
help="Number of speakers in the audio.", help="Number of speakers in the audio.",
) )
parser.add_argument(
"--log-level",
type=str,
default=None,
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
help="Override LOG_LEVEL env var for logging verbosity.",
)
args = parser.parse_args() args = parser.parse_args()
# Apply log-level override if provided
log_level = args.log_level or os.getenv("LOG_LEVEL", "INFO")
setup_logging(level=log_level)
logger.info("CLI starting with log_level=%s", log_level)
arg_dict = vars(args) arg_dict = vars(args)
# configure output # configure output
out_folder = arg_dict.pop("output_directory") out_folder = arg_dict.pop("output_directory")
os.makedirs(out_folder, exist_ok=True) os.makedirs(out_folder, exist_ok=True)
logger.info("Output directory: %s", out_folder)
out_format = arg_dict.pop("output_format") out_format = arg_dict.pop("output_format")
task = arg_dict.pop("task") task = arg_dict.pop("task")
logger.info("Task: %s", task)
logger.info("Output format: %s", out_format)
set_threads(arg_dict.pop("num_threads")) set_threads(arg_dict.pop("num_threads"))
# Build kwargs for Scraibe (LocalAI-backed) # Build kwargs for Scraibe (LocalAI-backed)
@@ -208,13 +230,18 @@ def cli():
"verbose": arg_dict.pop("verbose_output"), "verbose": arg_dict.pop("verbose_output"),
} }
logger.info("LocalAI API URL: %s", class_kwargs["api_url"] or os.getenv("LOCALAI_API_URL", "<not set>"))
logger.info("LocalAI Model: %s", class_kwargs["model"] or os.getenv("LOCALAI_MODEL", "<not set>"))
model = Scraibe(**class_kwargs) model = Scraibe(**class_kwargs)
if arg_dict["audio_files"]: if arg_dict["audio_files"]:
audio_files = arg_dict.pop("audio_files") audio_files = arg_dict.pop("audio_files")
logger.info("Audio files: %s", audio_files)
if task == "transcribe": if task == "transcribe":
for audio in audio_files: for audio in audio_files:
logger.info("Starting 'transcribe' for: %s", audio)
out = model.transcribe( out = model.transcribe(
audio, audio,
language=arg_dict.pop("language"), language=arg_dict.pop("language"),
@@ -223,12 +250,14 @@ def cli():
) )
basename = audio.split("/")[-1].split(".")[0] basename = audio.split("/")[-1].split(".")[0]
path = os.path.join(out_folder, f"{basename}.{out_format}") path = os.path.join(out_folder, f"{basename}.{out_format}")
print(f"Saving {basename}.{out_format} to {out_folder}") logger.info("Saving transcript to: %s", path)
with open(path, "w", encoding="utf-8") as f: with open(path, "w", encoding="utf-8") as f:
f.write(out) f.write(out)
logger.info("Transcript saved: %s", path)
elif task == "transcript_and_summarize": elif task == "transcript_and_summarize":
for audio in audio_files: for audio in audio_files:
logger.info("Starting 'transcript_and_summarize' for: %s", audio)
result = model.transcript_and_summarize( result = model.transcript_and_summarize(
audio, audio,
summarizer_api_url=arg_dict.pop("summarizer_api_url"), summarizer_api_url=arg_dict.pop("summarizer_api_url"),
@@ -246,7 +275,7 @@ def cli():
# Always use .md for transcript_and_summarize # Always use .md for transcript_and_summarize
md_path = os.path.join(out_folder, f"{basename}.md") md_path = os.path.join(out_folder, f"{basename}.md")
print(f"Saving {basename}.md (transcript + summary) to {out_folder}") logger.info("Saving transcript + summary to: %s", md_path)
with open(md_path, "w", encoding="utf-8") as f: with open(md_path, "w", encoding="utf-8") as f:
f.write("# Transcript\n\n") f.write("# Transcript\n\n")
@@ -254,5 +283,7 @@ def cli():
f.write("\n\n# Summary\n\n") f.write("\n\n# Summary\n\n")
f.write(summary_text) f.write(summary_text)
logger.info("Transcript + summary saved: %s", md_path)
if __name__ == "__main__": if __name__ == "__main__":
cli() cli()
+24 -2
View File
@@ -19,10 +19,13 @@ Environment Variables:
import os import os
import io import io
import json import json
import logging
from typing import Dict, List, Any, Optional from typing import Dict, List, Any, Optional
import httpx import httpx
logger = logging.getLogger("scraibe.localai_client")
class LocalAIError(Exception): class LocalAIError(Exception):
"""Raised when the LocalAI API returns an error or unexpected response.""" """Raised when the LocalAI API returns an error or unexpected response."""
@@ -67,6 +70,12 @@ class LocalAIClient:
"Provide the LocalAI server URL via environment or constructor." "Provide the LocalAI server URL via environment or constructor."
) )
logger.info(
"Initializing LocalAIClient: url=%s model=%s",
self.api_url,
self.model,
)
self._client = httpx.Client( self._client = httpx.Client(
base_url=self.api_url, base_url=self.api_url,
timeout=self.timeout, timeout=self.timeout,
@@ -130,7 +139,8 @@ class LocalAIClient:
if verbose: if verbose:
print("Starting diarization and transcription via LocalAI.") print("Starting diarization and transcription via LocalAI.")
# Defaults: use verbose_json + include_text to get both diarization and transcription. logger.info("diarize_and_transcribe requested for: %s", audio_path)
if response_format is None: if response_format is None:
response_format = "verbose_json" response_format = "verbose_json"
if include_text is None: if include_text is None:
@@ -158,6 +168,8 @@ class LocalAIClient:
if min_duration_off is not None: if min_duration_off is not None:
data["min_duration_off"] = str(min_duration_off) data["min_duration_off"] = str(min_duration_off)
logger.debug("LocalAI request params: %s", data)
# Open file # Open file
if not os.path.exists(audio_path): if not os.path.exists(audio_path):
raise LocalAIError(f"Audio file not found: {audio_path}") raise LocalAIError(f"Audio file not found: {audio_path}")
@@ -172,6 +184,7 @@ class LocalAIClient:
headers["Authorization"] = f"Bearer {self.api_key}" headers["Authorization"] = f"Bearer {self.api_key}"
# POST /v1/audio/diarization # POST /v1/audio/diarization
logger.info("Sending request to LocalAI: /v1/audio/diarization")
resp = self._client.post( resp = self._client.post(
"/v1/audio/diarization", "/v1/audio/diarization",
data=data, data=data,
@@ -179,8 +192,11 @@ class LocalAIClient:
headers=headers, headers=headers,
) )
logger.info("LocalAI response status: %d", resp.status_code)
if resp.status_code >= 400: if resp.status_code >= 400:
body = resp.text body = resp.text
logger.error("LocalAI error response: %s", body)
raise LocalAIError( raise LocalAIError(
f"LocalAI request failed with status {resp.status_code}: {body}" f"LocalAI request failed with status {resp.status_code}: {body}"
) )
@@ -188,6 +204,7 @@ class LocalAIClient:
try: try:
result = resp.json() result = resp.json()
except json.JSONDecodeError: except json.JSONDecodeError:
logger.error("Failed to parse LocalAI response as JSON.")
raise LocalAIError( raise LocalAIError(
"Failed to parse LocalAI response as JSON." "Failed to parse LocalAI response as JSON."
) )
@@ -209,7 +226,7 @@ class LocalAIClient:
segments = result.get("segments", []) segments = result.get("segments", [])
if not segments: if not segments:
# If no segments, return empty but valid structure logger.warning("LocalAI returned no segments.")
return { return {
"segments": [], "segments": [],
"speakers": [], "speakers": [],
@@ -230,6 +247,11 @@ class LocalAIClient:
out_speakers.append(speaker) out_speakers.append(speaker)
out_transcripts.append(text) out_transcripts.append(text)
logger.info(
"Parsed %d segments from LocalAI.",
len(out_segments),
)
return { return {
"segments": out_segments, "segments": out_segments,
"speakers": out_speakers, "speakers": out_speakers,
+20
View File
@@ -1,4 +1,5 @@
import os import os
import logging
from argparse import Action from argparse import Action
from ast import literal_eval from ast import literal_eval
@@ -13,6 +14,25 @@ PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml") PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
def setup_logging(level: str = "INFO"):
    """
    Configure the root logger to write to stdout so Docker can capture logs.

    Existing root handlers are replaced (``force=True``), so calling this
    function again re-applies the configuration with the new level.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL).
            Matching is case-insensitive and ignores surrounding whitespace.
            A numeric ``logging`` level (e.g. ``logging.DEBUG``) is accepted
            as well. Any unrecognised value, including ``None``, falls back
            to INFO instead of raising.
    """
    # Local import so this function does not depend on a module-level
    # ``import sys`` being present in the file.
    import sys

    numeric_level = logging.INFO
    if isinstance(level, str):
        # Resolve the name against the logging module's level constants.
        candidate = getattr(logging, level.strip().upper(), None)
        # Guard against non-level attributes that happen to share the name.
        if isinstance(candidate, int) and not isinstance(candidate, bool):
            numeric_level = candidate
    elif isinstance(level, int) and not isinstance(level, bool):
        numeric_level = level

    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S%z",
        # basicConfig defaults to sys.stderr; the documented contract is stdout.
        stream=sys.stdout,
        force=True,
    )
def set_threads(parse_threads=None, yaml_threads=None): def set_threads(parse_threads=None, yaml_threads=None):
""" """
Configure number of threads. Configure number of threads.
+40 -2
View File
@@ -6,8 +6,8 @@ Provides a client to summarize long transcripts via an LLM endpoint.
Behavior: Behavior:
- Chunks transcript into 10,240-character segments. - Chunks transcript into 10,240-character segments.
- Generates a summary for each chunk. - Summarizes each chunk.
- Combines all chunk summaries and produces a final, detailed summary. - Summarizes the summaries into a final, detailed summary.
Environment Variables: Environment Variables:
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080) - SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
@@ -17,10 +17,13 @@ Environment Variables:
import os import os
import json import json
import logging
from typing import Optional from typing import Optional
import httpx import httpx
logger = logging.getLogger("scraibe.summarizer")
class SummarizerError(Exception): class SummarizerError(Exception):
"""Raised when the summarization API call fails.""" """Raised when the summarization API call fails."""
@@ -53,6 +56,12 @@ class SummarizerClient:
"Provide the summarization LLM URL via environment or constructor." "Provide the summarization LLM URL via environment or constructor."
) )
logger.info(
"Initializing SummarizerClient: url=%s model=%s",
self.api_url,
self.model,
)
self._client = httpx.Client( self._client = httpx.Client(
base_url=self.api_url, base_url=self.api_url,
timeout=self.timeout, timeout=self.timeout,
@@ -84,21 +93,40 @@ class SummarizerClient:
- Next steps / action items - Next steps / action items
""" """
if not transcript.strip(): if not transcript.strip():
logger.warning("Empty transcript provided to summarize_transcript.")
return "No transcript provided to summarize." return "No transcript provided to summarize."
logger.info(
"Starting summarization for transcript length=%d chars",
len(transcript),
)
# 1) Chunk the transcript # 1) Chunk the transcript
chunks = self._chunk_text(transcript) chunks = self._chunk_text(transcript)
logger.info("Split transcript into %d chunks.", len(chunks))
# 2) Summarize each chunk # 2) Summarize each chunk
chunk_summaries = [] chunk_summaries = []
for i, chunk in enumerate(chunks): for i, chunk in enumerate(chunks):
logger.info(
"Summarizing chunk %d/%d (length=%d)",
i + 1,
len(chunks),
len(chunk),
)
summary = self._summarize_chunk(chunk, i, len(chunks)) summary = self._summarize_chunk(chunk, i, len(chunks))
chunk_summaries.append(summary) chunk_summaries.append(summary)
# 3) Combine and summarize summaries # 3) Combine and summarize summaries
combined = "\n\n".join(chunk_summaries) combined = "\n\n".join(chunk_summaries)
logger.info(
"Combining %d chunk summaries (total length=%d) for final summary.",
len(chunk_summaries),
len(combined),
)
final_summary = self._summarize_combined(combined) final_summary = self._summarize_combined(combined)
logger.info("Summarization completed.")
return final_summary return final_summary
def _chunk_text(self, text: str) -> list[str]: def _chunk_text(self, text: str) -> list[str]:
@@ -183,13 +211,18 @@ class SummarizerClient:
if self.api_key: if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}" headers["Authorization"] = f"Bearer {self.api_key}"
logger.info("Calling summarizer endpoint: /v1/chat/completions")
resp = self._client.post( resp = self._client.post(
"/v1/chat/completions", "/v1/chat/completions",
json=payload, json=payload,
headers=headers, headers=headers,
) )
logger.info("Summarizer response status: %d", resp.status_code)
if resp.status_code >= 400: if resp.status_code >= 400:
logger.error("Summarizer error response: %s", resp.text)
raise SummarizerError( raise SummarizerError(
f"Summarizer API error {resp.status_code}: {resp.text}" f"Summarizer API error {resp.status_code}: {resp.text}"
) )
@@ -197,6 +230,7 @@ class SummarizerClient:
try: try:
data = resp.json() data = resp.json()
except json.JSONDecodeError: except json.JSONDecodeError:
logger.error("Failed to parse summarizer response as JSON.")
raise SummarizerError( raise SummarizerError(
"Failed to parse summarizer response as JSON." "Failed to parse summarizer response as JSON."
) )
@@ -206,6 +240,10 @@ class SummarizerClient:
content = data["choices"][0]["message"]["content"] content = data["choices"][0]["message"]["content"]
return content.strip() return content.strip()
except (KeyError, IndexError, TypeError): except (KeyError, IndexError, TypeError):
logger.error(
"Unexpected summarizer response format: %s",
json.dumps(data, indent=2),
)
raise SummarizerError( raise SummarizerError(
"Unexpected summarizer response format: " "Unexpected summarizer response format: "
f"{json.dumps(data, indent=2)}" f"{json.dumps(data, indent=2)}"