diff --git a/scraibe/autotranscript.py b/scraibe/autotranscript.py index c895af2..40b69ac 100644 --- a/scraibe/autotranscript.py +++ b/scraibe/autotranscript.py @@ -16,12 +16,15 @@ but ignored when not relevant. """ import os +import logging from typing import Union, Optional from .localai_client import LocalAIClient, LocalAIError from .summarizer import SummarizerClient, SummarizerError from .transcript_exporter import Transcript +logger = logging.getLogger("scraibe.autotranscript") + class Scraibe: """ @@ -68,6 +71,8 @@ class Scraibe: """ self.verbose = verbose or kwargs.get("verbose", False) + logger.info("Initializing Scraibe.") + try: self.client = LocalAIClient( api_url=api_url, @@ -75,6 +80,7 @@ class Scraibe: model=model, ) except LocalAIError as e: + logger.error("Failed to initialize LocalAI client: %s", e) raise LocalAIError(f"Failed to initialize LocalAI client: {e}") # Summarizer is lazy-initialized if needed @@ -95,6 +101,7 @@ class Scraibe: if self._summarizer is not None: return self._summarizer + logger.info("Initializing SummarizerClient (lazy).") try: self._summarizer = SummarizerClient( api_url=api_url, @@ -102,6 +109,7 @@ class Scraibe: model=model, ) except SummarizerError as e: + logger.error("Failed to initialize Summarizer client: %s", e) raise SummarizerError(f"Failed to initialize Summarizer client: {e}") return self._summarizer @@ -137,6 +145,7 @@ class Scraibe: ) verbose = kwargs.get("verbose", self.verbose) + logger.info("transcribe called for: %s", audio_file) try: result = self.client.diarize_and_transcribe( @@ -146,10 +155,13 @@ class Scraibe: **kwargs, ) except LocalAIError as e: + logger.error("Error during LocalAI transcription: %s", e) raise LocalAIError(f"Error during LocalAI transcription: {e}") transcripts = result.get("transcripts", []) - return " ".join(t.strip() for t in transcripts if t.strip()) + text = " ".join(t.strip() for t in transcripts if t.strip()) + logger.info("transcribe completed, length=%d chars", len(text)) 
+ return text def transcript_and_summarize( self, @@ -182,6 +194,7 @@ class Scraibe: ) verbose = kwargs.get("verbose", self.verbose) + logger.info("transcript_and_summarize called for: %s", audio_file) # 1) Get diarized + transcribed result try: @@ -192,6 +205,7 @@ class Scraibe: **kwargs, ) except LocalAIError as e: + logger.error("Error during LocalAI transcription: %s", e) raise LocalAIError(f"Error during LocalAI transcription: {e}") segments = result.get("segments", []) @@ -199,6 +213,7 @@ class Scraibe: transcripts = result.get("transcripts", []) if not segments: + logger.warning("No segments returned; returning empty transcript/summary.") return { "transcript": "", "summary": "No transcript content to summarize.", @@ -213,6 +228,7 @@ class Scraibe: lines.append(line) full_transcript = "\n\n".join(lines) + logger.info("Built full transcript, length=%d chars", len(full_transcript)) # 3) Summarize try: @@ -222,13 +238,17 @@ class Scraibe: model=summarizer_model, ) except SummarizerError as e: + logger.error("Failed to initialize summarizer: %s", e) raise SummarizerError(f"Failed to initialize summarizer: {e}") try: summary = summarizer.summarize_transcript(full_transcript) except SummarizerError as e: + logger.error("Error during summarization: %s", e) raise SummarizerError(f"Error during summarization: {e}") + logger.info("transcript_and_summarize completed.") + return { "transcript": full_transcript, "summary": summary, diff --git a/scraibe/cli.py b/scraibe/cli.py index 01b5659..6a918df 100644 --- a/scraibe/cli.py +++ b/scraibe/cli.py @@ -9,9 +9,10 @@ This version is adapted for LocalAI-based transcription and diarization. import os import json +import logging from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from .autotranscript import Scraibe -from .misc import set_threads +from .misc import set_threads, setup_logging def cli(): @@ -20,6 +21,11 @@ def cli(): and diarize audio files via a LocalAI server. 
""" + # Initialize logging (can be overridden via --log-level) + setup_logging(level=os.getenv("LOG_LEVEL", "INFO")) + + logger = logging.getLogger("scraibe.cli") + def str2bool(string): str2val = {"True": True, "False": False} if string in str2val: @@ -181,18 +187,34 @@ def cli(): help="Number of speakers in the audio.", ) + parser.add_argument( + "--log-level", + type=str, + default=None, + choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], + help="Override LOG_LEVEL env var for logging verbosity.", + ) + args = parser.parse_args() + # Apply log-level override if provided + log_level = args.log_level or os.getenv("LOG_LEVEL", "INFO") + setup_logging(level=log_level) + logger.info("CLI starting with log_level=%s", log_level) + arg_dict = vars(args) # configure output out_folder = arg_dict.pop("output_directory") os.makedirs(out_folder, exist_ok=True) + logger.info("Output directory: %s", out_folder) out_format = arg_dict.pop("output_format") - task = arg_dict.pop("task") + logger.info("Task: %s", task) + logger.info("Output format: %s", out_format) + set_threads(arg_dict.pop("num_threads")) # Build kwargs for Scraibe (LocalAI-backed) @@ -208,13 +230,18 @@ def cli(): "verbose": arg_dict.pop("verbose_output"), } + logger.info("LocalAI API URL: %s", class_kwargs["api_url"] or os.getenv("LOCALAI_API_URL", "")) + logger.info("LocalAI Model: %s", class_kwargs["model"] or os.getenv("LOCALAI_MODEL", "")) + model = Scraibe(**class_kwargs) if arg_dict["audio_files"]: audio_files = arg_dict.pop("audio_files") + logger.info("Audio files: %s", audio_files) if task == "transcribe": for audio in audio_files: + logger.info("Starting 'transcribe' for: %s", audio) out = model.transcribe( audio, language=arg_dict.pop("language"), @@ -223,12 +250,14 @@ def cli(): ) basename = audio.split("/")[-1].split(".")[0] path = os.path.join(out_folder, f"{basename}.{out_format}") - print(f"Saving {basename}.{out_format} to {out_folder}") + logger.info("Saving transcript to: %s", path) 
with open(path, "w", encoding="utf-8") as f: f.write(out) + logger.info("Transcript saved: %s", path) elif task == "transcript_and_summarize": for audio in audio_files: + logger.info("Starting 'transcript_and_summarize' for: %s", audio) result = model.transcript_and_summarize( audio, summarizer_api_url=arg_dict.pop("summarizer_api_url"), @@ -246,7 +275,7 @@ def cli(): # Always use .md for transcript_and_summarize md_path = os.path.join(out_folder, f"{basename}.md") - print(f"Saving {basename}.md (transcript + summary) to {out_folder}") + logger.info("Saving transcript + summary to: %s", md_path) with open(md_path, "w", encoding="utf-8") as f: f.write("# Transcript\n\n") @@ -254,5 +283,7 @@ def cli(): f.write("\n\n# Summary\n\n") f.write(summary_text) + logger.info("Transcript + summary saved: %s", md_path) + if __name__ == "__main__": cli() diff --git a/scraibe/localai_client.py b/scraibe/localai_client.py index 7f6d24c..4d10070 100644 --- a/scraibe/localai_client.py +++ b/scraibe/localai_client.py @@ -19,10 +19,13 @@ Environment Variables: import os import io import json +import logging from typing import Dict, List, Any, Optional import httpx +logger = logging.getLogger("scraibe.localai_client") + class LocalAIError(Exception): """Raised when the LocalAI API returns an error or unexpected response.""" @@ -67,6 +70,12 @@ class LocalAIClient: "Provide the LocalAI server URL via environment or constructor." ) + logger.info( + "Initializing LocalAIClient: url=%s model=%s", + self.api_url, + self.model, + ) + self._client = httpx.Client( base_url=self.api_url, timeout=self.timeout, @@ -130,7 +139,8 @@ class LocalAIClient: if verbose: print("Starting diarization and transcription via LocalAI.") - # Defaults: use verbose_json + include_text to get both diarization and transcription. 
+ logger.info("diarize_and_transcribe requested for: %s", audio_path) + if response_format is None: response_format = "verbose_json" if include_text is None: @@ -158,6 +168,8 @@ class LocalAIClient: if min_duration_off is not None: data["min_duration_off"] = str(min_duration_off) + logger.debug("LocalAI request params: %s", data) + # Open file if not os.path.exists(audio_path): raise LocalAIError(f"Audio file not found: {audio_path}") @@ -172,6 +184,7 @@ class LocalAIClient: headers["Authorization"] = f"Bearer {self.api_key}" # POST /v1/audio/diarization + logger.info("Sending request to LocalAI: /v1/audio/diarization") resp = self._client.post( "/v1/audio/diarization", data=data, @@ -179,8 +192,11 @@ class LocalAIClient: headers=headers, ) + logger.info("LocalAI response status: %d", resp.status_code) + if resp.status_code >= 400: body = resp.text + logger.error("LocalAI error response: %s", body) raise LocalAIError( f"LocalAI request failed with status {resp.status_code}: {body}" ) @@ -188,6 +204,7 @@ class LocalAIClient: try: result = resp.json() except json.JSONDecodeError: + logger.error("Failed to parse LocalAI response as JSON.") raise LocalAIError( "Failed to parse LocalAI response as JSON." 
def setup_logging(level: str = "INFO"):
    """
    Configure the root logger to write to stdout so Docker can capture logs.

    Handlers already attached to the root logger are removed and replaced
    (``force=True``), so calling this function more than once simply applies
    the most recent configuration.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            matched case-insensitively. A numeric logging level is accepted
            as well. ``None`` or an unrecognised name falls back to INFO.
    """
    # Function-scope import so no file-level import change is required.
    import sys

    if isinstance(level, int) and not isinstance(level, bool):
        # Caller passed a numeric level such as logging.DEBUG directly.
        numeric_level = level
    else:
        name = "" if level is None else str(level).strip().upper()
        numeric_level = getattr(logging, name, None)
        # getattr may resolve to something that is not a level (a class, a
        # function, a bool flag); only genuine integers are usable here.
        if not isinstance(numeric_level, int) or isinstance(numeric_level, bool):
            numeric_level = logging.INFO

    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S%z",
        # basicConfig's default handler writes to stderr; pass stdout
        # explicitly so the behaviour matches the documented contract.
        stream=sys.stdout,
        force=True,
    )
Environment Variables: - SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080) @@ -17,10 +17,13 @@ Environment Variables: import os import json +import logging from typing import Optional import httpx +logger = logging.getLogger("scraibe.summarizer") + class SummarizerError(Exception): """Raised when the summarization API call fails.""" @@ -53,6 +56,12 @@ class SummarizerClient: "Provide the summarization LLM URL via environment or constructor." ) + logger.info( + "Initializing SummarizerClient: url=%s model=%s", + self.api_url, + self.model, + ) + self._client = httpx.Client( base_url=self.api_url, timeout=self.timeout, @@ -84,21 +93,40 @@ class SummarizerClient: - Next steps / action items """ if not transcript.strip(): + logger.warning("Empty transcript provided to summarize_transcript.") return "No transcript provided to summarize." + logger.info( + "Starting summarization for transcript length=%d chars", + len(transcript), + ) + # 1) Chunk the transcript chunks = self._chunk_text(transcript) + logger.info("Split transcript into %d chunks.", len(chunks)) # 2) Summarize each chunk chunk_summaries = [] for i, chunk in enumerate(chunks): + logger.info( + "Summarizing chunk %d/%d (length=%d)", + i + 1, + len(chunks), + len(chunk), + ) summary = self._summarize_chunk(chunk, i, len(chunks)) chunk_summaries.append(summary) # 3) Combine and summarize summaries combined = "\n\n".join(chunk_summaries) + logger.info( + "Combining %d chunk summaries (total length=%d) for final summary.", + len(chunk_summaries), + len(combined), + ) final_summary = self._summarize_combined(combined) + logger.info("Summarization completed.") return final_summary def _chunk_text(self, text: str) -> list[str]: @@ -183,13 +211,18 @@ class SummarizerClient: if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" + logger.info("Calling summarizer endpoint: /v1/chat/completions") + resp = self._client.post( "/v1/chat/completions", json=payload, 
headers=headers, ) + logger.info("Summarizer response status: %d", resp.status_code) + if resp.status_code >= 400: + logger.error("Summarizer error response: %s", resp.text) raise SummarizerError( f"Summarizer API error {resp.status_code}: {resp.text}" ) @@ -197,6 +230,7 @@ class SummarizerClient: try: data = resp.json() except json.JSONDecodeError: + logger.error("Failed to parse summarizer response as JSON.") raise SummarizerError( "Failed to parse summarizer response as JSON." ) @@ -206,6 +240,10 @@ class SummarizerClient: content = data["choices"][0]["message"]["content"] return content.strip() except (KeyError, IndexError, TypeError): + logger.error( + "Unexpected summarizer response format: %s", + json.dumps(data, indent=2), + ) raise SummarizerError( "Unexpected summarizer response format: " f"{json.dumps(data, indent=2)}"