Add structured logging for Docker; support LOG_LEVEL env and --log-level
This commit is contained in:
@@ -16,12 +16,15 @@ but ignored when not relevant.
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
from typing import Union, Optional
|
from typing import Union, Optional
|
||||||
|
|
||||||
from .localai_client import LocalAIClient, LocalAIError
|
from .localai_client import LocalAIClient, LocalAIError
|
||||||
from .summarizer import SummarizerClient, SummarizerError
|
from .summarizer import SummarizerClient, SummarizerError
|
||||||
from .transcript_exporter import Transcript
|
from .transcript_exporter import Transcript
|
||||||
|
|
||||||
|
logger = logging.getLogger("scraibe.autotranscript")
|
||||||
|
|
||||||
|
|
||||||
class Scraibe:
|
class Scraibe:
|
||||||
"""
|
"""
|
||||||
@@ -68,6 +71,8 @@ class Scraibe:
|
|||||||
"""
|
"""
|
||||||
self.verbose = verbose or kwargs.get("verbose", False)
|
self.verbose = verbose or kwargs.get("verbose", False)
|
||||||
|
|
||||||
|
logger.info("Initializing Scraibe.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.client = LocalAIClient(
|
self.client = LocalAIClient(
|
||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
@@ -75,6 +80,7 @@ class Scraibe:
|
|||||||
model=model,
|
model=model,
|
||||||
)
|
)
|
||||||
except LocalAIError as e:
|
except LocalAIError as e:
|
||||||
|
logger.error("Failed to initialize LocalAI client: %s", e)
|
||||||
raise LocalAIError(f"Failed to initialize LocalAI client: {e}")
|
raise LocalAIError(f"Failed to initialize LocalAI client: {e}")
|
||||||
|
|
||||||
# Summarizer is lazy-initialized if needed
|
# Summarizer is lazy-initialized if needed
|
||||||
@@ -95,6 +101,7 @@ class Scraibe:
|
|||||||
if self._summarizer is not None:
|
if self._summarizer is not None:
|
||||||
return self._summarizer
|
return self._summarizer
|
||||||
|
|
||||||
|
logger.info("Initializing SummarizerClient (lazy).")
|
||||||
try:
|
try:
|
||||||
self._summarizer = SummarizerClient(
|
self._summarizer = SummarizerClient(
|
||||||
api_url=api_url,
|
api_url=api_url,
|
||||||
@@ -102,6 +109,7 @@ class Scraibe:
|
|||||||
model=model,
|
model=model,
|
||||||
)
|
)
|
||||||
except SummarizerError as e:
|
except SummarizerError as e:
|
||||||
|
logger.error("Failed to initialize Summarizer client: %s", e)
|
||||||
raise SummarizerError(f"Failed to initialize Summarizer client: {e}")
|
raise SummarizerError(f"Failed to initialize Summarizer client: {e}")
|
||||||
|
|
||||||
return self._summarizer
|
return self._summarizer
|
||||||
@@ -137,6 +145,7 @@ class Scraibe:
|
|||||||
)
|
)
|
||||||
|
|
||||||
verbose = kwargs.get("verbose", self.verbose)
|
verbose = kwargs.get("verbose", self.verbose)
|
||||||
|
logger.info("transcribe called for: %s", audio_file)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
result = self.client.diarize_and_transcribe(
|
result = self.client.diarize_and_transcribe(
|
||||||
@@ -146,10 +155,13 @@ class Scraibe:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
except LocalAIError as e:
|
except LocalAIError as e:
|
||||||
|
logger.error("Error during LocalAI transcription: %s", e)
|
||||||
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
||||||
|
|
||||||
transcripts = result.get("transcripts", [])
|
transcripts = result.get("transcripts", [])
|
||||||
return " ".join(t.strip() for t in transcripts if t.strip())
|
text = " ".join(t.strip() for t in transcripts if t.strip())
|
||||||
|
logger.info("transcribe completed, length=%d chars", len(text))
|
||||||
|
return text
|
||||||
|
|
||||||
def transcript_and_summarize(
|
def transcript_and_summarize(
|
||||||
self,
|
self,
|
||||||
@@ -182,6 +194,7 @@ class Scraibe:
|
|||||||
)
|
)
|
||||||
|
|
||||||
verbose = kwargs.get("verbose", self.verbose)
|
verbose = kwargs.get("verbose", self.verbose)
|
||||||
|
logger.info("transcript_and_summarize called for: %s", audio_file)
|
||||||
|
|
||||||
# 1) Get diarized + transcribed result
|
# 1) Get diarized + transcribed result
|
||||||
try:
|
try:
|
||||||
@@ -192,6 +205,7 @@ class Scraibe:
|
|||||||
**kwargs,
|
**kwargs,
|
||||||
)
|
)
|
||||||
except LocalAIError as e:
|
except LocalAIError as e:
|
||||||
|
logger.error("Error during LocalAI transcription: %s", e)
|
||||||
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
raise LocalAIError(f"Error during LocalAI transcription: {e}")
|
||||||
|
|
||||||
segments = result.get("segments", [])
|
segments = result.get("segments", [])
|
||||||
@@ -199,6 +213,7 @@ class Scraibe:
|
|||||||
transcripts = result.get("transcripts", [])
|
transcripts = result.get("transcripts", [])
|
||||||
|
|
||||||
if not segments:
|
if not segments:
|
||||||
|
logger.warning("No segments returned; returning empty transcript/summary.")
|
||||||
return {
|
return {
|
||||||
"transcript": "",
|
"transcript": "",
|
||||||
"summary": "No transcript content to summarize.",
|
"summary": "No transcript content to summarize.",
|
||||||
@@ -213,6 +228,7 @@ class Scraibe:
|
|||||||
lines.append(line)
|
lines.append(line)
|
||||||
|
|
||||||
full_transcript = "\n\n".join(lines)
|
full_transcript = "\n\n".join(lines)
|
||||||
|
logger.info("Built full transcript, length=%d chars", len(full_transcript))
|
||||||
|
|
||||||
# 3) Summarize
|
# 3) Summarize
|
||||||
try:
|
try:
|
||||||
@@ -222,13 +238,17 @@ class Scraibe:
|
|||||||
model=summarizer_model,
|
model=summarizer_model,
|
||||||
)
|
)
|
||||||
except SummarizerError as e:
|
except SummarizerError as e:
|
||||||
|
logger.error("Failed to initialize summarizer: %s", e)
|
||||||
raise SummarizerError(f"Failed to initialize summarizer: {e}")
|
raise SummarizerError(f"Failed to initialize summarizer: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
summary = summarizer.summarize_transcript(full_transcript)
|
summary = summarizer.summarize_transcript(full_transcript)
|
||||||
except SummarizerError as e:
|
except SummarizerError as e:
|
||||||
|
logger.error("Error during summarization: %s", e)
|
||||||
raise SummarizerError(f"Error during summarization: {e}")
|
raise SummarizerError(f"Error during summarization: {e}")
|
||||||
|
|
||||||
|
logger.info("transcript_and_summarize completed.")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"transcript": full_transcript,
|
"transcript": full_transcript,
|
||||||
"summary": summary,
|
"summary": summary,
|
||||||
|
|||||||
+35
-4
@@ -9,9 +9,10 @@ This version is adapted for LocalAI-based transcription and diarization.
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
||||||
from .autotranscript import Scraibe
|
from .autotranscript import Scraibe
|
||||||
from .misc import set_threads
|
from .misc import set_threads, setup_logging
|
||||||
|
|
||||||
|
|
||||||
def cli():
|
def cli():
|
||||||
@@ -20,6 +21,11 @@ def cli():
|
|||||||
and diarize audio files via a LocalAI server.
|
and diarize audio files via a LocalAI server.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# Initialize logging (can be overridden via --log-level)
|
||||||
|
setup_logging(level=os.getenv("LOG_LEVEL", "INFO"))
|
||||||
|
|
||||||
|
logger = logging.getLogger("scraibe.cli")
|
||||||
|
|
||||||
def str2bool(string):
|
def str2bool(string):
|
||||||
str2val = {"True": True, "False": False}
|
str2val = {"True": True, "False": False}
|
||||||
if string in str2val:
|
if string in str2val:
|
||||||
@@ -181,18 +187,34 @@ def cli():
|
|||||||
help="Number of speakers in the audio.",
|
help="Number of speakers in the audio.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
parser.add_argument(
|
||||||
|
"--log-level",
|
||||||
|
type=str,
|
||||||
|
default=None,
|
||||||
|
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
|
||||||
|
help="Override LOG_LEVEL env var for logging verbosity.",
|
||||||
|
)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
|
# Apply log-level override if provided
|
||||||
|
log_level = args.log_level or os.getenv("LOG_LEVEL", "INFO")
|
||||||
|
setup_logging(level=log_level)
|
||||||
|
logger.info("CLI starting with log_level=%s", log_level)
|
||||||
|
|
||||||
arg_dict = vars(args)
|
arg_dict = vars(args)
|
||||||
|
|
||||||
# configure output
|
# configure output
|
||||||
out_folder = arg_dict.pop("output_directory")
|
out_folder = arg_dict.pop("output_directory")
|
||||||
os.makedirs(out_folder, exist_ok=True)
|
os.makedirs(out_folder, exist_ok=True)
|
||||||
|
logger.info("Output directory: %s", out_folder)
|
||||||
|
|
||||||
out_format = arg_dict.pop("output_format")
|
out_format = arg_dict.pop("output_format")
|
||||||
|
|
||||||
task = arg_dict.pop("task")
|
task = arg_dict.pop("task")
|
||||||
|
|
||||||
|
logger.info("Task: %s", task)
|
||||||
|
logger.info("Output format: %s", out_format)
|
||||||
|
|
||||||
set_threads(arg_dict.pop("num_threads"))
|
set_threads(arg_dict.pop("num_threads"))
|
||||||
|
|
||||||
# Build kwargs for Scraibe (LocalAI-backed)
|
# Build kwargs for Scraibe (LocalAI-backed)
|
||||||
@@ -208,13 +230,18 @@ def cli():
|
|||||||
"verbose": arg_dict.pop("verbose_output"),
|
"verbose": arg_dict.pop("verbose_output"),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger.info("LocalAI API URL: %s", class_kwargs["api_url"] or os.getenv("LOCALAI_API_URL", "<not set>"))
|
||||||
|
logger.info("LocalAI Model: %s", class_kwargs["model"] or os.getenv("LOCALAI_MODEL", "<not set>"))
|
||||||
|
|
||||||
model = Scraibe(**class_kwargs)
|
model = Scraibe(**class_kwargs)
|
||||||
|
|
||||||
if arg_dict["audio_files"]:
|
if arg_dict["audio_files"]:
|
||||||
audio_files = arg_dict.pop("audio_files")
|
audio_files = arg_dict.pop("audio_files")
|
||||||
|
logger.info("Audio files: %s", audio_files)
|
||||||
|
|
||||||
if task == "transcribe":
|
if task == "transcribe":
|
||||||
for audio in audio_files:
|
for audio in audio_files:
|
||||||
|
logger.info("Starting 'transcribe' for: %s", audio)
|
||||||
out = model.transcribe(
|
out = model.transcribe(
|
||||||
audio,
|
audio,
|
||||||
language=arg_dict.pop("language"),
|
language=arg_dict.pop("language"),
|
||||||
@@ -223,12 +250,14 @@ def cli():
|
|||||||
)
|
)
|
||||||
basename = audio.split("/")[-1].split(".")[0]
|
basename = audio.split("/")[-1].split(".")[0]
|
||||||
path = os.path.join(out_folder, f"{basename}.{out_format}")
|
path = os.path.join(out_folder, f"{basename}.{out_format}")
|
||||||
print(f"Saving {basename}.{out_format} to {out_folder}")
|
logger.info("Saving transcript to: %s", path)
|
||||||
with open(path, "w", encoding="utf-8") as f:
|
with open(path, "w", encoding="utf-8") as f:
|
||||||
f.write(out)
|
f.write(out)
|
||||||
|
logger.info("Transcript saved: %s", path)
|
||||||
|
|
||||||
elif task == "transcript_and_summarize":
|
elif task == "transcript_and_summarize":
|
||||||
for audio in audio_files:
|
for audio in audio_files:
|
||||||
|
logger.info("Starting 'transcript_and_summarize' for: %s", audio)
|
||||||
result = model.transcript_and_summarize(
|
result = model.transcript_and_summarize(
|
||||||
audio,
|
audio,
|
||||||
summarizer_api_url=arg_dict.pop("summarizer_api_url"),
|
summarizer_api_url=arg_dict.pop("summarizer_api_url"),
|
||||||
@@ -246,7 +275,7 @@ def cli():
|
|||||||
|
|
||||||
# Always use .md for transcript_and_summarize
|
# Always use .md for transcript_and_summarize
|
||||||
md_path = os.path.join(out_folder, f"{basename}.md")
|
md_path = os.path.join(out_folder, f"{basename}.md")
|
||||||
print(f"Saving {basename}.md (transcript + summary) to {out_folder}")
|
logger.info("Saving transcript + summary to: %s", md_path)
|
||||||
|
|
||||||
with open(md_path, "w", encoding="utf-8") as f:
|
with open(md_path, "w", encoding="utf-8") as f:
|
||||||
f.write("# Transcript\n\n")
|
f.write("# Transcript\n\n")
|
||||||
@@ -254,5 +283,7 @@ def cli():
|
|||||||
f.write("\n\n# Summary\n\n")
|
f.write("\n\n# Summary\n\n")
|
||||||
f.write(summary_text)
|
f.write(summary_text)
|
||||||
|
|
||||||
|
logger.info("Transcript + summary saved: %s", md_path)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
cli()
|
cli()
|
||||||
|
|||||||
@@ -19,10 +19,13 @@ Environment Variables:
|
|||||||
import os
|
import os
|
||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from typing import Dict, List, Any, Optional
|
from typing import Dict, List, Any, Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger("scraibe.localai_client")
|
||||||
|
|
||||||
|
|
||||||
class LocalAIError(Exception):
|
class LocalAIError(Exception):
|
||||||
"""Raised when the LocalAI API returns an error or unexpected response."""
|
"""Raised when the LocalAI API returns an error or unexpected response."""
|
||||||
@@ -67,6 +70,12 @@ class LocalAIClient:
|
|||||||
"Provide the LocalAI server URL via environment or constructor."
|
"Provide the LocalAI server URL via environment or constructor."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Initializing LocalAIClient: url=%s model=%s",
|
||||||
|
self.api_url,
|
||||||
|
self.model,
|
||||||
|
)
|
||||||
|
|
||||||
self._client = httpx.Client(
|
self._client = httpx.Client(
|
||||||
base_url=self.api_url,
|
base_url=self.api_url,
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
@@ -130,7 +139,8 @@ class LocalAIClient:
|
|||||||
if verbose:
|
if verbose:
|
||||||
print("Starting diarization and transcription via LocalAI.")
|
print("Starting diarization and transcription via LocalAI.")
|
||||||
|
|
||||||
# Defaults: use verbose_json + include_text to get both diarization and transcription.
|
logger.info("diarize_and_transcribe requested for: %s", audio_path)
|
||||||
|
|
||||||
if response_format is None:
|
if response_format is None:
|
||||||
response_format = "verbose_json"
|
response_format = "verbose_json"
|
||||||
if include_text is None:
|
if include_text is None:
|
||||||
@@ -158,6 +168,8 @@ class LocalAIClient:
|
|||||||
if min_duration_off is not None:
|
if min_duration_off is not None:
|
||||||
data["min_duration_off"] = str(min_duration_off)
|
data["min_duration_off"] = str(min_duration_off)
|
||||||
|
|
||||||
|
logger.debug("LocalAI request params: %s", data)
|
||||||
|
|
||||||
# Open file
|
# Open file
|
||||||
if not os.path.exists(audio_path):
|
if not os.path.exists(audio_path):
|
||||||
raise LocalAIError(f"Audio file not found: {audio_path}")
|
raise LocalAIError(f"Audio file not found: {audio_path}")
|
||||||
@@ -172,6 +184,7 @@ class LocalAIClient:
|
|||||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||||
|
|
||||||
# POST /v1/audio/diarization
|
# POST /v1/audio/diarization
|
||||||
|
logger.info("Sending request to LocalAI: /v1/audio/diarization")
|
||||||
resp = self._client.post(
|
resp = self._client.post(
|
||||||
"/v1/audio/diarization",
|
"/v1/audio/diarization",
|
||||||
data=data,
|
data=data,
|
||||||
@@ -179,8 +192,11 @@ class LocalAIClient:
|
|||||||
headers=headers,
|
headers=headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info("LocalAI response status: %d", resp.status_code)
|
||||||
|
|
||||||
if resp.status_code >= 400:
|
if resp.status_code >= 400:
|
||||||
body = resp.text
|
body = resp.text
|
||||||
|
logger.error("LocalAI error response: %s", body)
|
||||||
raise LocalAIError(
|
raise LocalAIError(
|
||||||
f"LocalAI request failed with status {resp.status_code}: {body}"
|
f"LocalAI request failed with status {resp.status_code}: {body}"
|
||||||
)
|
)
|
||||||
@@ -188,6 +204,7 @@ class LocalAIClient:
|
|||||||
try:
|
try:
|
||||||
result = resp.json()
|
result = resp.json()
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
logger.error("Failed to parse LocalAI response as JSON.")
|
||||||
raise LocalAIError(
|
raise LocalAIError(
|
||||||
"Failed to parse LocalAI response as JSON."
|
"Failed to parse LocalAI response as JSON."
|
||||||
)
|
)
|
||||||
@@ -209,7 +226,7 @@ class LocalAIClient:
|
|||||||
segments = result.get("segments", [])
|
segments = result.get("segments", [])
|
||||||
|
|
||||||
if not segments:
|
if not segments:
|
||||||
# If no segments, return empty but valid structure
|
logger.warning("LocalAI returned no segments.")
|
||||||
return {
|
return {
|
||||||
"segments": [],
|
"segments": [],
|
||||||
"speakers": [],
|
"speakers": [],
|
||||||
@@ -230,6 +247,11 @@ class LocalAIClient:
|
|||||||
out_speakers.append(speaker)
|
out_speakers.append(speaker)
|
||||||
out_transcripts.append(text)
|
out_transcripts.append(text)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Parsed %d segments from LocalAI.",
|
||||||
|
len(out_segments),
|
||||||
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"segments": out_segments,
|
"segments": out_segments,
|
||||||
"speakers": out_speakers,
|
"speakers": out_speakers,
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
import os
|
import os
|
||||||
|
import logging
|
||||||
from argparse import Action
|
from argparse import Action
|
||||||
from ast import literal_eval
|
from ast import literal_eval
|
||||||
|
|
||||||
@@ -13,6 +14,25 @@ PYANNOTE_DEFAULT_PATH = os.path.join(CACHE_DIR, "pyannote")
|
|||||||
PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
|
PYANNOTE_DEFAULT_CONFIG = os.path.join(PYANNOTE_DEFAULT_PATH, "config.yaml")
|
||||||
|
|
||||||
|
|
||||||
|
def setup_logging(level: str = "INFO"):
    """
    Configure the root logger to write to stdout so Docker can capture logs.

    Any handlers already attached to the root logger are replaced
    (``force=True``), so this function may be called more than once, e.g.
    first with the environment default and again after CLI parsing.

    Args:
        level: Logging level name (DEBUG, INFO, WARNING, ERROR, CRITICAL),
            case-insensitive. An integer level is accepted as well.
            Unrecognized values (including None) fall back to INFO.
    """
    # Function-scope import keeps this block self-contained; the rest of the
    # module does not need ``sys``.
    import sys

    if isinstance(level, int) and not isinstance(level, bool):
        numeric_level = level
    else:
        # str() guards against None / non-string input, which would otherwise
        # raise AttributeError on .upper().
        candidate = getattr(logging, str(level).strip().upper(), None)
        # getattr may resolve to a non-level attribute of the logging module
        # (a function or class); only integers are valid levels.
        numeric_level = candidate if isinstance(candidate, int) else logging.INFO

    logging.basicConfig(
        level=numeric_level,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%Y-%m-%dT%H:%M:%S%z",
        # basicConfig defaults to sys.stderr; pass stdout explicitly so the
        # behavior matches the documented contract.
        stream=sys.stdout,
        force=True,
    )
|
||||||
|
|
||||||
|
|
||||||
def set_threads(parse_threads=None, yaml_threads=None):
|
def set_threads(parse_threads=None, yaml_threads=None):
|
||||||
"""
|
"""
|
||||||
Configure number of threads.
|
Configure number of threads.
|
||||||
|
|||||||
+40
-2
@@ -6,8 +6,8 @@ Provides a client to summarize long transcripts via an LLM endpoint.
|
|||||||
|
|
||||||
Behavior:
|
Behavior:
|
||||||
- Chunks transcript into 10,240-character segments.
|
- Chunks transcript into 10,240-character segments.
|
||||||
- Generates a summary for each chunk.
|
- Summarizes each chunk.
|
||||||
- Combines all chunk summaries and produces a final, detailed summary.
|
- Summarizes the summaries into a final, detailed summary.
|
||||||
|
|
||||||
Environment Variables:
|
Environment Variables:
|
||||||
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
|
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
|
||||||
@@ -17,10 +17,13 @@ Environment Variables:
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
|
import logging
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
logger = logging.getLogger("scraibe.summarizer")
|
||||||
|
|
||||||
|
|
||||||
class SummarizerError(Exception):
|
class SummarizerError(Exception):
|
||||||
"""Raised when the summarization API call fails."""
|
"""Raised when the summarization API call fails."""
|
||||||
@@ -53,6 +56,12 @@ class SummarizerClient:
|
|||||||
"Provide the summarization LLM URL via environment or constructor."
|
"Provide the summarization LLM URL via environment or constructor."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Initializing SummarizerClient: url=%s model=%s",
|
||||||
|
self.api_url,
|
||||||
|
self.model,
|
||||||
|
)
|
||||||
|
|
||||||
self._client = httpx.Client(
|
self._client = httpx.Client(
|
||||||
base_url=self.api_url,
|
base_url=self.api_url,
|
||||||
timeout=self.timeout,
|
timeout=self.timeout,
|
||||||
@@ -84,21 +93,40 @@ class SummarizerClient:
|
|||||||
- Next steps / action items
|
- Next steps / action items
|
||||||
"""
|
"""
|
||||||
if not transcript.strip():
|
if not transcript.strip():
|
||||||
|
logger.warning("Empty transcript provided to summarize_transcript.")
|
||||||
return "No transcript provided to summarize."
|
return "No transcript provided to summarize."
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Starting summarization for transcript length=%d chars",
|
||||||
|
len(transcript),
|
||||||
|
)
|
||||||
|
|
||||||
# 1) Chunk the transcript
|
# 1) Chunk the transcript
|
||||||
chunks = self._chunk_text(transcript)
|
chunks = self._chunk_text(transcript)
|
||||||
|
logger.info("Split transcript into %d chunks.", len(chunks))
|
||||||
|
|
||||||
# 2) Summarize each chunk
|
# 2) Summarize each chunk
|
||||||
chunk_summaries = []
|
chunk_summaries = []
|
||||||
for i, chunk in enumerate(chunks):
|
for i, chunk in enumerate(chunks):
|
||||||
|
logger.info(
|
||||||
|
"Summarizing chunk %d/%d (length=%d)",
|
||||||
|
i + 1,
|
||||||
|
len(chunks),
|
||||||
|
len(chunk),
|
||||||
|
)
|
||||||
summary = self._summarize_chunk(chunk, i, len(chunks))
|
summary = self._summarize_chunk(chunk, i, len(chunks))
|
||||||
chunk_summaries.append(summary)
|
chunk_summaries.append(summary)
|
||||||
|
|
||||||
# 3) Combine and summarize summaries
|
# 3) Combine and summarize summaries
|
||||||
combined = "\n\n".join(chunk_summaries)
|
combined = "\n\n".join(chunk_summaries)
|
||||||
|
logger.info(
|
||||||
|
"Combining %d chunk summaries (total length=%d) for final summary.",
|
||||||
|
len(chunk_summaries),
|
||||||
|
len(combined),
|
||||||
|
)
|
||||||
final_summary = self._summarize_combined(combined)
|
final_summary = self._summarize_combined(combined)
|
||||||
|
|
||||||
|
logger.info("Summarization completed.")
|
||||||
return final_summary
|
return final_summary
|
||||||
|
|
||||||
def _chunk_text(self, text: str) -> list[str]:
|
def _chunk_text(self, text: str) -> list[str]:
|
||||||
@@ -183,13 +211,18 @@ class SummarizerClient:
|
|||||||
if self.api_key:
|
if self.api_key:
|
||||||
headers["Authorization"] = f"Bearer {self.api_key}"
|
headers["Authorization"] = f"Bearer {self.api_key}"
|
||||||
|
|
||||||
|
logger.info("Calling summarizer endpoint: /v1/chat/completions")
|
||||||
|
|
||||||
resp = self._client.post(
|
resp = self._client.post(
|
||||||
"/v1/chat/completions",
|
"/v1/chat/completions",
|
||||||
json=payload,
|
json=payload,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
logger.info("Summarizer response status: %d", resp.status_code)
|
||||||
|
|
||||||
if resp.status_code >= 400:
|
if resp.status_code >= 400:
|
||||||
|
logger.error("Summarizer error response: %s", resp.text)
|
||||||
raise SummarizerError(
|
raise SummarizerError(
|
||||||
f"Summarizer API error {resp.status_code}: {resp.text}"
|
f"Summarizer API error {resp.status_code}: {resp.text}"
|
||||||
)
|
)
|
||||||
@@ -197,6 +230,7 @@ class SummarizerClient:
|
|||||||
try:
|
try:
|
||||||
data = resp.json()
|
data = resp.json()
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
|
logger.error("Failed to parse summarizer response as JSON.")
|
||||||
raise SummarizerError(
|
raise SummarizerError(
|
||||||
"Failed to parse summarizer response as JSON."
|
"Failed to parse summarizer response as JSON."
|
||||||
)
|
)
|
||||||
@@ -206,6 +240,10 @@ class SummarizerClient:
|
|||||||
content = data["choices"][0]["message"]["content"]
|
content = data["choices"][0]["message"]["content"]
|
||||||
return content.strip()
|
return content.strip()
|
||||||
except (KeyError, IndexError, TypeError):
|
except (KeyError, IndexError, TypeError):
|
||||||
|
logger.error(
|
||||||
|
"Unexpected summarizer response format: %s",
|
||||||
|
json.dumps(data, indent=2),
|
||||||
|
)
|
||||||
raise SummarizerError(
|
raise SummarizerError(
|
||||||
"Unexpected summarizer response format: "
|
"Unexpected summarizer response format: "
|
||||||
f"{json.dumps(data, indent=2)}"
|
f"{json.dumps(data, indent=2)}"
|
||||||
|
|||||||
Reference in New Issue
Block a user