Add structured logging for Docker; support LOG_LEVEL env and --log-level
Mirror and run GitLab CI / build (push) Has been cancelled
Ruff / ruff (push) Has been cancelled

This commit is contained in:
admin
2026-06-13 17:46:25 +00:00
parent 47b3304297
commit 2ea46ada42
5 changed files with 140 additions and 9 deletions
+40 -2
View File
@@ -6,8 +6,8 @@ Provides a client to summarize long transcripts via an LLM endpoint.
Behavior:
- Chunks transcript into 10,240-character segments.
- Generates a summary for each chunk.
- Combines all chunk summaries and produces a final, detailed summary.
- Summarizes each chunk.
- Summarizes the summaries into a final, detailed summary.
Environment Variables:
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
@@ -17,10 +17,13 @@ Environment Variables:
import os
import json
import logging
from typing import Optional
import httpx
# Module-level logger with an explicit, hard-coded name, so every record from
# this module is emitted under "scraibe.summarizer".
logger = logging.getLogger("scraibe.summarizer")
class SummarizerError(Exception):
    """Raised when the summarization API call fails.

    The raise sites visible in this module are:

    - the endpoint answers with an HTTP status code of 400 or above
      (the message carries the status code and the response text);
    - the response body cannot be decoded as JSON;
    - the decoded payload does not contain
      ``choices[0]["message"]["content"]`` (the message carries the
      payload, pretty-printed as JSON).
    """
@@ -53,6 +56,12 @@ class SummarizerClient:
"Provide the summarization LLM URL via environment or constructor."
)
logger.info(
"Initializing SummarizerClient: url=%s model=%s",
self.api_url,
self.model,
)
self._client = httpx.Client(
base_url=self.api_url,
timeout=self.timeout,
@@ -84,21 +93,40 @@ class SummarizerClient:
- Next steps / action items
"""
if not transcript.strip():
logger.warning("Empty transcript provided to summarize_transcript.")
return "No transcript provided to summarize."
logger.info(
"Starting summarization for transcript length=%d chars",
len(transcript),
)
# 1) Chunk the transcript
chunks = self._chunk_text(transcript)
logger.info("Split transcript into %d chunks.", len(chunks))
# 2) Summarize each chunk
chunk_summaries = []
for i, chunk in enumerate(chunks):
logger.info(
"Summarizing chunk %d/%d (length=%d)",
i + 1,
len(chunks),
len(chunk),
)
summary = self._summarize_chunk(chunk, i, len(chunks))
chunk_summaries.append(summary)
# 3) Combine and summarize summaries
combined = "\n\n".join(chunk_summaries)
logger.info(
"Combining %d chunk summaries (total length=%d) for final summary.",
len(chunk_summaries),
len(combined),
)
final_summary = self._summarize_combined(combined)
logger.info("Summarization completed.")
return final_summary
def _chunk_text(self, text: str) -> list[str]:
@@ -183,13 +211,18 @@ class SummarizerClient:
if self.api_key:
headers["Authorization"] = f"Bearer {self.api_key}"
logger.info("Calling summarizer endpoint: /v1/chat/completions")
resp = self._client.post(
"/v1/chat/completions",
json=payload,
headers=headers,
)
logger.info("Summarizer response status: %d", resp.status_code)
if resp.status_code >= 400:
logger.error("Summarizer error response: %s", resp.text)
raise SummarizerError(
f"Summarizer API error {resp.status_code}: {resp.text}"
)
@@ -197,6 +230,7 @@ class SummarizerClient:
try:
data = resp.json()
except json.JSONDecodeError:
logger.error("Failed to parse summarizer response as JSON.")
raise SummarizerError(
"Failed to parse summarizer response as JSON."
)
@@ -206,6 +240,10 @@ class SummarizerClient:
content = data["choices"][0]["message"]["content"]
return content.strip()
except (KeyError, IndexError, TypeError):
logger.error(
"Unexpected summarizer response format: %s",
json.dumps(data, indent=2),
)
raise SummarizerError(
"Unexpected summarizer response format: "
f"{json.dumps(data, indent=2)}"