""" Summarizer Module ----------------- Provides a client to summarize long transcripts via an LLM endpoint. Behavior: - Chunks transcript into 10,240-character segments. - Summarizes each chunk. - Summarizes the summaries into a final, detailed summary. Environment Variables: - SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080) - SUMMARIZER_API_KEY: (optional) API key, if required - SUMMARIZER_MODEL: (optional) Model name (e.g., llama-3.1-8b-instruct) """ import os import json import logging from typing import Optional import httpx logger = logging.getLogger("scraibe.summarizer") class SummarizerError(Exception): """Raised when the summarization API call fails.""" pass class SummarizerClient: """ HTTP client for an OpenAI-compatible chat completions endpoint. Used to summarize long transcripts in chunks. """ CHUNK_SIZE = 10_240 # characters per chunk def __init__( self, api_url: Optional[str] = None, api_key: Optional[str] = None, model: Optional[str] = None, timeout: float = 600.0, ): self.api_url = (api_url or os.getenv("SUMMARIZER_API_URL")).strip().rstrip("/") self.api_key = api_key or os.getenv("SUMMARIZER_API_KEY") or None self.model = model or os.getenv("SUMMARIZER_MODEL") or "llama-3.1-8b-instruct" self.timeout = timeout if not self.api_url: raise SummarizerError( "SUMMARIZER_API_URL is not set. " "Provide the summarization LLM URL via environment or constructor." ) logger.info( "Initializing SummarizerClient: url=%s model=%s", self.api_url, self.model, ) self._client = httpx.Client( base_url=self.api_url, timeout=self.timeout, follow_redirects=True, ) def close(self): self._client.close() def __del__(self): try: self._client.close() except Exception: pass def summarize_transcript(self, transcript: str) -> str: """ Summarize a (possibly very long) transcript. Strategy: - Split transcript into chunks of CHUNK_SIZE characters. - Generate a detailed summary for each chunk. - Combine all chunk summaries and generate a final, concise but thorough summary. The final summary should make it clear: - What was discussed - Main issues - Outcomes / decisions - Next steps / action items """ if not transcript.strip(): logger.warning("Empty transcript provided to summarize_transcript.") return "No transcript provided to summarize." logger.info( "Starting summarization for transcript length=%d chars", len(transcript), ) # 1) Chunk the transcript chunks = self._chunk_text(transcript) logger.info("Split transcript into %d chunks.", len(chunks)) # 2) Summarize each chunk chunk_summaries = [] for i, chunk in enumerate(chunks): logger.info( "Summarizing chunk %d/%d (length=%d)", i + 1, len(chunks), len(chunk), ) summary = self._summarize_chunk(chunk, i, len(chunks)) chunk_summaries.append(summary) # 3) Combine and summarize summaries combined = "\n\n".join(chunk_summaries) logger.info( "Combining %d chunk summaries (total length=%d) for final summary.", len(chunk_summaries), len(combined), ) final_summary = self._summarize_combined(combined) logger.info("Summarization completed.") return final_summary def _chunk_text(self, text: str) -> list[str]: """Split text into chunks of CHUNK_SIZE characters.""" chunks = [] start = 0 while start < len(text): end = start + self.CHUNK_SIZE if end >= len(text): chunks.append(text[start:]) break # Try to break at a reasonable boundary (newline or space) break_pos = text.rfind("\n", start, end) if break_pos == -1: break_pos = text.rfind(" ", start, end) if break_pos == -1 or break_pos <= start: break_pos = end chunks.append(text[start:break_pos].strip()) start = break_pos return chunks def _summarize_chunk(self, chunk: str, index: int, total: int) -> str: system_prompt = ( "You are an expert legal and business meeting summarizer. " "You will receive a segment of a longer transcript. " "Provide a detailed, structured summary of this segment, focusing on: " "- Topics discussed\n" "- Key points and arguments\n" "- Decisions and agreements\n" "- Action items and responsibilities\n" "- Any risks, conflicts, or open issues\n\n" "Be concise but complete. Use bullet points when helpful. " "Do not add information that is not present in the transcript." ) user_prompt = ( f"This is segment {index + 1} of {total} from a longer conversation.\n\n" f"{chunk}" ) return self._chat_completion(system_prompt, user_prompt) def _summarize_combined(self, combined_summaries: str) -> str: system_prompt = ( "You are an expert legal and business meeting summarizer. " "You will receive several intermediate summaries of a longer conversation. " "Produce a single, comprehensive summary that makes it clear: " "- The overall purpose and context of the discussion\n" "- The main issues and topics addressed\n" "- Key arguments and positions (briefly)\n" "- Decisions and outcomes\n" "- Action items, responsibilities, and next steps\n" "- Any unresolved issues or risks\n\n" "The summary should be detailed enough that a reader who was not present " "can understand what happened and what is expected going forward. " "Use clear, concise language and bullet points where appropriate." ) user_prompt = ( "Here are the intermediate summaries from different parts of the same conversation:\n\n" f"{combined_summaries}" ) return self._chat_completion(system_prompt, user_prompt) def _chat_completion(self, system_prompt: str, user_prompt: str) -> str: """ Call OpenAI-compatible /v1/chat/completions endpoint. """ payload = { "model": self.model, "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ], "temperature": 0.3, } headers = { "Content-Type": "application/json", } if self.api_key: headers["Authorization"] = f"Bearer {self.api_key}" logger.info("Calling summarizer endpoint: /v1/chat/completions") resp = self._client.post( "/v1/chat/completions", json=payload, headers=headers, ) logger.info("Summarizer response status: %d", resp.status_code) if resp.status_code >= 400: logger.error("Summarizer error response: %s", resp.text) raise SummarizerError( f"Summarizer API error {resp.status_code}: {resp.text}" ) try: data = resp.json() except json.JSONDecodeError: logger.error("Failed to parse summarizer response as JSON.") raise SummarizerError( "Failed to parse summarizer response as JSON." ) # Extract assistant message try: content = data["choices"][0]["message"]["content"] return content.strip() except (KeyError, IndexError, TypeError): logger.error( "Unexpected summarizer response format: %s", json.dumps(data, indent=2), ) raise SummarizerError( "Unexpected summarizer response format: " f"{json.dumps(data, indent=2)}" )