# scribe/scraibe/summarizer.py

"""
Summarizer Module
-----------------

Provides a client to summarize long transcripts via an LLM endpoint.

Behavior:
- Chunks transcript into segments of up to 10,240 characters, preferring newline or space boundaries.
- Summarizes each chunk.
- Summarizes the summaries into a final, detailed summary.

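Environment Variables:
- SUMMARIZER_API_URL: (required) Base URL of the LLM API (e.g., http://localhost:8080)
- SUMMARIZER_API_KEY: (optional) API key, sent as a Bearer token when set
- SUMMARIZER_MODEL:   (optional) Model name (default: llama-3.1-8b-instruct)
- SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED: (optional) Override the system prompt of a stage
- SUMMARY_PROMPT_FILE: (optional) Path to a prompt file with [chunk] / [combined] sections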
"""

import os
import json
import logging
from typing import Optional

import httpx

# Module-level logger, registered under the fixed name "scraibe.summarizer".
logger = logging.getLogger("scraibe.summarizer")


class SummarizerError(Exception):
    """Error raised by the summarizer for configuration or API failures."""


class SummarizerClient:
    """
    HTTP client for an OpenAI-compatible chat completions endpoint.
    Used to summarize long transcripts in chunks.
    """

    CHUNK_SIZE = 10_240  # characters per chunk

    def __init__(
        self,
        api_url: Optional[str] = None,
        api_key: Optional[str] = None,
        model: Optional[str] = None,
        timeout: float = 3600.0,
    ):
        """
        Resolve the client configuration and create the HTTP client.

        Every setting falls back to its environment variable when the
        corresponding argument is empty or ``None``.

        Args:
            api_url: Base URL of the LLM API; falls back to SUMMARIZER_API_URL.
                Surrounding whitespace and trailing slashes are removed.
            api_key: Optional bearer token; falls back to SUMMARIZER_API_KEY.
            model: Model name; falls back to SUMMARIZER_MODEL and then to
                "llama-3.1-8b-instruct".
            timeout: Timeout value handed to ``httpx.Client``.

        Raises:
            SummarizerError: If no API URL is available from either source.
        """
        # Fall back to "" so that a missing URL reaches the explicit check
        # below instead of failing with AttributeError on ``None.strip()``.
        raw_url = api_url or os.getenv("SUMMARIZER_API_URL") or ""
        self.api_url = raw_url.strip().rstrip("/")
        self.api_key = api_key or os.getenv("SUMMARIZER_API_KEY") or None
        self.model = model or os.getenv("SUMMARIZER_MODEL") or "llama-3.1-8b-instruct"
        self.timeout = timeout

        if not self.api_url:
            raise SummarizerError(
                "SUMMARIZER_API_URL is not set. "
                "Provide the summarization LLM URL via environment or constructor."
            )

        logger.info(
            "Initializing SummarizerClient: url=%s model=%s",
            self.api_url,
            self.model,
        )

        self._client = httpx.Client(
            base_url=self.api_url,
            timeout=self.timeout,
            follow_redirects=True,
        )

    def close(self):
        self._client.close()

    def __del__(self):
        try:
            self._client.close()
        except Exception:
            pass

    def summarize_transcript(self, transcript: str) -> str:
        """
        Produce a final summary for a transcript of any length.

        The transcript is split with ``_chunk_text``, each chunk is summarized
        on its own, and the chunk summaries are joined with blank lines and
        summarized once more into the returned text.

        Args:
            transcript: Full transcript text.

        Returns:
            The final summary, or a fixed placeholder message when the
            transcript is empty or contains only whitespace.
        """
        # Guard clause: nothing to send to the model.
        if transcript.strip() == "":
            logger.warning("Empty transcript provided to summarize_transcript.")
            return "No transcript provided to summarize."

        logger.info(
            "Starting summarization for transcript length=%d chars",
            len(transcript),
        )

        # Stage 1: split the transcript.
        segments = self._chunk_text(transcript)
        segment_count = len(segments)
        logger.info("Split transcript into %d chunks.", segment_count)

        # Stage 2: one summary per segment, in order.
        partial_summaries = []
        for number, segment in enumerate(segments, start=1):
            logger.info(
                "Summarizing chunk %d/%d (length=%d)",
                number,
                segment_count,
                len(segment),
            )
            partial_summaries.append(
                self._summarize_chunk(segment, number - 1, segment_count)
            )

        # Stage 3: merge the partial summaries and summarize them once more.
        merged = "\n\n".join(partial_summaries)
        logger.info(
            "Combining %d chunk summaries (total length=%d) for final summary.",
            len(partial_summaries),
            len(merged),
        )
        result = self._summarize_combined(merged)

        logger.info("Summarization completed.")
        return result

    def _chunk_text(self, text: str) -> list[str]:
        """
        Split text into stripped, non-empty chunks of at most CHUNK_SIZE characters.

        Each cut is placed at the last newline inside the current window,
        else at the last space, else exactly at the window end (hard cut).

        Args:
            text: Text to split.

        Returns:
            The chunks in order. Chunks that are empty after stripping are
            dropped, so the list is empty for empty or whitespace-only text.

        Raises:
            ValueError: If CHUNK_SIZE is not positive (the loop could not
                make progress).
        """
        size = self.CHUNK_SIZE
        if size <= 0:
            raise ValueError("CHUNK_SIZE must be a positive integer")

        chunks: list[str] = []
        length = len(text)
        start = 0
        while start < length:
            end = start + size
            if end >= length:
                # Remaining text fits in one window.
                break_pos = length
            else:
                # Search strictly after `start`: the separator chosen for the
                # previous cut sits at text[start] and must not be matched
                # again, otherwise the space fallback would be skipped and
                # the chunk cut mid-word.
                break_pos = text.rfind("\n", start + 1, end)
                if break_pos == -1:
                    break_pos = text.rfind(" ", start + 1, end)
                if break_pos == -1:
                    break_pos = end
            piece = text[start:break_pos].strip()
            if piece:
                chunks.append(piece)
            # break_pos > start always holds here, so the loop terminates.
            start = break_pos
        return chunks

    def _load_summary_prompt(self, role: str) -> str:
        """
        Load summary prompt for the given role: 'chunk' or 'combined'.

        Priority:
        1) SUMMARY_PROMPT_{ROLE} (env)
        2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
        3) Built-in default prompt
        """
        role_upper = role.upper()

        # 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
        env_key = f"SUMMARY_PROMPT_{role_upper}"
        env_prompt = (os.getenv(env_key) or "").strip()
        if env_prompt:
            return env_prompt

        # 2) File-based prompt with sections
        prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
        if prompt_file and os.path.exists(prompt_file):
            try:
                with open(prompt_file, "r", encoding="utf-8") as f:
                    content = f.read()
                # Simple section parser: [chunk], [combined]
                import re
                pattern = re.compile(
                    r"\[" + role + r"\]\s*\n(.*?)(?=\n\[|$)",
                    re.DOTALL,
                )
                m = pattern.search(content)
                if m:
                    text = m.group(1).strip()
                    if text:
                        return text
            except Exception as e:
                logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)

        # 3) Default prompts
        if role == "chunk":
            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive a segment of a longer transcript. "
                "Provide a detailed, structured summary of this segment, focusing on: "
                "- Topics discussed\n"
                "- Key points and arguments\n"
                "- Decisions and agreements\n"
                "- Action items and responsibilities\n"
                "- Any risks, conflicts, or open issues\n\n"
                "Be concise but complete. Use bullet points where helpful. "
                "Do not add information that is not present in the transcript."
            )
        else:
            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive several intermediate summaries of a longer conversation. "
                "Produce a single, comprehensive summary that makes it clear: "
                "- The overall purpose and context of the discussion\n"
                "- The main issues and topics addressed\n"
                "- Key arguments and positions (briefly)\n"
                "- Decisions and outcomes\n"
                "- Action items, responsibilities, and next steps\n"
                "- Any unresolved issues or risks\n\n"
                "The summary should be detailed enough that a reader who was not present "
                "can understand what happened and what is expected going forward. "
                "Use clear, concise language and bullet points where appropriate. "
                "Use markdown formatting (headings, lists, bold) to structure the summary."
            )

    def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
        """
        Summarize a single transcript segment.

        Args:
            chunk: Text of the segment.
            index: Zero-based position of the segment; presented 1-based in
                the request text.
            total: Total number of segments.

        Returns:
            Whatever ``_chat_completion`` returns for the "chunk" prompt and
            the segment text.
        """
        position = index + 1
        request_text = (
            f"This is segment {position} of {total} from a longer conversation.\n\n"
            f"{chunk}"
        )
        return self._chat_completion(self._load_summary_prompt("chunk"), request_text)

    def _summarize_combined(self, combined_summaries: str) -> str:
        """
        Summarize the joined chunk summaries into one final summary.

        Args:
            combined_summaries: The intermediate summaries as a single text.

        Returns:
            Whatever ``_chat_completion`` returns for the "combined" prompt
            and the introduced summaries.
        """
        intro = "Here are the intermediate summaries from different parts of the same conversation:\n\n"
        request_text = intro + f"{combined_summaries}"
        return self._chat_completion(self._load_summary_prompt("combined"), request_text)

    def _chat_completion(self, system_prompt: str, user_prompt: str) -> str:
        """
        Send one system/user message pair to /v1/chat/completions.

        Args:
            system_prompt: Content of the "system" message.
            user_prompt: Content of the "user" message.

        Returns:
            The first choice's message content, stripped of surrounding
            whitespace.

        Raises:
            SummarizerError: On an HTTP status >= 400, a body that is not
                valid JSON, or a body without a string at
                ``choices[0].message.content``.

        Exceptions raised by the HTTP client while sending the request are
        not caught here and propagate to the caller unchanged.
        """
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            "temperature": 0.3,
        }

        headers = {
            "Content-Type": "application/json",
        }
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"

        logger.info("Calling summarizer endpoint: /v1/chat/completions")

        resp = self._client.post(
            "/v1/chat/completions",
            json=payload,
            headers=headers,
        )

        logger.info("Summarizer response status: %d", resp.status_code)

        if resp.status_code >= 400:
            logger.error("Summarizer error response: %s", resp.text)
            raise SummarizerError(
                f"Summarizer API error {resp.status_code}: {resp.text}"
            )

        try:
            data = resp.json()
        except ValueError as exc:
            # json.JSONDecodeError is a ValueError subclass; catching the
            # base class also covers bodies that fail to decode as text.
            logger.error("Failed to parse summarizer response as JSON.")
            raise SummarizerError(
                "Failed to parse summarizer response as JSON."
            ) from exc

        # Extract assistant message
        try:
            content = data["choices"][0]["message"]["content"]
            if not isinstance(content, str):
                # A null or structured content would otherwise fail on
                # .strip() with an AttributeError that escapes this handler.
                raise TypeError(
                    f"message content is {type(content).__name__}, expected str"
                )
            return content.strip()
        except (KeyError, IndexError, TypeError) as exc:
            rendered = json.dumps(data, indent=2)
            logger.error("Unexpected summarizer response format: %s", rendered)
            raise SummarizerError(
                "Unexpected summarizer response format: "
                f"{rendered}"
            ) from exc