Improve summary prompt, add markdown-to-DOCX styling, and add cover pages

- Configurable summary prompts via ENV or file; stronger default prompt.
- New docx_styles.py: converts markdown (headings, bullets, bold/italic) to DOCX.
- Updated create_summary_docx to use markdown-aware styling.
- New docx_cover.py: reusable cover page for transcript and summary.
- Cover pages are added when COVER_PAGE_ENABLED is "true", "1", or "yes".
2026-06-19 17:16:46 +00:00
parent 54414def26
commit 7a31be9de5
4 changed files with 369 additions and 37 deletions
@@ -0,0 +1,118 @@
+"""
+Reusable cover-page generator for transcript and summary DOCX files.
+
+Configuration (env):
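+- COVER_PAGE_ENABLED: "true"/"false" (default: false); checked by the callers, not here
+- COVER_PAGE_ORGANIZATION: e.g., "A.P.Strom"
+- COVER_PAGE_TITLE_PREFIX: e.g., "TRANSCRIPT" or "SUMMARY" (not read by this module yet)
+- COVER_PAGE_LOGO_URL: optional URL (reserved; no image is inserted yet)
+- COVER_PAGE_LOGO_PATH: optional local path (reserved; no image is inserted yet)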
+"""
+
+import os
+from typing import Optional
+from docx import Document
+from docx.shared import Pt, Inches
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+
+def _add_page_break(doc: Document):
+    """
+    Append a paragraph to *doc* that contains only a hard page break.
+
+    WordprocessingML expresses a page break as ``<w:br w:type="page"/>``
+    inside a run. The ``w:pageBreak`` element previously appended to the
+    paragraph properties is not defined by the schema, so the markup was
+    invalid rather than a break. ``Document.add_page_break()`` writes the
+    correct form.
+    """
+    # A freshly added paragraph has no tabs/spacing/indent of its own, so
+    # there is nothing to clear before inserting the break.
+    doc.add_page_break()
+
+
+def add_cover_page(
+    doc: Document,
+    title: str,
+    subtitle: Optional[str] = None,
+    metadata: Optional[dict] = None,
+    include_logo: bool = False,
+):
+    """
+    Append a cover page to the end of *doc*, followed by a page break.
+
+    - title: e.g., "TRANSCRIPT" or "SUMMARY" (written upper-cased)
+    - subtitle: e.g., "Meeting of 16 June 2026"; omitted when empty
+    - metadata: optional dict with keys "Organization", "Date",
+      "Prepared by", "Reference"; missing or empty values are skipped
+    - include_logo: reserve an empty centered paragraph for a future logo
+
+    COVER_PAGE_ORGANIZATION (env), when non-blank, overrides
+    metadata["Organization"].
+    """
+
+    # A missing metadata dict must not discard the env-provided organization:
+    # the former one-line conditional parsed as `(env or meta) if metadata
+    # else None` and therefore always yielded None when metadata was None.
+    metadata = metadata or {}
+    org = (os.getenv("COVER_PAGE_ORGANIZATION") or "").strip() or metadata.get("Organization") or ""
+    date = metadata.get("Date") or ""
+    prepared_by = metadata.get("Prepared by") or ""
+    reference = metadata.get("Reference") or ""
+
+    # Title
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(6)
+    run = p.add_run(title.upper())
+    run.bold = True
+    run.font.name = "Courier"
+    run.font.size = Pt(18)
+
+    # Subtitle
+    if subtitle:
+        p = doc.add_paragraph()
+        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        p.paragraph_format.space_after = Pt(12)
+        run = p.add_run(subtitle)
+        run.font.name = "Courier"
+        run.font.size = Pt(14)
+
+    # Logo placeholder: only reserves space for now. COVER_PAGE_LOGO_URL and
+    # COVER_PAGE_LOGO_PATH are not consumed until image insertion is added.
+    if include_logo:
+        p = doc.add_paragraph()
+        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        p.paragraph_format.space_after = Pt(12)
+
+    # Metadata lines
+    if org or date or prepared_by or reference:
+        p = doc.add_paragraph()
+        p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+        p.paragraph_format.space_after = Pt(4)
+        if org:
+            r = p.add_run(org)
+            r.font.name = "Courier"
+            r.font.size = Pt(12)
+        if date:
+            if org:
+                p.add_run("\n")
+            r = p.add_run(date)
+            r.font.name = "Courier"
+            r.font.size = Pt(12)
+
+        if prepared_by or reference:
+            p = doc.add_paragraph()
+            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+            p.paragraph_format.space_after = Pt(4)
+            if prepared_by:
+                r = p.add_run(f"Prepared by: {prepared_by}")
+                r.font.name = "Courier"
+                r.font.size = Pt(11)
+            if reference:
+                if prepared_by:
+                    p.add_run("\n")
+                r = p.add_run(f"Reference: {reference}")
+                r.font.name = "Courier"
+                r.font.size = Pt(11)
+
+    # Page break after cover page
+    _add_page_break(doc)
@@ -0,0 +1,149 @@
+"""
+Utility module for applying styles and converting simple markdown
+into styled DOCX paragraphs/runs for summaries.
+"""
+
+import re
+from docx import Document
+from docx.shared import Pt
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+
+def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
+    """
+    Return style *name*; when absent, create it first as a paragraph style.
+
+    A newly created style receives *font_name* and *font_size* and, when
+    *based_on* is truthy, uses that style as its base.
+    """
+    registry = doc.styles
+    # An existing style is returned as-is; font settings apply to new ones only.
+    if not any(existing.name == name for existing in registry):
+        created = registry.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
+        created.font.name, created.font.size = font_name, font_size
+        if based_on:
+            created.base_style = registry[based_on]
+    return registry[name]
+
+
+def apply_heading_style(paragraph, level: int):
+    """
+    Style *paragraph* as a summary heading of the given level.
+
+    Level 1 -> "SummaryHeading1" (16pt), level 2 -> "SummaryHeading2" (14pt),
+    any other level -> "SummaryHeading3" (12pt). Also sets 4pt space before
+    and 2pt space after the paragraph.
+    """
+    levels = {
+        1: ("SummaryHeading1", Pt(16)),
+        2: ("SummaryHeading2", Pt(14)),
+    }
+    style_name, size = levels.get(level, ("SummaryHeading3", Pt(12)))
+
+    # python-docx paragraphs expose their owning `part`, not a `document`
+    # attribute; the Document (and its styles) is reached through the part.
+    doc = paragraph.part.document
+    paragraph.style = _ensure_style(doc, style_name, font_size=size)
+    paragraph.paragraph_format.space_before = Pt(4)
+    paragraph.paragraph_format.space_after = Pt(2)
+
+
+def apply_bullet_style(paragraph):
+    """
+    Apply the "SummaryBullet" style plus a left tab stop at position 360.
+
+    NOTE(review): no bullet glyph or numbering is attached here; only the
+    style and the tab stop are set.
+    """
+    # python-docx paragraphs expose their owning `part`, not a `document`
+    # attribute; the Document (and its styles) is reached through the part.
+    doc = paragraph.part.document
+    paragraph.style = _ensure_style(doc, "SummaryBullet")
+    pPr = paragraph._p.get_or_add_pPr()
+    tabs = OxmlElement("w:tabs")
+    tab = OxmlElement("w:tab")
+    tab.set(qn("w:val"), "left")
+    tab.set(qn("w:pos"), "360")
+    tabs.append(tab)
+    pPr.append(tabs)
+
+
+def parse_simple_md_to_paragraphs(doc, text: str):
+    """
+    Append one DOCX paragraph to *doc* for every non-blank line of *text*.
+
+    Each line is stripped and classified independently of its neighbours:
+    - one to three "#" followed by whitespace: heading of that level
+    - "-" or "*" followed by whitespace: bullet item
+    - anything else: plain paragraph
+
+    The marker is dropped from headings and bullets; the remaining text is
+    written through _add_run_with_inline_md, which interprets **bold** and
+    *italic*. Blank lines produce no paragraph. Nothing is returned.
+    """
+    heading_re = re.compile(r"^(#{1,3})\s+(.*)")
+    bullet_re = re.compile(r"^[-*]\s+(.*)")
+
+    for raw_line in text.splitlines():
+        body = raw_line.strip()
+        if not body:
+            continue
+
+        # Every non-blank line gets its own paragraph; only the styling and
+        # the text that remains after the marker differ per kind of line.
+        target = doc.add_paragraph()
+        heading = heading_re.match(body)
+        bullet = None if heading else bullet_re.match(body)
+
+        if heading:
+            # The number of leading '#' characters selects the heading level
+            apply_heading_style(target, len(heading.group(1)))
+            body = heading.group(2).strip()
+        elif bullet:
+            apply_bullet_style(target)
+            body = bullet.group(1).strip()
+
+        _add_run_with_inline_md(target, body)
+
+
+def _add_run_with_inline_md(paragraph, text: str):
+    """
+    Append *text* to *paragraph* as 12pt Courier runs, honouring inline markdown.
+
+    The text is split into emphasised spans and the plain text between them;
+    each non-empty piece becomes its own run:
+    - ***x*** -> bold and italic
+    - **x**   -> bold
+    - *x*     -> italic
+    The asterisk markers are removed from emphasised runs.
+    """
+    emphasis_re = re.compile(r"(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*)")
+    bold_re = re.compile(r"\*\*(.+?)\*\*")
+    italic_re = re.compile(r"\*(.+?)\*")
+
+    for piece in filter(None, emphasis_re.split(text)):
+        # Peel the bold markers first; a ***x*** span then still carries one
+        # pair of asterisks, which the italic check below strips as well.
+        as_bold = bold_re.fullmatch(piece)
+        visible = as_bold.group(1) if as_bold else piece
+        as_italic = italic_re.fullmatch(visible)
+        if as_italic:
+            visible = as_italic.group(1)
+
+        run = paragraph.add_run(visible)
+        run.font.name = "Courier"
+        run.font.size = Pt(12)
+        if as_bold:
+            run.bold = True
+        if as_italic:
+            run.italic = True
@@ -505,7 +505,19 @@ def create_transcript_docx(text: str, filename: str):

    _create_transcript_section_properties(doc.sections[0])

-    # Step 3: Write prepared pages into DOCX
+    # Step 3: Optionally add cover page
+    from . import docx_cover
+    cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
+    if cover_enabled:
+        docx_cover.add_cover_page(
+            doc,
+            title="TRANSCRIPT",
+            subtitle=None,
+            metadata=None,
+            include_logo=True,
+        )
+
+    # Step 4: Write prepared pages into DOCX
    for page_idx, page_lines in enumerate(prepared_pages):
        # Insert page break between pages
        if page_idx > 0:
@@ -523,7 +535,7 @@ def create_transcript_docx(text: str, filename: str):
        for line_num, line_text in enumerate(page_lines, start=1):
            _add_transcript_paragraph(doc, line_text, line_number=line_num)

-    # Step 4: Add footer: "X of Y" centered
+    # Step 5: Add footer: "X of Y" centered
    section = doc.sections[0]
    footer = section.footer
    footer.is_linked_to_previous = False
@@ -563,8 +575,10 @@ def create_summary_docx(text: str, filename: str):
    Create a summary DOCX with:
    - 1" margins on all sides
    - 12pt Courier font
-    - No line numbering
+    - Markdown-aware WYSIWYG styling (headings, bullets, bold/italic)
    """
+    from . import docx_styles
+
    doc = Document()

    # Base font
@@ -584,13 +598,20 @@ def create_summary_docx(text: str, filename: str):
    for p in list(body.findall(f"{{{W_NS}}}p")):
        body.remove(p)

-    # Add summary content
-    lines = text.strip().splitlines()
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        p = doc.add_paragraph(line)
-        p.paragraph_format.space_after = Pt(4)
+    # Optionally add cover page
+    from . import docx_cover
+    cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
+    if cover_enabled:
+        docx_cover.add_cover_page(
+            doc,
+            title="SUMMARY",
+            subtitle=None,
+            metadata=None,
+            include_logo=True,
+        )
+
+    # Add summary content using markdown-aware styling
+    if text.strip():
+        docx_styles.parse_simple_md_to_paragraphs(doc, text.strip())

    doc.save(filename)
@@ -148,8 +148,46 @@ class SummarizerClient:
            start = break_pos
        return chunks

-    def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
-        system_prompt = (
+    def _load_summary_prompt(self, role: str) -> str:
+        """
+        Load summary prompt for the given role: 'chunk' or 'combined'.
+
+        Priority:
+        1) SUMMARY_PROMPT_{ROLE} (env)
+        2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
+        3) Built-in default prompt
+        """
+        role_upper = role.upper()
+
+        # 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
+        env_key = f"SUMMARY_PROMPT_{role_upper}"
+        env_prompt = (os.getenv(env_key) or "").strip()
+        if env_prompt:
+            return env_prompt
+
+        # 2) File-based prompt with sections
+        prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
+        if prompt_file and os.path.exists(prompt_file):
+            try:
+                with open(prompt_file, "r", encoding="utf-8") as f:
+                    content = f.read()
+                # Simple section parser: [chunk], [combined]
+                import re
+                pattern = re.compile(
+                    r"\[" + role + r"\]\s*\n(.*?)(?=\n\[|$)",
+                    re.DOTALL,
+                )
+                m = pattern.search(content)
+                if m:
+                    text = m.group(1).strip()
+                    if text:
+                        return text
+            except Exception as e:
+                logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)
+
+        # 3) Default prompts
+        if role == "chunk":
+            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive a segment of a longer transcript. "
                "Provide a detailed, structured summary of this segment, focusing on: "
@@ -158,19 +196,11 @@ class SummarizerClient:
                "- Decisions and agreements\n"
                "- Action items and responsibilities\n"
                "- Any risks, conflicts, or open issues\n\n"
-            "Be concise but complete. Use bullet points when helpful. "
+                "Be concise but complete. Use bullet points where helpful. "
                "Do not add information that is not present in the transcript."
            )
-
-        user_prompt = (
-            f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
-            f"{chunk}"
-        )
-
-        return self._chat_completion(system_prompt, user_prompt)
-
-    def _summarize_combined(self, combined_summaries: str) -> str:
-        system_prompt = (
+        else:
+            return (
                "You are an expert legal and business meeting summarizer. "
                "You will receive several intermediate summaries of a longer conversation. "
                "Produce a single, comprehensive summary that makes it clear: "
@@ -183,8 +213,22 @@ class SummarizerClient:
                "The summary should be detailed enough that a reader who was not present "
                "can understand what happened and what is expected going forward. "
                "Use clear, concise language and bullet points where appropriate. "
+                "Use markdown formatting (headings, lists, bold) to structure the summary."
            )

+    def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
+        """
+        Summarize one transcript segment.
+
+        Sends the "chunk" system prompt together with a user message that
+        states the 1-based segment position (index + 1 of total) followed by
+        the segment text, and returns the result of _chat_completion.
+        """
+        return self._chat_completion(
+            self._load_summary_prompt("chunk"),
+            f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
+            f"{chunk}",
+        )
+
+    def _summarize_combined(self, combined_summaries: str) -> str:
+        system_prompt = self._load_summary_prompt("combined")
+
        user_prompt = (
            "Here are the intermediate summaries from different parts of the same conversation:\n\n"
            f"{combined_summaries}"