From 7a31be9de59edb31ca4131183f816c89b5580f13 Mon Sep 17 00:00:00 2001
From: admin
Date: Fri, 19 Jun 2026 17:16:46 +0000
Subject: [PATCH] Improve summary prompt, add markdown-to-DOCX styling, and add cover pages

- Configurable summary prompts via ENV or file; stronger default prompt.
- New docx_styles.py: converts markdown (headings, bullets, bold/italic) to DOCX.
- Updated create_summary_docx to use markdown-aware styling.
- New docx_cover.py: reusable cover page for transcript and summary.
- Cover pages enabled when COVER_PAGE_ENABLED=true.
---
Review amendments (text in this section is ignored by git-am):
- docx_cover: COVER_PAGE_ORGANIZATION is honoured when metadata is None;
  the page break is emitted with Document.add_page_break().
- docx_styles: the Document is reached via paragraph.part.document;
  heading runs inherit the heading style's size; bullets get a glyph and
  a hanging indent; body and bullet paragraphs keep 4pt space-after.
- summarizer: the prompt-file section name goes through re.escape().
- Blob ids on the "index" lines still refer to the original commit.
- Indentation of context lines was reconstructed; verify before applying.

 scraibe/docx_cover.py   | 109 ++++++++++++++++++++++++++++
 scraibe/docx_styles.py  | 148 +++++++++++++++++++++++++++++++++++++++
 scraibe/email_sender.py |  43 +++++++++---
 scraibe/summarizer.py   |  96 +++++++++++++++++++------
 4 files changed, 359 insertions(+), 37 deletions(-)
 create mode 100644 scraibe/docx_cover.py
 create mode 100644 scraibe/docx_styles.py

diff --git a/scraibe/docx_cover.py b/scraibe/docx_cover.py
new file mode 100644
index 0000000..40cd498
--- /dev/null
+++ b/scraibe/docx_cover.py
@@ -0,0 +1,109 @@
+"""
+Reusable cover-page generator for transcript and summary DOCX files.
+
+Configuration (env):
+- COVER_PAGE_ENABLED: "true"/"false" (default: false); checked by callers
+- COVER_PAGE_ORGANIZATION: e.g., "A.P.Strom"
+- COVER_PAGE_TITLE_PREFIX: e.g., "TRANSCRIPT" or "SUMMARY" (not read yet)
+- COVER_PAGE_LOGO_URL: optional URL (not read yet)
+- COVER_PAGE_LOGO_PATH: optional local path (not read yet)
+"""
+
+import os
+from typing import Optional
+from docx import Document
+from docx.shared import Pt, Inches
+from docx.enum.text import WD_ALIGN_PARAGRAPH
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+
+def _add_page_break(doc: Document):
+    """Append a paragraph holding a page break to the end of ``doc``."""
+    # A page break is a <w:br w:type="page"/> inside a run; <w:pageBreak>
+    # is not a valid child of <w:pPr>, so let python-docx emit the break.
+    doc.add_page_break()
+
+
+def _add_centered_paragraph(doc: Document, space_after_pt: int):
+    """Append an empty, centered paragraph with the given space after it."""
+    p = doc.add_paragraph()
+    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
+    p.paragraph_format.space_after = Pt(space_after_pt)
+    return p
+
+
+def _add_courier_run(paragraph, text: str, size_pt: int, bold: bool = False):
+    """Add a Courier run of ``size_pt`` points to ``paragraph``."""
+    run = paragraph.add_run(text)
+    if bold:
+        run.bold = True
+    run.font.name = "Courier"
+    run.font.size = Pt(size_pt)
+    return run
+
+
+def add_cover_page(
+    doc: Document,
+    title: str,
+    subtitle: Optional[str] = None,
+    metadata: Optional[dict] = None,
+    include_logo: bool = False,
+):
+    """
+    Append a cover page, followed by a page break, to the end of ``doc``.
+
+    - title: e.g., "TRANSCRIPT" or "SUMMARY" (rendered upper-case)
+    - subtitle: e.g., "Meeting of 16 June 2026"
+    - metadata: optional dict with keys like:
+        - "Organization" (fallback when COVER_PAGE_ORGANIZATION is unset)
+        - "Date"
+        - "Prepared by"
+        - "Reference"
+    - include_logo: reserve an empty paragraph where a logo can go later
+    """
+    meta = metadata or {}
+    # The env var wins and metadata is only the fallback.  Written as
+    # "env or metadata.get(...) if metadata else None", the conditional
+    # discarded the env value whenever metadata was None.
+    env_org = (os.getenv("COVER_PAGE_ORGANIZATION") or "").strip()
+    org = env_org or meta.get("Organization") or ""
+    date = meta.get("Date") or ""
+    prepared_by = meta.get("Prepared by") or ""
+    reference = meta.get("Reference") or ""
+
+    # Title
+    p = _add_centered_paragraph(doc, 6)
+    _add_courier_run(p, title.upper(), 18, bold=True)
+
+    # Subtitle
+    if subtitle:
+        p = _add_centered_paragraph(doc, 12)
+        _add_courier_run(p, subtitle, 14)
+
+    # Logo placeholder: only reserves space for now; image insertion from
+    # COVER_PAGE_LOGO_URL / COVER_PAGE_LOGO_PATH can be added later.
+    if include_logo:
+        _add_centered_paragraph(doc, 12)
+
+    # Metadata lines
+    if org or date or prepared_by or reference:
+        p = _add_centered_paragraph(doc, 4)
+        if org:
+            _add_courier_run(p, org, 12)
+        if date:
+            if org:
+                p.add_run("\n")
+            _add_courier_run(p, date, 12)
+
+        if prepared_by or reference:
+            p = _add_centered_paragraph(doc, 4)
+            if prepared_by:
+                _add_courier_run(p, f"Prepared by: {prepared_by}", 11)
+            if reference:
+                if prepared_by:
+                    p.add_run("\n")
+                _add_courier_run(p, f"Reference: {reference}", 11)
+
+    # Page break after cover page
+    _add_page_break(doc)
diff --git a/scraibe/docx_styles.py b/scraibe/docx_styles.py
new file mode 100644
index 0000000..ffb7bdb
--- /dev/null
+++ b/scraibe/docx_styles.py
@@ -0,0 +1,148 @@
+"""
+Utility module for applying styles and converting simple markdown
+into styled DOCX paragraphs/runs for summaries.
+"""
+
+import re
+from docx import Document
+from docx.shared import Pt
+from docx.oxml import OxmlElement
+from docx.oxml.ns import qn
+
+# Heading level -> (style name, font size in pt); other levels use level 3.
+_HEADING_STYLES = {
+    1: ("SummaryHeading1", 16),
+    2: ("SummaryHeading2", 14),
+    3: ("SummaryHeading3", 12),
+}
+# Hanging indent (pt) for bullet paragraphs; 18pt = the old 360-twip tab.
+_BULLET_INDENT_PT = 18
+_HEADING_RE = re.compile(r"^(#{1,3})\s+(.*)")
+_BULLET_RE = re.compile(r"^[-*]\s+(.*)")
+# Splits a line into plain text and ***x*** / **x** / *x* spans (kept).
+_INLINE_MD_RE = re.compile(r"(\*\*\*.*?\*\*\*|\*\*.*?\*\*|\*.*?\*)")
+
+
+def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
+    """
+    Ensure a paragraph style exists in the document and return it.
+
+    Font and base style are only applied when the style is created here.
+    """
+    styles = doc.styles
+    if name not in styles:
+        style = styles.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
+        style.font.name = font_name
+        style.font.size = font_size
+        if based_on:
+            style.base_style = styles[based_on]
+    return styles[name]
+
+
+def apply_heading_style(paragraph, level: int):
+    """
+    Apply heading style to a paragraph based on level (1, 2, 3).
+
+    Any other level falls back to the level-3 style.
+    """
+    style_name, size_pt = _HEADING_STYLES.get(level, _HEADING_STYLES[3])
+
+    # Paragraph exposes no ``document`` attribute; the owning Document is
+    # reached through the paragraph's part.
+    doc = paragraph.part.document
+    style = _ensure_style(doc, style_name, font_size=Pt(size_pt))
+    # Level 3 shares the 12pt body size, so bold is what marks a heading.
+    style.font.bold = True
+    paragraph.style = style
+    paragraph.paragraph_format.space_before = Pt(4)
+    paragraph.paragraph_format.space_after = Pt(2)
+
+
+def apply_bullet_style(paragraph):
+    """
+    Apply a simple bullet style to a paragraph.
+
+    Sets the "SummaryBullet" style and a hanging indent with a left tab
+    stop, so text after a leading "<glyph><tab>" lines up when it wraps.
+    """
+    doc = paragraph.part.document
+    paragraph.style = _ensure_style(doc, "SummaryBullet")
+    fmt = paragraph.paragraph_format
+    fmt.left_indent = Pt(_BULLET_INDENT_PT)
+    fmt.first_line_indent = Pt(-_BULLET_INDENT_PT)
+    # tab_stops inserts <w:tabs> at its schema position inside <w:pPr>
+    # instead of appending it after whatever is already there.
+    fmt.tab_stops.add_tab_stop(Pt(_BULLET_INDENT_PT))
+
+
+def parse_simple_md_to_paragraphs(doc, text: str):
+    """
+    Convert simple markdown text into DOCX paragraphs with styles.
+
+    Supported:
+    - # / ## / ### for headings
+    - - / * for bullet lists
+    - **bold** and *italic*
+
+    Every non-blank line becomes its own paragraph; blank lines are
+    dropped.  This is intentionally simple and robust for legal/business
+    summaries.
+    """
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+
+        p = doc.add_paragraph()
+
+        # Headings
+        heading_match = _HEADING_RE.match(stripped)
+        if heading_match:
+            apply_heading_style(p, len(heading_match.group(1)))
+            # font_size=None lets the runs inherit the heading style's size.
+            _add_run_with_inline_md(p, heading_match.group(2).strip(), font_size=None)
+            continue
+
+        # Same spacing the pre-markdown writer gave every body line.
+        p.paragraph_format.space_after = Pt(4)
+
+        # Bullet list
+        bullet_match = _BULLET_RE.match(stripped)
+        if bullet_match:
+            apply_bullet_style(p)
+            # The regex drops the markdown marker, so emit a visible glyph.
+            _add_run_with_inline_md(p, "\u2022\t" + bullet_match.group(1).strip())
+            continue
+
+        # Normal paragraph
+        _add_run_with_inline_md(p, stripped)
+
+
+def _add_run_with_inline_md(paragraph, text: str, font_size=Pt(12)):
+    """
+    Add runs to a paragraph, interpreting **bold** and *italic*.
+
+    - font_size: size for every run; None keeps the paragraph style's size
+      (explicit run formatting would otherwise override a heading style).
+    """
+    for part in _INLINE_MD_RE.split(text):
+        if not part:
+            continue
+
+        # Peel ** first, then *, so ***x*** ends up bold and italic.
+        run_text = part
+        bold_match = re.fullmatch(r"\*\*(.+?)\*\*", run_text)
+        if bold_match:
+            run_text = bold_match.group(1)
+        italic_match = re.fullmatch(r"\*(.+?)\*", run_text)
+        if italic_match:
+            run_text = italic_match.group(1)
+
+        run = paragraph.add_run(run_text)
+        run.font.name = "Courier"
+        if font_size is not None:
+            run.font.size = font_size
+        if bold_match:
+            run.bold = True
+        if italic_match:
+            run.italic = True
diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py
index 593e3f5..54f29c8 100644
--- a/scraibe/email_sender.py
+++ b/scraibe/email_sender.py
@@ -505,7 +505,19 @@ def create_transcript_docx(text: str, filename: str):
 
     _create_transcript_section_properties(doc.sections[0])
 
-    # Step 3: Write prepared pages into DOCX
+    # Step 3: Optionally add cover page
+    from . import docx_cover
+    cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
+    if cover_enabled:
+        docx_cover.add_cover_page(
+            doc,
+            title="TRANSCRIPT",
+            subtitle=None,
+            metadata=None,
+            include_logo=True,
+        )
+
+    # Step 4: Write prepared pages into DOCX
     for page_idx, page_lines in enumerate(prepared_pages):
         # Insert page break between pages
         if page_idx > 0:
@@ -523,7 +535,7 @@ def create_transcript_docx(text: str, filename: str):
         for line_num, line_text in enumerate(page_lines, start=1):
             _add_transcript_paragraph(doc, line_text, line_number=line_num)
 
-    # Step 4: Add footer: "X of Y" centered
+    # Step 5: Add footer: "X of Y" centered
     section = doc.sections[0]
     footer = section.footer
     footer.is_linked_to_previous = False
@@ -563,8 +575,10 @@ def create_summary_docx(text: str, filename: str):
     Create a summary DOCX with:
     - 1" margins on all sides
     - 12pt Courier font
-    - No line numbering
+    - Markdown-aware WYSIWYG styling (headings, bullets, bold/italic)
     """
+    from . import docx_styles
+
     doc = Document()
 
     # Base font
@@ -584,13 +598,20 @@ def create_summary_docx(text: str, filename: str):
     for p in list(body.findall(f"{{{W_NS}}}p")):
         body.remove(p)
 
-    # Add summary content
-    lines = text.strip().splitlines()
-    for line in lines:
-        line = line.strip()
-        if not line:
-            continue
-        p = doc.add_paragraph(line)
-        p.paragraph_format.space_after = Pt(4)
+    # Optionally add cover page
+    from . import docx_cover
+    cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
+    if cover_enabled:
+        docx_cover.add_cover_page(
+            doc,
+            title="SUMMARY",
+            subtitle=None,
+            metadata=None,
+            include_logo=True,
+        )
+
+    # Add summary content using markdown-aware styling
+    if text.strip():
+        docx_styles.parse_simple_md_to_paragraphs(doc, text.strip())
 
     doc.save(filename)
diff --git a/scraibe/summarizer.py b/scraibe/summarizer.py
index a61c1fb..10607a8 100644
--- a/scraibe/summarizer.py
+++ b/scraibe/summarizer.py
@@ -148,19 +148,76 @@ class SummarizerClient:
             start = break_pos
         return chunks
 
+    def _load_summary_prompt(self, role: str) -> str:
+        """
+        Load summary prompt for the given role: 'chunk' or 'combined'.
+
+        Priority:
+        1) SUMMARY_PROMPT_{ROLE} (env)
+        2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
+        3) Built-in default prompt
+        """
+        role_upper = role.upper()
+
+        # 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
+        env_key = f"SUMMARY_PROMPT_{role_upper}"
+        env_prompt = (os.getenv(env_key) or "").strip()
+        if env_prompt:
+            return env_prompt
+
+        # 2) File-based prompt with sections
+        prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
+        if prompt_file and os.path.exists(prompt_file):
+            try:
+                with open(prompt_file, "r", encoding="utf-8") as f:
+                    content = f.read()
+                # Simple section parser: [chunk], [combined]
+                import re
+                pattern = re.compile(
+                    r"\[" + re.escape(role) + r"\]\s*\n(.*?)(?=\n\[|$)",
+                    re.DOTALL,
+                )
+                m = pattern.search(content)
+                if m:
+                    text = m.group(1).strip()
+                    if text:
+                        return text
+            except Exception as e:
+                logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)
+
+        # 3) Default prompts
+        if role == "chunk":
+            return (
+                "You are an expert legal and business meeting summarizer. "
+                "You will receive a segment of a longer transcript. "
+                "Provide a detailed, structured summary of this segment, focusing on: "
+                "- Topics discussed\n"
+                "- Key points and arguments\n"
+                "- Decisions and agreements\n"
+                "- Action items and responsibilities\n"
+                "- Any risks, conflicts, or open issues\n\n"
+                "Be concise but complete. Use bullet points where helpful. "
+                "Do not add information that is not present in the transcript."
+            )
+        else:
+            return (
+                "You are an expert legal and business meeting summarizer. "
+                "You will receive several intermediate summaries of a longer conversation. "
+                "Produce a single, comprehensive summary that makes it clear: "
+                "- The overall purpose and context of the discussion\n"
+                "- The main issues and topics addressed\n"
+                "- Key arguments and positions (briefly)\n"
+                "- Decisions and outcomes\n"
+                "- Action items, responsibilities, and next steps\n"
+                "- Any unresolved issues or risks\n\n"
+                "The summary should be detailed enough that a reader who was not present "
+                "can understand what happened and what is expected going forward. "
+                "Use clear, concise language and bullet points where appropriate. "
+                "Use markdown formatting (headings, lists, bold) to structure the summary."
+            )
+
     def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
-        system_prompt = (
-            "You are an expert legal and business meeting summarizer. "
-            "You will receive a segment of a longer transcript. "
-            "Provide a detailed, structured summary of this segment, focusing on: "
-            "- Topics discussed\n"
-            "- Key points and arguments\n"
-            "- Decisions and agreements\n"
-            "- Action items and responsibilities\n"
-            "- Any risks, conflicts, or open issues\n\n"
-            "Be concise but complete. Use bullet points when helpful. "
-            "Do not add information that is not present in the transcript."
-        )
+        system_prompt = self._load_summary_prompt("chunk")
 
         user_prompt = (
             f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
@@ -170,20 +227,7 @@ class SummarizerClient:
         return self._chat_completion(system_prompt, user_prompt)
 
     def _summarize_combined(self, combined_summaries: str) -> str:
-        system_prompt = (
-            "You are an expert legal and business meeting summarizer. "
-            "You will receive several intermediate summaries of a longer conversation. "
-            "Produce a single, comprehensive summary that makes it clear: "
-            "- The overall purpose and context of the discussion\n"
-            "- The main issues and topics addressed\n"
-            "- Key arguments and positions (briefly)\n"
-            "- Decisions and outcomes\n"
-            "- Action items, responsibilities, and next steps\n"
-            "- Any unresolved issues or risks\n\n"
-            "The summary should be detailed enough that a reader who was not present "
-            "can understand what happened and what is expected going forward. "
-            "Use clear, concise language and bullet points where appropriate."
-        )
+        system_prompt = self._load_summary_prompt("combined")
 
         user_prompt = (
             "Here are the intermediate summaries from different parts of the same conversation:\n\n"