Improve summary prompt, add markdown-to-DOCX styling, and add cover pages
Mirror and run GitLab CI / build (push) Waiting to run
Ruff / ruff (push) Waiting to run

- Configurable summary prompts via ENV or file; stronger default prompt.
- New docx_styles.py: converts markdown (headings, bullets, bold/italic) to DOCX.
- Updated create_summary_docx to use markdown-aware styling.
- New docx_cover.py: reusable cover page for transcript and summary.
- Cover pages enabled when COVER_PAGE_ENABLED=true.
This commit is contained in:
admin
2026-06-19 17:16:46 +00:00
parent 54414def26
commit 7a31be9de5
4 changed files with 369 additions and 37 deletions
+118
View File
@@ -0,0 +1,118 @@
"""
Reusable cover-page generator for transcript and summary DOCX files.
Configuration (env):
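- COVER_PAGE_ENABLED: "true"/"1"/"yes" to enable (default: false); checked by the callers, not by this module
- COVER_PAGE_ORGANIZATION: e.g., "A.P.Strom"; takes precedence over metadata["Organization"]
- COVER_PAGE_TITLE_PREFIX: e.g., "TRANSCRIPT" or "SUMMARY" (reserved; not read yet)
- COVER_PAGE_LOGO_URL: optional URL (reserved; no image is inserted yet)
- COVER_PAGE_LOGO_PATH: optional local path (reserved; no image is inserted yet)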
"""
import os
from typing import Optional
from docx import Document
from docx.shared import Pt, Inches
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
def _add_page_break(doc: Document):
    """Append a paragraph containing a hard page break to ``doc``.

    In WordprocessingML a page break is a ``<w:br w:type="page"/>`` element
    inside a run. The paragraph-properties element (``w:pPr``) has no
    ``w:pageBreak`` child, so appending one there yields markup that is not
    schema-valid and is not a page break. ``Document.add_page_break()``
    emits the correct run-level break in a new paragraph.
    """
    doc.add_page_break()
def _cover_meta_value(metadata, key):
    """Return ``metadata[key]`` as stripped text, or "" when absent or falsy."""
    value = metadata.get(key) if metadata else None
    return str(value).strip() if value else ""


def _add_centered_lines(doc, lines, size, space_after):
    """Add one centered Courier paragraph holding ``lines`` split by line breaks."""
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = space_after
    for idx, line in enumerate(lines):
        if idx:
            # A "\n" in run text is written as a line break inside the paragraph.
            p.add_run("\n")
        run = p.add_run(line)
        run.font.name = "Courier"
        run.font.size = size
    return p


def add_cover_page(
    doc: Document,
    title: str,
    subtitle: Optional[str] = None,
    metadata: Optional[dict] = None,
    include_logo: bool = False,
):
    """
    Append a cover page to ``doc``, followed by a page break.

    Args:
        doc: Document to append to.
        title: e.g., "TRANSCRIPT" or "SUMMARY"; rendered upper-cased and bold.
        subtitle: e.g., "Meeting of 16 June 2026"; omitted when empty.
        metadata: optional dict with keys like:
            - "Organization"
            - "Date"
            - "Prepared by"
            - "Reference"
        include_logo: when true, reserve an empty centered paragraph where a
            logo can later be placed. No image is inserted yet.

    The organization comes from the COVER_PAGE_ORGANIZATION environment
    variable when it is set and non-blank, otherwise from
    ``metadata["Organization"]``. The environment value is honored even when
    ``metadata`` is None.
    """
    # The env lookup must not depend on `metadata` being present: a bare
    # `a or b if metadata else None` parses as `(a or b) if metadata else None`
    # and would discard the env value whenever metadata is None.
    org = (os.getenv("COVER_PAGE_ORGANIZATION") or "").strip() or _cover_meta_value(
        metadata, "Organization"
    )
    date = _cover_meta_value(metadata, "Date")
    prepared_by = _cover_meta_value(metadata, "Prepared by")
    reference = _cover_meta_value(metadata, "Reference")

    # Title
    p = doc.add_paragraph()
    p.alignment = WD_ALIGN_PARAGRAPH.CENTER
    p.paragraph_format.space_after = Pt(6)
    run = p.add_run(title.upper())
    run.bold = True
    run.font.name = "Courier"
    run.font.size = Pt(18)

    # Subtitle
    if subtitle:
        _add_centered_lines(doc, [subtitle], Pt(14), Pt(12))

    # Logo placeholder: only vertical space is reserved for now.
    if include_logo:
        _add_centered_lines(doc, [], Pt(12), Pt(12))

    # Organization / date lines (skipped entirely when both are empty).
    identity_lines = [line for line in (org, date) if line]
    if identity_lines:
        _add_centered_lines(doc, identity_lines, Pt(12), Pt(4))

    # Author / reference lines.
    credit_lines = []
    if prepared_by:
        credit_lines.append(f"Prepared by: {prepared_by}")
    if reference:
        credit_lines.append(f"Reference: {reference}")
    if credit_lines:
        _add_centered_lines(doc, credit_lines, Pt(11), Pt(4))

    # Page break after cover page
    _add_page_break(doc)
+149
View File
@@ -0,0 +1,149 @@
"""
Utility module for applying styles and converting simple markdown
into styled DOCX paragraphs/runs for summaries.
"""
import re
from docx import Document
from docx.shared import Pt
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
def _ensure_style(doc, name, based_on="Normal", font_name="Courier", font_size=Pt(12)):
    """
    Return the style called ``name`` from ``doc``, creating it if missing.

    A newly created style is a paragraph style using ``font_name`` and
    ``font_size``; when ``based_on`` is truthy it inherits from that style.
    An existing style is returned untouched.
    """
    registry = doc.styles
    already_defined = any(existing.name == name for existing in registry)
    if already_defined:
        return registry[name]

    created = registry.add_style(name, 1)  # 1 = WD_STYLE_TYPE.PARAGRAPH
    created.font.name = font_name
    created.font.size = font_size
    if based_on:
        created.base_style = registry[based_on]
    return registry[name]
def apply_heading_style(paragraph, level: int):
    """
    Apply a summary heading style to ``paragraph``.

    Level 1 uses "SummaryHeading1" (16pt), level 2 "SummaryHeading2" (14pt),
    and any other level "SummaryHeading3" (12pt). The style is created in the
    paragraph's document on first use.
    """
    sizes_by_level = {
        1: ("SummaryHeading1", Pt(16)),
        2: ("SummaryHeading2", Pt(14)),
    }
    style_name, size = sizes_by_level.get(level, ("SummaryHeading3", Pt(12)))

    # A paragraph proxy reaches its owning document through its part;
    # python-docx paragraphs do not carry a `document` attribute.
    doc = paragraph.part.document
    paragraph.style = _ensure_style(doc, style_name, font_size=size)
    paragraph.paragraph_format.space_before = Pt(4)
    paragraph.paragraph_format.space_after = Pt(2)
def apply_bullet_style(paragraph):
    """
    Apply the "SummaryBullet" paragraph style and a left tab stop at 0.25".

    The style is created in the paragraph's document on first use. Calling
    this again on the same paragraph does not add a second tab stop.
    """
    # A paragraph proxy reaches its owning document through its part;
    # python-docx paragraphs do not carry a `document` attribute.
    doc = paragraph.part.document
    paragraph.style = _ensure_style(doc, "SummaryBullet")

    # 18pt == 360 twips. Going through the tab_stops API (instead of appending
    # a raw <w:tabs> element) keeps the children of <w:pPr> in schema order.
    bullet_tab = Pt(18)
    tab_stops = paragraph.paragraph_format.tab_stops
    if not any(stop.position == bullet_tab for stop in tab_stops):
        tab_stops.add_tab_stop(bullet_tab)
_MD_HEADING_RE = re.compile(r"^(#{1,6})\s+(.*)")
_MD_BULLET_RE = re.compile(r"^[-*]\s+(.*)")


def parse_simple_md_to_paragraphs(doc, text: str):
    """
    Convert simple markdown text into DOCX paragraphs with styles.

    Supported:
    - # .. ###### for headings (levels above 3 use the level-3 style)
    - - / * for bullet lists (rendered with a leading bullet and a tab)
    - **bold** and *italic*

    Every non-blank input line becomes its own paragraph; blank lines are
    skipped. Empty or None ``text`` adds nothing.
    This is intentionally simple and robust for legal/business summaries.
    """
    if not text:
        return

    for raw_line in text.splitlines():
        stripped = raw_line.strip()
        if not stripped:
            continue

        # Headings
        heading = _MD_HEADING_RE.match(stripped)
        if heading:
            paragraph = doc.add_paragraph()
            # Only three heading styles exist; deeper levels share the last one.
            apply_heading_style(paragraph, min(len(heading.group(1)), 3))
            _add_run_with_inline_md(paragraph, heading.group(2).strip())
            continue

        # Bullet list
        bullet = _MD_BULLET_RE.match(stripped)
        if bullet:
            paragraph = doc.add_paragraph()
            apply_bullet_style(paragraph)
            # The markdown marker is stripped by the regex, so write a visible
            # bullet followed by a tab that lands on the style's tab stop.
            marker = paragraph.add_run("\u2022\t")
            marker.font.name = "Courier"
            _add_run_with_inline_md(paragraph, bullet.group(1).strip())
            continue

        # Normal paragraph
        _add_run_with_inline_md(doc.add_paragraph(), stripped)
# One to three asterisks, emphasized text that neither starts nor ends with
# whitespace, then the same number of asterisks. The whitespace guards keep
# free-standing asterisks (e.g. "2 * 3 * 4") as literal text.
_INLINE_MD_RE = re.compile(r"(\*{1,3})(?=\S)(.+?)(?<=\S)\1")


def split_inline_md(text: str):
    """
    Split ``text`` into ``(content, bold, italic)`` tuples.

    ``*x*`` is italic, ``**x**`` is bold, ``***x***`` is both; everything
    else is returned as plain spans. Delimiters are removed from the
    content. Nested emphasis is not supported.
    """
    spans = []
    cursor = 0
    for match in _INLINE_MD_RE.finditer(text):
        if match.start() > cursor:
            spans.append((text[cursor:match.start()], False, False))
        stars = len(match.group(1))
        spans.append((match.group(2), stars >= 2, stars in (1, 3)))
        cursor = match.end()
    if cursor < len(text):
        spans.append((text[cursor:], False, False))
    return spans


def _add_run_with_inline_md(paragraph, text: str):
    """
    Add runs to a paragraph, interpreting **bold** and *italic*.

    Runs use Courier. The size is taken from the paragraph's style when that
    style defines one, so heading styles keep their larger size; otherwise
    12pt is used.
    """
    style = paragraph.style
    style_size = style.font.size if style is not None else None
    # Run-level size overrides the style, so only fall back to 12pt when the
    # style itself does not specify a size.
    font_size = style_size if style_size is not None else Pt(12)

    for content, bold, italic in split_inline_md(text):
        run = paragraph.add_run(content)
        run.font.name = "Courier"
        run.font.size = font_size
        if bold:
            run.bold = True
        if italic:
            run.italic = True
+32 -11
View File
@@ -505,7 +505,19 @@ def create_transcript_docx(text: str, filename: str):
_create_transcript_section_properties(doc.sections[0]) _create_transcript_section_properties(doc.sections[0])
# Step 3: Write prepared pages into DOCX # Step 3: Optionally add cover page
from . import docx_cover
cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
if cover_enabled:
docx_cover.add_cover_page(
doc,
title="TRANSCRIPT",
subtitle=None,
metadata=None,
include_logo=True,
)
# Step 4: Write prepared pages into DOCX
for page_idx, page_lines in enumerate(prepared_pages): for page_idx, page_lines in enumerate(prepared_pages):
# Insert page break between pages # Insert page break between pages
if page_idx > 0: if page_idx > 0:
@@ -523,7 +535,7 @@ def create_transcript_docx(text: str, filename: str):
for line_num, line_text in enumerate(page_lines, start=1): for line_num, line_text in enumerate(page_lines, start=1):
_add_transcript_paragraph(doc, line_text, line_number=line_num) _add_transcript_paragraph(doc, line_text, line_number=line_num)
# Step 4: Add footer: "X of Y" centered # Step 5: Add footer: "X of Y" centered
section = doc.sections[0] section = doc.sections[0]
footer = section.footer footer = section.footer
footer.is_linked_to_previous = False footer.is_linked_to_previous = False
@@ -563,8 +575,10 @@ def create_summary_docx(text: str, filename: str):
Create a summary DOCX with: Create a summary DOCX with:
- 1" margins on all sides - 1" margins on all sides
- 12pt Courier font - 12pt Courier font
- No line numbering - Markdown-aware WYSIWYG styling (headings, bullets, bold/italic)
""" """
from . import docx_styles
doc = Document() doc = Document()
# Base font # Base font
@@ -584,13 +598,20 @@ def create_summary_docx(text: str, filename: str):
for p in list(body.findall(f"{{{W_NS}}}p")): for p in list(body.findall(f"{{{W_NS}}}p")):
body.remove(p) body.remove(p)
# Add summary content # Optionally add cover page
lines = text.strip().splitlines() from . import docx_cover
for line in lines: cover_enabled = os.getenv("COVER_PAGE_ENABLED", "false").strip().lower() in ("true", "1", "yes")
line = line.strip() if cover_enabled:
if not line: docx_cover.add_cover_page(
continue doc,
p = doc.add_paragraph(line) title="SUMMARY",
p.paragraph_format.space_after = Pt(4) subtitle=None,
metadata=None,
include_logo=True,
)
# Add summary content using markdown-aware styling
if text.strip():
docx_styles.parse_simple_md_to_paragraphs(doc, text.strip())
doc.save(filename) doc.save(filename)
+57 -13
View File
@@ -148,8 +148,46 @@ class SummarizerClient:
start = break_pos start = break_pos
return chunks return chunks
def _summarize_chunk(self, chunk: str, index: int, total: int) -> str: def _load_summary_prompt(self, role: str) -> str:
system_prompt = ( """
Load summary prompt for the given role: 'chunk' or 'combined'.
Priority:
1) SUMMARY_PROMPT_{ROLE} (env)
2) SUMMARY_PROMPT_FILE (env) with [chunk] / [combined] sections
3) Built-in default prompt
"""
role_upper = role.upper()
# 1) Direct env var: SUMMARY_PROMPT_CHUNK / SUMMARY_PROMPT_COMBINED
env_key = f"SUMMARY_PROMPT_{role_upper}"
env_prompt = (os.getenv(env_key) or "").strip()
if env_prompt:
return env_prompt
# 2) File-based prompt with sections
prompt_file = (os.getenv("SUMMARY_PROMPT_FILE") or "").strip()
if prompt_file and os.path.exists(prompt_file):
try:
with open(prompt_file, "r", encoding="utf-8") as f:
content = f.read()
# Simple section parser: [chunk], [combined]
import re
pattern = re.compile(
r"\[" + role + r"\]\s*\n(.*?)(?=\n\[|$)",
re.DOTALL,
)
m = pattern.search(content)
if m:
text = m.group(1).strip()
if text:
return text
except Exception as e:
logger.warning("Failed to load SUMMARY_PROMPT_FILE for %s: %s", role, e)
# 3) Default prompts
if role == "chunk":
return (
"You are an expert legal and business meeting summarizer. " "You are an expert legal and business meeting summarizer. "
"You will receive a segment of a longer transcript. " "You will receive a segment of a longer transcript. "
"Provide a detailed, structured summary of this segment, focusing on: " "Provide a detailed, structured summary of this segment, focusing on: "
@@ -158,19 +196,11 @@ class SummarizerClient:
"- Decisions and agreements\n" "- Decisions and agreements\n"
"- Action items and responsibilities\n" "- Action items and responsibilities\n"
"- Any risks, conflicts, or open issues\n\n" "- Any risks, conflicts, or open issues\n\n"
"Be concise but complete. Use bullet points when helpful. " "Be concise but complete. Use bullet points where helpful. "
"Do not add information that is not present in the transcript." "Do not add information that is not present in the transcript."
) )
else:
user_prompt = ( return (
f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
f"{chunk}"
)
return self._chat_completion(system_prompt, user_prompt)
def _summarize_combined(self, combined_summaries: str) -> str:
system_prompt = (
"You are an expert legal and business meeting summarizer. " "You are an expert legal and business meeting summarizer. "
"You will receive several intermediate summaries of a longer conversation. " "You will receive several intermediate summaries of a longer conversation. "
"Produce a single, comprehensive summary that makes it clear: " "Produce a single, comprehensive summary that makes it clear: "
@@ -183,8 +213,22 @@ class SummarizerClient:
"The summary should be detailed enough that a reader who was not present " "The summary should be detailed enough that a reader who was not present "
"can understand what happened and what is expected going forward. " "can understand what happened and what is expected going forward. "
"Use clear, concise language and bullet points where appropriate. " "Use clear, concise language and bullet points where appropriate. "
"Use markdown formatting (headings, lists, bold) to structure the summary."
) )
def _summarize_chunk(self, chunk: str, index: int, total: int) -> str:
    """Summarize one transcript segment using the "chunk" system prompt.

    ``index`` is zero-based; the user prompt reports it one-based alongside
    ``total``.
    """
    instructions = self._load_summary_prompt("chunk")
    segment_message = (
        f"This is segment {index + 1} of {total} from a longer conversation.\n\n"
        f"{chunk}"
    )
    return self._chat_completion(instructions, segment_message)
def _summarize_combined(self, combined_summaries: str) -> str:
system_prompt = self._load_summary_prompt("combined")
user_prompt = ( user_prompt = (
"Here are the intermediate summaries from different parts of the same conversation:\n\n" "Here are the intermediate summaries from different parts of the same conversation:\n\n"
f"{combined_summaries}" f"{combined_summaries}"