Revert PDF generation; simplify to DOCX + MD + JSON only

- Remove PDF helpers, LibreOffice, PyPDF2, reportlab
- Transcript DOCX: standalone, no cover page, with line numbering
- Summary DOCX: standalone, no cover page, no line numbering
- Attachments:
  - Transcribe: JSON, transcript MD, transcript DOCX
  - Transcribe & Summarize: JSON, transcript MD, transcript DOCX, summary MD, summary DOCX
2026-06-15 03:38:12 +00:00
parent 6233a41f61
commit e27e5b8522
4 changed files with 14 additions and 480 deletions
@@ -9,14 +9,9 @@ LABEL description="Scraibe: LocalAI-backed transcription and diarization client
                    Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization."
 LABEL url="https://git.optimex.systems/admin/scribe"

-# Install system dependencies (ffmpeg, redis, LibreOffice for PDF generation)
+# Install system dependencies (ffmpeg, redis)
 RUN apt update -y && \
-    apt install -y --no-install-recommends \
-        ffmpeg \
-        redis-server \
-        libreoffice-writer \
-        fonts-dejavu-core \
-        && \
+    apt install -y --no-install-recommends ffmpeg redis-server && \
    apt clean && \
    rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

@@ -6,5 +6,3 @@ PyYAML>=6.0
 celery[redis]>=5.3.0
 redis>=5.0.0
 python-docx>=1.1.0
-PyPDF2>=3.0.0
-reportlab>=4.0.0
@@ -13,8 +13,6 @@ import logging
 import os
 import re
 import smtplib
-import subprocess
-import tempfile
 from email import encoders
 from email.mime.base import MIMEBase
 from email.mime.multipart import MIMEMultipart
@@ -336,58 +334,6 @@ def _enable_line_numbering(section, start_at=1, count_by=1, restart=True):
    sectPr.append(lnNumType)


-def _add_cover_page(doc, doc_type, date, description):
-    """
-    Add a cover page:
-      - Centered horizontally and vertically using a full-page table.
-      - Lines:
-          1) Document type
-          2) Date (e.g. "June 14, 2026")
-          3-5) Empty space
-          6) One-sentence description
-    """
-    # Create a full-page table to center content vertically and horizontally
-    table = doc.add_table(rows=1, cols=1)
-    table.autofit = False
-    cell = table.cell(0, 0)
-
-    # Make table span full page width (approx)
-    cell.width = Inches(6.5)
-
-    # Center content inside the cell
-    for paragraph in cell.paragraphs:
-        paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
-
-    # Clear default paragraph
-    cell.paragraphs[0].clear()
-
-    # Line 1: Document type
-    p_type = cell.add_paragraph()
-    p_type.alignment = WD_ALIGN_PARAGRAPH.CENTER
-    run_type = p_type.add_run(doc_type)
-    run_type.bold = True
-    run_type.font.name = "Courier"
-    run_type.font.size = Pt(12)
-
-    # Line 2: Date
-    p_date = cell.add_paragraph()
-    p_date.alignment = WD_ALIGN_PARAGRAPH.CENTER
-    run_date = p_date.add_run(date)
-    run_date.font.name = "Courier"
-    run_date.font.size = Pt(12)
-
-    # Lines 3-5: blank space
-    for _ in range(3):
-        cell.add_paragraph()
-
-    # Line 6: Description
-    p_desc = cell.add_paragraph()
-    p_desc.alignment = WD_ALIGN_PARAGRAPH.CENTER
-    run_desc = p_desc.add_run(description)
-    run_desc.font.name = "Courier"
-    run_desc.font.size = Pt(12)
-
-
 def _add_transcript_content(doc, text):
    """
    Add transcript lines to the document with formatting.
@@ -467,146 +413,27 @@ def _add_summary_content(doc, text):
            p.paragraph_format.space_after = Pt(4)


-# ------------ PDF helpers ------------
-
-def _docx_to_pdf(docx_path: str, output_dir: str) -> str:
-    """
-    Convert a .docx file to PDF using LibreOffice.
-    Returns the path of the generated PDF, or None on failure.
-    """
-    try:
-        # Use LibreOffice in headless mode
-        result = subprocess.run(
-            [
-                "libreoffice",
-                "--headless",
-                "--convert-to", "pdf",
-                "--outdir", output_dir,
-                docx_path,
-            ],
-            capture_output=True,
-            text=True,
-            timeout=120,
-        )
-
-        if result.returncode != 0:
-            logger.warning("LibreOffice conversion failed: %s", result.stderr)
-
-        # LibreOffice creates a PDF with the same base name
-        base = os.path.splitext(os.path.basename(docx_path))[0]
-        pdf_path = os.path.join(output_dir, f"{base}.pdf")
-
-        if os.path.exists(pdf_path):
-            return pdf_path
-        else:
-            logger.warning("Converted PDF not found at: %s", pdf_path)
-            return None
-
-    except Exception as e:
-        logger.error("Error converting DOCX to PDF: %s", e)
-        return None
-
-
-def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool:
-    """
-    Merge multiple PDF files into a single PDF using PyPDF2.
-    Returns True on success, False on failure.
-    """
-    try:
-        from PyPDF2 import PdfMerger
-
-        merger = PdfMerger()
-        for pdf in input_pdfs:
-            if os.path.exists(pdf):
-                merger.append(pdf)
-
-        merger.write(output_pdf)
-        merger.close()
-        return True
-
-    except Exception as e:
-        logger.error("Error merging PDFs: %s", e)
-        return False
-
-
-def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool:
-    """
-    Add page numbers to a PDF using reportlab.
-    Page numbers appear at the bottom-right of each page.
-    """
-    try:
-        from reportlab.pdfgen import canvas
-        from reportlab.lib.pagesizes import letter
-        from reportlab.pdfbase import pdfmetrics
-        from reportlab.pdfbase.ttfonts import TTFont
-        from PyPDF2 import PdfReader
-
-        # Try to use a standard font
-        font_name = "Courier"
-        try:
-            pdfmetrics.registerFont(
-                TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf")
-            )
-        except Exception:
-            pass
-
-        reader = PdfReader(input_pdf)
-        num_pages = len(reader.pages)
-
-        # Create a canvas to add page numbers
-        c = canvas.Canvas(output_pdf, pagesize=letter)
-
-        for page_num in range(num_pages):
-            page = reader.pages[page_num]
-            c = canvas.Canvas(output_pdf, pagesize=letter)
-
-            # Add page number
-            c.setFont(font_name, 10)
-            page_text = f"Page {page_num + 1} of {num_pages}"
-            text_width = c.stringWidth(page_text, font_name, 10)
-            x = letter[0] - 1 * 72 - text_width  # 1 inch from right
-            y = 1 * 72  # 1 inch from bottom
-            c.drawString(x, y, page_text)
-
-            c.showPage()
-            c.save()
-
-        return True
-
-    except Exception as e:
-        logger.error("Error adding page numbers to PDF: %s", e)
-        return False
-
-
-# ------------ Public DOCX/PDF functions ------------
+# ------------ Public DOCX functions ------------

 def create_transcript_docx(
    text: str,
    filename: str,
-    include_cover: bool = False,
-    cover_date: str = "",
-    cover_desc: str = "",
 ):
    """
    Create a .docx transcript with:
      - 1" margins on all sides
      - 12pt Courier
-      - Continuous line numbering on the left (for transcript content only)
-      - Optional cover page with type, date, and AI-generated description.
+      - Continuous line numbering on the left
+      - No cover page (standalone transcript only).
    """
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

-    # 1) Optional cover page (no line numbering)
-    if include_cover and cover_date and cover_desc:
-        _add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc)
-        doc.add_page_break()
-
-    # 2) Enable line numbering for transcript section
+    # Enable line numbering for transcript section
    _enable_line_numbering(doc.sections[0])

-    # 3) Transcript content (with line numbering)
+    # Transcript content (with line numbering)
    _add_transcript_content(doc, text)

    doc.save(filename)
@@ -615,198 +442,21 @@ def create_transcript_docx(
 def create_summary_docx(
    text: str,
    filename: str,
-    include_cover: bool = False,
-    cover_date: str = "",
-    cover_desc: str = "",
 ):
    """
    Create a .docx summary with consistent font and heading styles.
-    Optional cover page with type, date, and AI-generated description.
-    No line numbering.
+    No cover page, no line numbering.
    """
    doc = Document()
    _configure_base_font(doc)
    _configure_section_margins(doc)

-    # 1) Optional cover page
-    if include_cover and cover_date and cover_desc:
-        _add_cover_page(doc, "SUMMARY", cover_date, cover_desc)
-        doc.add_page_break()
-
-    # 2) Summary content (no line numbering)
+    # Summary content (no line numbering)
    _add_summary_content(doc, text)

    doc.save(filename)


-def create_combined_docx(
-    transcript_text: str,
-    summary_text: str,
-    filename: str,
-    transcript_cover_date: str,
-    transcript_cover_desc: str,
-    summary_cover_date: str,
-    summary_cover_desc: str,
-):
-    """
-    Create a combined .docx with:
-      1) Transcript cover page (no line numbering)
-      2) Page break
-      3) Summary content (no line numbering)
-      4) Page break
-      5) Transcript content (line numbering enabled only here)
-
-    Line numbering is restricted to the transcript section only.
-    """
-    doc = Document()
-    _configure_base_font(doc)
-    _configure_section_margins(doc)
-
-    # 1) Transcript cover page (no line numbering)
-    _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
-    doc.add_page_break()
-
-    # 3) Summary content (no line numbering)
-    _add_summary_content(doc, summary_text)
-    doc.add_page_break()
-
-    # 4) Enable line numbering for transcript section
-    _enable_line_numbering(doc.sections[0])
-
-    # 5) Transcript content (with line numbering)
-    _add_transcript_content(doc, transcript_text)
-
-    doc.save(filename)
-
-
-def generate_pdf_documents(
-    transcript_text: str,
-    summary_text: str,
-    output_dir: str,
-    transcript_cover_date: str,
-    transcript_cover_desc: str,
-    summary_cover_date: str,
-    summary_cover_desc: str,
-) -> Dict[str, str]:
-    """
-    Generate PDF documents by:
-      - Creating individual .docx components (cover pages, transcript, summary)
-      - Converting each .docx to PDF
-      - Merging PDFs in the correct order
-      - Adding page numbers to the final PDFs
-
-    Behavior:
-      - Always:
-        - Generate TRANSCRIPT.pdf:
-          - transcript_cover.pdf + transcript_with_line_numbers.pdf
-      - If summary_text is provided:
-        - Generate SUMMARY.pdf:
-          - summary_cover.pdf + summary.pdf
-        - Generate COMBINED.pdf:
-          - transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf
-
-    Returns a dict with paths:
-      - transcript_pdf (always)
-      - summary_pdf (if summary_text provided)
-      - combined_pdf (if summary_text provided)
-    """
-    import os
-
-    os.makedirs(output_dir, exist_ok=True)
-
-    result = {}
-
-    # Create temporary directory for intermediate files
-    with tempfile.TemporaryDirectory() as tmpdir:
-        # 1) Generate constituent DOCX files
-
-        # Transcript cover
-        transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx")
-        doc = Document()
-        _configure_base_font(doc)
-        _configure_section_margins(doc)
-        _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc)
-        doc.save(transcript_cover_docx)
-
-        # Summary cover (only if summary is requested)
-        summary_cover_docx = None
-        if summary_text:
-            summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx")
-            doc = Document()
-            _configure_base_font(doc)
-            _configure_section_margins(doc)
-            _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc)
-            doc.save(summary_cover_docx)
-
-        # Transcript (with line numbering)
-        transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx")
-        doc = Document()
-        _configure_base_font(doc)
-        _configure_section_margins(doc)
-        _enable_line_numbering(doc.sections[0])
-        _add_transcript_content(doc, transcript_text)
-        doc.save(transcript_docx)
-
-        # Summary (no line numbering)
-        summary_docx = None
-        if summary_text:
-            summary_docx = os.path.join(tmpdir, "SUMMARY.docx")
-            doc = Document()
-            _configure_base_font(doc)
-            _configure_section_margins(doc)
-            _add_summary_content(doc, summary_text)
-            doc.save(summary_docx)
-
-        # 2) Convert DOCX to PDF
-
-        transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir)
-        transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir)
-
-        summary_cover_pdf = None
-        summary_pdf = None
-        if summary_text:
-            summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir)
-            summary_pdf = _docx_to_pdf(summary_docx, tmpdir)
-
-        # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers
-        transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf")
-        merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf")
-        if (
-            transcript_cover_pdf
-            and transcript_pdf
-            and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript)
-            and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf)
-        ):
-            result["transcript_pdf"] = transcript_output_pdf
-
-        # 4) If summary is provided:
-        #    - SUMMARY.pdf: summary_cover + summary + page numbers
-        #    - COMBINED.pdf: transcript_cover + summary + transcript + page numbers
-        if summary_text and summary_pdf and summary_cover_pdf:
-            # SUMMARY.pdf
-            summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf")
-            merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf")
-            if (
-                _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary)
-                and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf)
-            ):
-                result["summary_pdf"] = summary_output_pdf
-
-            # COMBINED.pdf
-            combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf")
-            merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf")
-            if (
-                _merge_pdfs(
-                    [transcript_cover_pdf, summary_pdf, transcript_pdf],
-                    merged_combined,
-                )
-                and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf)
-            ):
-                result["combined_pdf"] = combined_output_pdf
-
-    return result
-
-
 def send_success_email(
    to: str,
    transcript_text: str,
@@ -14,7 +14,6 @@ from .summarizer import SummarizerClient, SummarizerError
 from .misc import setup_logging
 from .email_sender import send_email, EmailError, load_template
 from .email_sender import create_transcript_docx, create_summary_docx
-from .email_sender import create_combined_docx, generate_pdf_documents

 logger = logging.getLogger("scraibe.tasks")

@@ -93,7 +92,7 @@ def get_queue_position(task_id: str) -> int:
            if found:
                break

-        # If not found in reserved, it may already be active or not yet visible. 
+        # If not found in reserved, it may already be active or not yet visible.
        # In that case, treat it as position 1.
        if found:
            return max(ahead + 1, 1)
@@ -415,59 +414,6 @@ def process_transcription_task(
                )
                speaker_map = {}

-        # 3c) Generate short cover-page descriptions using summarizer
-        transcript_cover_desc = ""
-        summary_cover_desc = ""
-        dt = datetime.utcnow()
-        today_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}"
-
-        try:
-            scraibe._ensure_summarizer()
-            summarizer = scraibe._summarizer
-
-            # Transcript description
-            prompt = (
-                "In one short sentence (max 25 words), describe the content of this transcript "
-                "for use on a cover page. Do not include speaker names. Output only the sentence.\n\n"
-                + transcript_text
-            )
-            response = summarizer._chat_completion(
-                messages=[{"role": "user", "content": prompt}],
-                temperature=0.3,
-                max_tokens=60,
-            )
-            transcript_cover_desc = (
-                (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "")
-            ).strip()
-            if not transcript_cover_desc:
-                transcript_cover_desc = "Transcript of the recorded conversation."
-            logger.info("Transcript cover description: %s", transcript_cover_desc)
-
-            # Summary description (if summary requested)
-            if include_summary:
-                prompt = (
-                    "In one short sentence (max 25 words), describe the content of this summary "
-                    "for use on a cover page. Output only the sentence.\n\n"
-                    + summary_text
-                )
-                response = summarizer._chat_completion(
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=0.3,
-                    max_tokens=60,
-                )
-                summary_cover_desc = (
-                    (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "")
-                ).strip()
-                if not summary_cover_desc:
-                    summary_cover_desc = "Summary of the recorded conversation."
-                logger.info("Summary cover description: %s", summary_cover_desc)
-
-        except (SummarizerError, Exception) as e:
-            logger.warning("Failed to generate cover descriptions: %s", e)
-            transcript_cover_desc = transcript_cover_desc or "Transcript of the recorded conversation."
-            if include_summary:
-                summary_cover_desc = summary_cover_desc or "Summary of the recorded conversation."
-
        # 4) Prepare files

        # Transcript .md
@@ -477,14 +423,11 @@ def process_transcription_task(
            f.write(transcript_text)
        temp_files.append(md_transcript_path)

-        # Transcript .docx (standalone, with cover page)
+        # Transcript .docx (standalone, no cover page)
        docx_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".docx")
        create_transcript_docx(
            transcript_text,
            docx_transcript_path,
-            include_cover=True,
-            cover_date=today_str,
-            cover_desc=transcript_cover_desc,
        )
        temp_files.append(docx_transcript_path)

@@ -511,7 +454,6 @@ def process_transcription_task(
        # Summary files (if present)
        md_summary_path = None
        docx_summary_path = None
-        docx_combined_path = None

        if summary_text:
            # Summary .md
@@ -521,77 +463,26 @@ def process_transcription_task(
                f.write(summary_text)
            temp_files.append(md_summary_path)

-            # Summary .docx (standalone, with cover page)
+            # Summary .docx (standalone, no cover page)
            docx_summary_path = _safe_filename("SUMMARY", local, date_tag, ".docx")
            create_summary_docx(
                summary_text,
                docx_summary_path,
-                include_cover=True,
-                cover_date=today_str,
-                cover_desc=summary_cover_desc,
            )
            temp_files.append(docx_summary_path)

-            # Combined .docx: Transcript + Summary with cover pages
-            docx_combined_path = _safe_filename("TRANSCRIPT_AND_SUMMARY", local, date_tag, ".docx")
-            create_combined_docx(
-                transcript_text=transcript_text,
-                summary_text=summary_text,
-                filename=docx_combined_path,
-                transcript_cover_date=today_str,
-                transcript_cover_desc=transcript_cover_desc,
-                summary_cover_date=today_str,
-                summary_cover_desc=summary_cover_desc,
-            )
-            temp_files.append(docx_combined_path)
-
-        # 4b) Generate PDF documents
-
-        # Always use transcript text; use summary_text only if include_summary is True
-        pdf_summary_text = summary_text if include_summary else ""
-
-        pdf_files = {}
-        try:
-            pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_")
-            pdf_files = generate_pdf_documents(
-                transcript_text=transcript_text,
-                summary_text=pdf_summary_text,
-                output_dir=pdf_output_dir,
-                transcript_cover_date=today_str,
-                transcript_cover_desc=transcript_cover_desc,
-                summary_cover_date=today_str,
-                summary_cover_desc=summary_cover_desc,
-            )
-            for pdf_path in pdf_files.values():
-                if pdf_path and os.path.exists(pdf_path):
-                    temp_files.append(pdf_path)
-        except Exception as e:
-            logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e)
-
        # 5) Build attachments list

-        # Base attachments: JSON, transcript MD, transcript DOCX
+        # Always: JSON, transcript MD, transcript DOCX
        attachments = [
            md_transcript_path,
            docx_transcript_path,
            json_path,
        ]

-        # If summary is present, add summary MD and DOCXs
+        # If summary is present, add summary MD and DOCX
        if summary_text:
            attachments += [md_summary_path, docx_summary_path]
-            if docx_combined_path:
-                attachments.append(docx_combined_path)
-
-        # Add PDFs based on mode:
-        # - Always: TRANSCRIPT.pdf
-        # - If summary: SUMMARY.pdf and COMBINED.pdf
-        if pdf_files.get("transcript_pdf"):
-            attachments.append(pdf_files["transcript_pdf"])
-        if pdf_files.get("summary_pdf"):
-            attachments.append(pdf_files["summary_pdf"])
-        if pdf_files.get("combined_pdf"):
-            attachments.append(pdf_files["combined_pdf"])

        # 6) Send success email
        send_success_email(