From e27e5b8522afc61ae12c3b79d3b7b00dffdd0807 Mon Sep 17 00:00:00 2001 From: admin Date: Mon, 15 Jun 2026 03:38:12 +0000 Subject: [PATCH] Revert PDF generation; simplify to DOCX + MD + JSON only - Remove PDF helpers, LibreOffice, PyPDF2, reportlab - Transcript DOCX: standalone, no cover page, with line numbering - Summary DOCX: standalone, no cover page, no line numbering - Attachments: - Transcribe: JSON, transcript MD, transcript DOCX - Transcribe & Summarize: JSON, transcript MD, transcript DOCX, summary MD, summary DOCX --- Dockerfile | 9 +- requirements.txt | 2 - scraibe/email_sender.py | 364 +--------------------------------------- scraibe/tasks.py | 119 +------------ 4 files changed, 14 insertions(+), 480 deletions(-) diff --git a/Dockerfile b/Dockerfile index 65e9f3a..e7f3574 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,14 +9,9 @@ LABEL description="Scraibe: LocalAI-backed transcription and diarization client Sends audio to a LocalAI server running vibevoice.cpp and uses a second LLM for summarization." LABEL url="https://git.optimex.systems/admin/scribe" -# Install system dependencies (ffmpeg, redis, LibreOffice for PDF generation) +# Install system dependencies (ffmpeg, redis) RUN apt update -y && \ - apt install -y --no-install-recommends \ - ffmpeg \ - redis-server \ - libreoffice-writer \ - fonts-dejavu-core \ - && \ + apt install -y --no-install-recommends ffmpeg redis-server && \ apt clean && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* diff --git a/requirements.txt b/requirements.txt index 5f2189b..01b0c5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,5 +6,3 @@ PyYAML>=6.0 celery[redis]>=5.3.0 redis>=5.0.0 python-docx>=1.1.0 -PyPDF2>=3.0.0 -reportlab>=4.0.0 diff --git a/scraibe/email_sender.py b/scraibe/email_sender.py index a3b75e7..69c8cf9 100644 --- a/scraibe/email_sender.py +++ b/scraibe/email_sender.py @@ -13,8 +13,6 @@ import logging import os import re import smtplib -import subprocess -import tempfile from email import encoders from email.mime.base import MIMEBase from email.mime.multipart import MIMEMultipart @@ -336,58 +334,6 @@ def _enable_line_numbering(section, start_at=1, count_by=1, restart=True): sectPr.append(lnNumType) -def _add_cover_page(doc, doc_type, date, description): - """ - Add a cover page: - - Centered horizontally and vertically using a full-page table. - - Lines: - 1) Document type - 2) Date (e.g. "June 14, 2026") - 3-5) Empty space - 6) One-sentence description - """ - # Create a full-page table to center content vertically and horizontally - table = doc.add_table(rows=1, cols=1) - table.autofit = False - cell = table.cell(0, 0) - - # Make table span full page width (approx) - cell.width = Inches(6.5) - - # Center content inside the cell - for paragraph in cell.paragraphs: - paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER - - # Clear default paragraph - cell.paragraphs[0].clear() - - # Line 1: Document type - p_type = cell.add_paragraph() - p_type.alignment = WD_ALIGN_PARAGRAPH.CENTER - run_type = p_type.add_run(doc_type) - run_type.bold = True - run_type.font.name = "Courier" - run_type.font.size = Pt(12) - - # Line 2: Date - p_date = cell.add_paragraph() - p_date.alignment = WD_ALIGN_PARAGRAPH.CENTER - run_date = p_date.add_run(date) - run_date.font.name = "Courier" - run_date.font.size = Pt(12) - - # Lines 3-5: blank space - for _ in range(3): - cell.add_paragraph() - - # Line 6: Description - p_desc = cell.add_paragraph() - p_desc.alignment = WD_ALIGN_PARAGRAPH.CENTER - run_desc = p_desc.add_run(description) - run_desc.font.name = "Courier" - run_desc.font.size = Pt(12) - - def _add_transcript_content(doc, text): """ Add transcript lines to the document with formatting. @@ -467,146 +413,27 @@ def _add_summary_content(doc, text): p.paragraph_format.space_after = Pt(4) -# ------------ PDF helpers ------------ - -def _docx_to_pdf(docx_path: str, output_dir: str) -> str: - """ - Convert a .docx file to PDF using LibreOffice. - Returns the path of the generated PDF, or None on failure. - """ - try: - # Use LibreOffice in headless mode - result = subprocess.run( - [ - "libreoffice", - "--headless", - "--convert-to", "pdf", - "--outdir", output_dir, - docx_path, - ], - capture_output=True, - text=True, - timeout=120, - ) - - if result.returncode != 0: - logger.warning("LibreOffice conversion failed: %s", result.stderr) - - # LibreOffice creates a PDF with the same base name - base = os.path.splitext(os.path.basename(docx_path))[0] - pdf_path = os.path.join(output_dir, f"{base}.pdf") - - if os.path.exists(pdf_path): - return pdf_path - else: - logger.warning("Converted PDF not found at: %s", pdf_path) - return None - - except Exception as e: - logger.error("Error converting DOCX to PDF: %s", e) - return None - - -def _merge_pdfs(input_pdfs: List[str], output_pdf: str) -> bool: - """ - Merge multiple PDF files into a single PDF using PyPDF2. - Returns True on success, False on failure. - """ - try: - from PyPDF2 import PdfMerger - - merger = PdfMerger() - for pdf in input_pdfs: - if os.path.exists(pdf): - merger.append(pdf) - - merger.write(output_pdf) - merger.close() - return True - - except Exception as e: - logger.error("Error merging PDFs: %s", e) - return False - - -def _add_page_numbers_to_pdf(input_pdf: str, output_pdf: str) -> bool: - """ - Add page numbers to a PDF using reportlab. - Page numbers appear at the bottom-right of each page. - """ - try: - from reportlab.pdfgen import canvas - from reportlab.lib.pagesizes import letter - from reportlab.pdfbase import pdfmetrics - from reportlab.pdfbase.ttfonts import TTFont - from PyPDF2 import PdfReader - - # Try to use a standard font - font_name = "Courier" - try: - pdfmetrics.registerFont( - TTFont("Courier", "/usr/share/fonts/truetype/dejavu/DejaVuSansMono.ttf") - ) - except Exception: - pass - - reader = PdfReader(input_pdf) - num_pages = len(reader.pages) - - # Create a canvas to add page numbers - c = canvas.Canvas(output_pdf, pagesize=letter) - - for page_num in range(num_pages): - page = reader.pages[page_num] - c = canvas.Canvas(output_pdf, pagesize=letter) - - # Add page number - c.setFont(font_name, 10) - page_text = f"Page {page_num + 1} of {num_pages}" - text_width = c.stringWidth(page_text, font_name, 10) - x = letter[0] - 1 * 72 - text_width # 1 inch from right - y = 1 * 72 # 1 inch from bottom - c.drawString(x, y, page_text) - - c.showPage() - c.save() - - return True - - except Exception as e: - logger.error("Error adding page numbers to PDF: %s", e) - return False - - -# ------------ Public DOCX/PDF functions ------------ +# ------------ Public DOCX functions ------------ def create_transcript_docx( text: str, filename: str, - include_cover: bool = False, - cover_date: str = "", - cover_desc: str = "", ): """ Create a .docx transcript with: - 1" margins on all sides - 12pt Courier - - Continuous line numbering on the left (for transcript content only) - - Optional cover page with type, date, and AI-generated description. + - Continuous line numbering on the left + - No cover page (standalone transcript only). """ doc = Document() _configure_base_font(doc) _configure_section_margins(doc) - # 1) Optional cover page (no line numbering) - if include_cover and cover_date and cover_desc: - _add_cover_page(doc, "TRANSCRIPT", cover_date, cover_desc) - doc.add_page_break() - - # 2) Enable line numbering for transcript section + # Enable line numbering for transcript section _enable_line_numbering(doc.sections[0]) - # 3) Transcript content (with line numbering) + # Transcript content (with line numbering) _add_transcript_content(doc, text) doc.save(filename) @@ -615,198 +442,21 @@ def create_transcript_docx( def create_summary_docx( text: str, filename: str, - include_cover: bool = False, - cover_date: str = "", - cover_desc: str = "", ): """ Create a .docx summary with consistent font and heading styles. - Optional cover page with type, date, and AI-generated description. - No line numbering. + No cover page, no line numbering. """ doc = Document() _configure_base_font(doc) _configure_section_margins(doc) - # 1) Optional cover page - if include_cover and cover_date and cover_desc: - _add_cover_page(doc, "SUMMARY", cover_date, cover_desc) - doc.add_page_break() - - # 2) Summary content (no line numbering) + # Summary content (no line numbering) _add_summary_content(doc, text) doc.save(filename) -def create_combined_docx( - transcript_text: str, - summary_text: str, - filename: str, - transcript_cover_date: str, - transcript_cover_desc: str, - summary_cover_date: str, - summary_cover_desc: str, -): - """ - Create a combined .docx with: - 1) Transcript cover page (no line numbering) - 2) Page break - 3) Summary content (no line numbering) - 4) Page break - 5) Transcript content (line numbering enabled only here) - - Line numbering is restricted to the transcript section only. - """ - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - - # 1) Transcript cover page (no line numbering) - _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc) - doc.add_page_break() - - # 3) Summary content (no line numbering) - _add_summary_content(doc, summary_text) - doc.add_page_break() - - # 4) Enable line numbering for transcript section - _enable_line_numbering(doc.sections[0]) - - # 5) Transcript content (with line numbering) - _add_transcript_content(doc, transcript_text) - - doc.save(filename) - - -def generate_pdf_documents( - transcript_text: str, - summary_text: str, - output_dir: str, - transcript_cover_date: str, - transcript_cover_desc: str, - summary_cover_date: str, - summary_cover_desc: str, -) -> Dict[str, str]: - """ - Generate PDF documents by: - - Creating individual .docx components (cover pages, transcript, summary) - - Converting each .docx to PDF - - Merging PDFs in the correct order - - Adding page numbers to the final PDFs - - Behavior: - - Always: - - Generate TRANSCRIPT.pdf: - - transcript_cover.pdf + transcript_with_line_numbers.pdf - - If summary_text is provided: - - Generate SUMMARY.pdf: - - summary_cover.pdf + summary.pdf - - Generate COMBINED.pdf: - - transcript_cover.pdf + summary.pdf + transcript_with_line_numbers.pdf - - Returns a dict with paths: - - transcript_pdf (always) - - summary_pdf (if summary_text provided) - - combined_pdf (if summary_text provided) - """ - import os - - os.makedirs(output_dir, exist_ok=True) - - result = {} - - # Create temporary directory for intermediate files - with tempfile.TemporaryDirectory() as tmpdir: - # 1) Generate constituent DOCX files - - # Transcript cover - transcript_cover_docx = os.path.join(tmpdir, "TRANSCRIPT_COVER.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _add_cover_page(doc, "TRANSCRIPT", transcript_cover_date, transcript_cover_desc) - doc.save(transcript_cover_docx) - - # Summary cover (only if summary is requested) - summary_cover_docx = None - if summary_text: - summary_cover_docx = os.path.join(tmpdir, "SUMMARY_COVER.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _add_cover_page(doc, "SUMMARY", summary_cover_date, summary_cover_desc) - doc.save(summary_cover_docx) - - # Transcript (with line numbering) - transcript_docx = os.path.join(tmpdir, "TRANSCRIPT.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _enable_line_numbering(doc.sections[0]) - _add_transcript_content(doc, transcript_text) - doc.save(transcript_docx) - - # Summary (no line numbering) - summary_docx = None - if summary_text: - summary_docx = os.path.join(tmpdir, "SUMMARY.docx") - doc = Document() - _configure_base_font(doc) - _configure_section_margins(doc) - _add_summary_content(doc, summary_text) - doc.save(summary_docx) - - # 2) Convert DOCX to PDF - - transcript_cover_pdf = _docx_to_pdf(transcript_cover_docx, tmpdir) - transcript_pdf = _docx_to_pdf(transcript_docx, tmpdir) - - summary_cover_pdf = None - summary_pdf = None - if summary_text: - summary_cover_pdf = _docx_to_pdf(summary_cover_docx, tmpdir) - summary_pdf = _docx_to_pdf(summary_docx, tmpdir) - - # 3) Assemble TRANSCRIPT.pdf: transcript_cover + transcript + page numbers - transcript_output_pdf = os.path.join(output_dir, "TRANSCRIPT.pdf") - merged_transcript = os.path.join(tmpdir, "TRANSCRIPT_MERGED.pdf") - if ( - transcript_cover_pdf - and transcript_pdf - and _merge_pdfs([transcript_cover_pdf, transcript_pdf], merged_transcript) - and _add_page_numbers_to_pdf(merged_transcript, transcript_output_pdf) - ): - result["transcript_pdf"] = transcript_output_pdf - - # 4) If summary is provided: - # - SUMMARY.pdf: summary_cover + summary + page numbers - # - COMBINED.pdf: transcript_cover + summary + transcript + page numbers - if summary_text and summary_pdf and summary_cover_pdf: - # SUMMARY.pdf - summary_output_pdf = os.path.join(output_dir, "SUMMARY.pdf") - merged_summary = os.path.join(tmpdir, "SUMMARY_MERGED.pdf") - if ( - _merge_pdfs([summary_cover_pdf, summary_pdf], merged_summary) - and _add_page_numbers_to_pdf(merged_summary, summary_output_pdf) - ): - result["summary_pdf"] = summary_output_pdf - - # COMBINED.pdf - combined_output_pdf = os.path.join(output_dir, "COMBINED.pdf") - merged_combined = os.path.join(tmpdir, "COMBINED_MERGED.pdf") - if ( - _merge_pdfs( - [transcript_cover_pdf, summary_pdf, transcript_pdf], - merged_combined, - ) - and _add_page_numbers_to_pdf(merged_combined, combined_output_pdf) - ): - result["combined_pdf"] = combined_output_pdf - - return result - - def send_success_email( to: str, transcript_text: str, diff --git a/scraibe/tasks.py b/scraibe/tasks.py index c4a6da4..da8d1e9 100644 --- a/scraibe/tasks.py +++ b/scraibe/tasks.py @@ -14,7 +14,6 @@ from .summarizer import SummarizerClient, SummarizerError from .misc import setup_logging from .email_sender import send_email, EmailError, load_template from .email_sender import create_transcript_docx, create_summary_docx -from .email_sender import create_combined_docx, generate_pdf_documents logger = logging.getLogger("scraibe.tasks") @@ -93,7 +92,7 @@ def get_queue_position(task_id: str) -> int: if found: break - # If not found in reserved, it may already be active or not yet visible. + # If not found in reserved, it may already be active or not yet visible. # In that case, treat it as position 1. if found: return max(ahead + 1, 1) @@ -415,59 +414,6 @@ def process_transcription_task( ) speaker_map = {} - # 3c) Generate short cover-page descriptions using summarizer - transcript_cover_desc = "" - summary_cover_desc = "" - dt = datetime.utcnow() - today_str = f"{dt.strftime('%B')} {dt.day}, {dt.year}" - - try: - scraibe._ensure_summarizer() - summarizer = scraibe._summarizer - - # Transcript description - prompt = ( - "In one short sentence (max 25 words), describe the content of this transcript " - "for use on a cover page. Do not include speaker names. Output only the sentence.\n\n" - + transcript_text - ) - response = summarizer._chat_completion( - messages=[{"role": "user", "content": prompt}], - temperature=0.3, - max_tokens=60, - ) - transcript_cover_desc = ( - (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "") - ).strip() - if not transcript_cover_desc: - transcript_cover_desc = "Transcript of the recorded conversation." - logger.info("Transcript cover description: %s", transcript_cover_desc) - - # Summary description (if summary requested) - if include_summary: - prompt = ( - "In one short sentence (max 25 words), describe the content of this summary " - "for use on a cover page. Output only the sentence.\n\n" - + summary_text - ) - response = summarizer._chat_completion( - messages=[{"role": "user", "content": prompt}], - temperature=0.3, - max_tokens=60, - ) - summary_cover_desc = ( - (response or {}).get("choices", [{}])[0].get("message", {}).get("content", "") - ).strip() - if not summary_cover_desc: - summary_cover_desc = "Summary of the recorded conversation." - logger.info("Summary cover description: %s", summary_cover_desc) - - except (SummarizerError, Exception) as e: - logger.warning("Failed to generate cover descriptions: %s", e) - transcript_cover_desc = transcript_cover_desc or "Transcript of the recorded conversation." - if include_summary: - summary_cover_desc = summary_cover_desc or "Summary of the recorded conversation." - # 4) Prepare files # Transcript .md @@ -477,14 +423,11 @@ def process_transcription_task( f.write(transcript_text) temp_files.append(md_transcript_path) - # Transcript .docx (standalone, with cover page) + # Transcript .docx (standalone, no cover page) docx_transcript_path = _safe_filename("TRANSCRIPT", local, date_tag, ".docx") create_transcript_docx( transcript_text, docx_transcript_path, - include_cover=True, - cover_date=today_str, - cover_desc=transcript_cover_desc, ) temp_files.append(docx_transcript_path) @@ -511,7 +454,6 @@ def process_transcription_task( # Summary files (if present) md_summary_path = None docx_summary_path = None - docx_combined_path = None if summary_text: # Summary .md @@ -521,77 +463,26 @@ def process_transcription_task( f.write(summary_text) temp_files.append(md_summary_path) - # Summary .docx (standalone, with cover page) + # Summary .docx (standalone, no cover page) docx_summary_path = _safe_filename("SUMMARY", local, date_tag, ".docx") create_summary_docx( summary_text, docx_summary_path, - include_cover=True, - cover_date=today_str, - cover_desc=summary_cover_desc, ) temp_files.append(docx_summary_path) - # Combined .docx: Transcript + Summary with cover pages - docx_combined_path = _safe_filename("TRANSCRIPT_AND_SUMMARY", local, date_tag, ".docx") - create_combined_docx( - transcript_text=transcript_text, - summary_text=summary_text, - filename=docx_combined_path, - transcript_cover_date=today_str, - transcript_cover_desc=transcript_cover_desc, - summary_cover_date=today_str, - summary_cover_desc=summary_cover_desc, - ) - temp_files.append(docx_combined_path) - - # 4b) Generate PDF documents - - # Always use transcript text; use summary_text only if include_summary is True - pdf_summary_text = summary_text if include_summary else "" - - pdf_files = {} - try: - pdf_output_dir = tempfile.mkdtemp(prefix="pdf_output_") - pdf_files = generate_pdf_documents( - transcript_text=transcript_text, - summary_text=pdf_summary_text, - output_dir=pdf_output_dir, - transcript_cover_date=today_str, - transcript_cover_desc=transcript_cover_desc, - summary_cover_date=today_str, - summary_cover_desc=summary_cover_desc, - ) - for pdf_path in pdf_files.values(): - if pdf_path and os.path.exists(pdf_path): - temp_files.append(pdf_path) - except Exception as e: - logger.warning("PDF generation failed (continuing with DOCX attachments): %s", e) - # 5) Build attachments list - # Base attachments: JSON, transcript MD, transcript DOCX + # Always: JSON, transcript MD, transcript DOCX attachments = [ md_transcript_path, docx_transcript_path, json_path, ] - # If summary is present, add summary MD and DOCXs + # If summary is present, add summary MD and DOCX if summary_text: attachments += [md_summary_path, docx_summary_path] - if docx_combined_path: - attachments.append(docx_combined_path) - - # Add PDFs based on mode: - # - Always: TRANSCRIPT.pdf - # - If summary: SUMMARY.pdf and COMBINED.pdf - if pdf_files.get("transcript_pdf"): - attachments.append(pdf_files["transcript_pdf"]) - if pdf_files.get("summary_pdf"): - attachments.append(pdf_files["summary_pdf"]) - if pdf_files.get("combined_pdf"): - attachments.append(pdf_files["combined_pdf"]) # 6) Send success email send_success_email(