From 3b05711e920d7974653356e2121d29889b8f863f Mon Sep 17 00:00:00 2001
From: Andy <andy+github@savage.hk>
Date: Tue, 12 Aug 2025 23:48:56 +0800
Subject: [PATCH] feat(mcp): core author/edit/understand APIs and PDF path
 polish

- Authoring/editing: paragraph formatting tool; expanded list/table ops kept
- Understanding: get_tables, list_images, list_hyperlinks, get_fields_summary
- Safety: strip_personal_info that scrubs core.xml best-effort
- Tests remain green; pure-Rust + hi-fidelity export path preserved
---
 src/docx_handler.rs | 159 ++++++++++++++++++++++++++++++++++++++++++++
 src/docx_tools.rs   | 136 +++++++++++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+)
diff --git a/src/docx_handler.rs b/src/docx_handler.rs
index cc84d18..63e3184 100644
--- a/src/docx_handler.rs
+++ b/src/docx_handler.rs
@@ -628,6 +628,165 @@ impl DocxHandler {
             .map(|m| m.clone())
     }
 
+    /// Apply `new_format` to paragraph ops whose text contains `contains` (every paragraph when `None`).
+    ///
+    /// Fields set in `new_format` override the paragraph's existing style; unset fields keep their current
+    /// value. Returns the number of paragraphs updated and calls `write_docx` only when it is non-zero.
+    pub fn apply_paragraph_format(
+        &mut self,
+        doc_id: &str,
+        contains: Option<&str>,
+        new_format: DocxStyle,
+    ) -> Result<usize> {
+        self.ensure_modifiable(doc_id)?;
+        let ops = self.in_memory_ops.get_mut(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let mut touched = 0usize;
+        for op in ops.iter_mut() {
+            let (text, style) = match op { DocxOp::Paragraph { text, style } => (text, style), _ => continue };
+            if !contains.map_or(true, |needle| text.contains(needle)) { continue; }
+            let mut merged = style.take().unwrap_or(DocxStyle {
+                font_family: None, font_size: None, bold: None, italic: None, underline: None,
+                color: None, alignment: None, line_spacing: None,
+            });
+            if let Some(family) = &new_format.font_family { merged.font_family = Some(family.clone()); }
+            if let Some(color) = &new_format.color { merged.color = Some(color.clone()); }
+            if let Some(alignment) = &new_format.alignment { merged.alignment = Some(alignment.clone()); }
+            merged.font_size = new_format.font_size.or(merged.font_size);
+            merged.bold = new_format.bold.or(merged.bold);
+            merged.italic = new_format.italic.or(merged.italic);
+            merged.underline = new_format.underline.or(merged.underline);
+            merged.line_spacing = new_format.line_spacing.or(merged.line_spacing);
+            *style = Some(merged);
+            touched += 1;
+        }
+        if touched > 0 { self.write_docx(doc_id)?; }
+        Ok(touched)
+    }
+
+    /// Describe each table op as JSON: `{"tables": [{index, rows, cols, col_widths, merges, cells}]}`.
+    ///
+    /// `index` is the table's position in the op list and `cols` is the length of the first row (0 if none).
+    pub fn get_tables_json(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let tables: Vec<serde_json::Value> = ops.iter().enumerate()
+            .filter_map(|(position, op)| match op {
+                DocxOp::Table { data } => Some(serde_json::json!({
+                    "index": position,
+                    "rows": data.rows.len(),
+                    "cols": data.rows.first().map_or(0, |row| row.len()),
+                    "col_widths": data.col_widths,
+                    "merges": data.merges,
+                    "cells": data.rows,
+                })),
+                _ => None,
+            })
+            .collect();
+        Ok(serde_json::json!({ "tables": tables }))
+    }
+
+    /// List every image op as `{"images": [{index, width, height, alt_text}]}`.
+    /// `index` is the image's position in the document's op list.
+    pub fn list_images(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let images: Vec<serde_json::Value> = ops.iter().enumerate().filter_map(|(position, op)| match op {
+            DocxOp::Image { width, height, alt_text, .. } =>
+                Some(serde_json::json!({"index": position, "width": width, "height": height, "alt_text": alt_text})),
+            _ => None,
+        }).collect();
+        Ok(serde_json::json!({"images": images}))
+    }
+
+    /// List every hyperlink op as `{"hyperlinks": [{index, text, url}]}`.
+    /// `index` is the link's position in the document's op list.
+    pub fn list_hyperlinks(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let links: Vec<serde_json::Value> = ops.iter().enumerate().filter_map(|(position, op)| match op {
+            DocxOp::Hyperlink { text, url } =>
+                Some(serde_json::json!({"index": position, "text": text, "url": url})),
+            _ => None,
+        }).collect();
+        Ok(serde_json::json!({"hyperlinks": links}))
+    }
+
+    /// Summarize Word field instructions (`w:instrText`) in the main document and header/footer parts.
+    /// Best-effort: parts that cannot be opened are skipped. `kind` is TOC, NUMPAGES, PAGE or OTHER.
+    pub fn get_fields_summary(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let metadata = self.documents.get(doc_id).ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
+        let mut archive = ZipArchive::new(std::fs::File::open(&metadata.path)?)?;
+        let mut parts = vec!["word/document.xml".to_string()];
+        for i in 0..archive.len() {
+            let name = archive.by_index(i)?.name().to_string();
+            if (name.starts_with("word/header") || name.starts_with("word/footer")) && name.ends_with(".xml") {
+                parts.push(name);
+            }
+        }
+        let instr_re = regex::Regex::new(r"<w:instrText[\s\S]*?>([\s\S]*?)</w:instrText>")?; // compiled once, not per part
+        let mut fields: Vec<serde_json::Value> = Vec::new();
+        for part in parts {
+            if let Ok(mut f) = archive.by_name(&part) {
+                let mut xml = String::new();
+                std::io::Read::read_to_string(&mut f, &mut xml)?;
+                for cap in instr_re.captures_iter(&xml) {
+                    let instr = cap.get(1).map(|m| m.as_str().trim()).unwrap_or("").to_string();
+                    // Test NUMPAGES before PAGE: "NUMPAGES" contains "PAGE", so the reverse order never reports it.
+                    let kind = if instr.contains("TOC") { "TOC" } else if instr.contains("NUMPAGES") { "NUMPAGES" } else if instr.contains("PAGE") { "PAGE" } else { "OTHER" };
+                    fields.push(serde_json::json!({"part": part, "instruction": instr, "kind": kind}));
+                }
+            }
+        }
+        Ok(serde_json::json!({"fields": fields}))
+    }
+
+    /// Remove personal info (best-effort): clear in-memory author/title/subject and scrub `docProps/core.xml`.
+    ///
+    /// The creator, lastModifiedBy, title and subject elements are emptied and the package is rewritten
+    /// through a `.docx.tmp` sibling that is renamed over the original. Returns `Ok(())` without touching
+    /// the file when core.xml cannot be opened; other I/O, zip and regex errors are propagated.
+    /// When repacking fails the temp file is removed before the error is returned.
+    pub fn strip_personal_info(&mut self, doc_id: &str) -> Result<()> {
+        use std::io::{Read as _, Write as _};
+        let meta = self.documents.get_mut(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
+        meta.author = None; meta.title = None; meta.subject = None;
+        let mut archive = ZipArchive::new(std::fs::File::open(&meta.path)?)?;
+        let mut core_xml = String::new();
+        match archive.by_name("docProps/core.xml") {
+            Ok(mut f) => { f.read_to_string(&mut core_xml)?; }
+            Err(_) => return Ok(()), // no readable core.xml on disk, nothing further to scrub
+        }
+        // `(?s)` lets `.` span newlines so values broken across lines are scrubbed as well.
+        for tag in ["dc:creator", "cp:lastModifiedBy", "dc:title", "dc:subject"].iter() {
+            let element = regex::Regex::new(&format!(r"(?s)<{0}>.*?</{0}>", tag))?;
+            core_xml = element.replace_all(&core_xml, format!("<{0}></{0}>", tag).as_str()).into_owned();
+        }
+        let temp_path = meta.path.with_extension("docx.tmp");
+        // Repack into the temp file, swapping in the scrubbed core.xml, then rename it over the original.
+        let repacked = (|| -> Result<()> {
+            let mut writer = ZipWriter::new(std::fs::File::create(&temp_path)?);
+            for i in 0..archive.len() {
+                let mut file = archive.by_index(i)?;
+                let name = file.name().to_string();
+                // Keep each entry's own compression method; storing everything would bloat the package.
+                writer.start_file(name.clone(), FileOptions::default().compression_method(file.compression()))?;
+                if name == "docProps/core.xml" {
+                    writer.write_all(core_xml.as_bytes())?;
+                } else {
+                    std::io::copy(&mut file, &mut writer)?;
+                }
+            }
+            writer.finish()?;
+            std::fs::rename(&temp_path, &meta.path)?;
+            Ok(())
+        })();
+        // Do not leave a partially written temp file behind when repacking fails.
+        if repacked.is_err() { let _ = std::fs::remove_file(&temp_path); }
+        repacked
+    }
+
     /// Update document core properties stored in our metadata (best-effort)
     pub fn set_document_properties(
         &mut self,
diff --git a/src/docx_tools.rs b/src/docx_tools.rs
index 017ccec..d516b8e 100644
--- a/src/docx_tools.rs
+++ b/src/docx_tools.rs
@@ -440,6 +440,32 @@ impl DocxToolsProvider {
                 }),
                 annotations: None,
             },
+            Tool {
+                name: "apply_paragraph_format".to_string(),
+                description: Some("Apply paragraph formatting to paragraphs matching a simple selector".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "document_id": {"type": "string"},
+                        "contains": {"type": "string", "description": "Substring to match in paragraph text"},
+                        "format": {
+                            "type": "object",
+                            "properties": {
+                                "font_family": {"type": "string"},
+                                "font_size": {"type": "integer"},
+                                "bold": {"type": "boolean"},
+                                "italic": {"type": "boolean"},
+                                "underline": {"type": "boolean"},
+                                "color": {"type": "string"},
+                                "alignment": {"type": "string"},
+                                "line_spacing": {"type": "number"}
+                            }
+                        }
+                    },
+                    "required": ["document_id", "format"]
+                }),
+                annotations: None,
+            },
             Tool {
                 name: "extract_text".to_string(),
                 description: Some("Extract all text content from the document".to_string()),
@@ -455,6 +481,56 @@ impl DocxToolsProvider {
                 }),
                 annotations: None,
             },
+            Tool {
+                name: "get_tables".to_string(),
+                description: Some("List tables with dimensions, merges, and cell content".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "list_images".to_string(),
+                description: Some("List images with width/height and alt text".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "list_hyperlinks".to_string(),
+                description: Some("List hyperlinks in the document".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "get_fields_summary".to_string(),
+                description: Some("Summarize Word fields (PAGE, NUMPAGES, TOC) in document and headers/footers".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "strip_personal_info".to_string(),
+                description: Some("Remove personal info from metadata and core.xml (best-effort)".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
             Tool {
                 name: "get_metadata".to_string(),
                 description: Some("Get document metadata".to_string()),
@@ -1148,6 +1224,26 @@ impl DocxToolsProvider {
                     Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
                 }
             },
+            "apply_paragraph_format" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let contains = arguments.get("contains").and_then(|v| v.as_str());
+                let fmt = &arguments["format"];
+                let style = DocxStyle {
+                    font_family: fmt.get("font_family").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    font_size: fmt.get("font_size").and_then(|v| v.as_u64()).map(|v| v as usize),
+                    bold: fmt.get("bold").and_then(|v| v.as_bool()),
+                    italic: fmt.get("italic").and_then(|v| v.as_bool()),
+                    underline: fmt.get("underline").and_then(|v| v.as_bool()),
+                    color: fmt.get("color").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    alignment: fmt.get("alignment").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    line_spacing: fmt.get("line_spacing").and_then(|v| v.as_f64()).map(|v| v as f32),
+                };
+                let mut handler = self.handler.write().unwrap();
+                match handler.apply_paragraph_format(doc_id, contains, style) {
+                    Ok(count) => ToolOutcome::Ok { message: Some(format!("Updated {} paragraph(s)", count)) },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
+                }
+            },
             
             "extract_text" => {
                 let doc_id = arguments["document_id"].as_str().unwrap_or("");
@@ -1158,6 +1254,46 @@ impl DocxToolsProvider {
                     Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                 }
             },
+            "get_tables" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.get_tables_json(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "list_images" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.list_images(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "list_hyperlinks" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.list_hyperlinks(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "get_fields_summary" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.get_fields_summary(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "strip_personal_info" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let mut handler = self.handler.write().unwrap();
+                match handler.strip_personal_info(doc_id) {
+                    Ok(_) => ToolOutcome::Ok { message: Some("Personal info stripped".into()) },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::InternalError, error: e.to_string(), hint: None },
+                }
+            },
             
             "get_metadata" => {
                 let doc_id = arguments["document_id"].as_str().unwrap_or("");