feat(mcp): core author/edit/understand APIs and PDF path polish

- Authoring/editing: paragraph formatting tool; expanded list/table ops kept
- Understanding: get_tables, list_images, list_hyperlinks, get_fields_summary
- Safety: strip_personal_info that scrubs core.xml best-effort
- Tests remain green; pure-Rust + hi-fidelity export path preserved
2025-08-12 23:48:56 +08:00
parent 0e43a50ad2
commit 3b05711e92
2 changed files with 295 additions and 0 deletions
@@ -628,6 +628,165 @@ impl DocxHandler {
            .map(|m| m.clone())
    }
    /// Overlay `new_format` onto the style of every selected paragraph op of `doc_id`.
    ///
    /// A paragraph is selected when its text contains the `contains` substring;
    /// `None` selects every paragraph. Only the fields that are `Some` in
    /// `new_format` replace the paragraph's current values; every other field is
    /// left as it was. A paragraph that had no style starts from an all-`None` one.
    ///
    /// Returns the number of paragraphs that were selected. `write_docx` is invoked
    /// only when that number is non-zero.
    ///
    /// # Errors
    /// Fails when `ensure_modifiable` rejects the document, when the document has
    /// no in-memory ops, or when `write_docx` fails.
    pub fn apply_paragraph_format(
        &mut self,
        doc_id: &str,
        contains: Option<&str>,
        new_format: DocxStyle,
    ) -> Result<usize> {
        self.ensure_modifiable(doc_id)?;
        let ops = self.in_memory_ops.get_mut(doc_id)
            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;

        let mut touched = 0usize;
        for op in ops.iter_mut() {
            // Only paragraph ops carry a style that can be reformatted.
            let (text, style) = match op {
                DocxOp::Paragraph { text, style } => (text, style),
                _ => continue,
            };
            // Without a selector every paragraph qualifies.
            if let Some(needle) = contains {
                if !text.contains(needle) {
                    continue;
                }
            }

            // Edit the existing style in place, creating an empty one on demand.
            let target = style.get_or_insert_with(|| DocxStyle {
                font_family: None,
                font_size: None,
                bold: None,
                italic: None,
                underline: None,
                color: None,
                alignment: None,
                line_spacing: None,
            });

            // Provided values win; absent ones keep whatever the paragraph had.
            if let Some(family) = &new_format.font_family {
                target.font_family = Some(family.clone());
            }
            target.font_size = new_format.font_size.or(target.font_size);
            target.bold = new_format.bold.or(target.bold);
            target.italic = new_format.italic.or(target.italic);
            target.underline = new_format.underline.or(target.underline);
            if let Some(color) = &new_format.color {
                target.color = Some(color.clone());
            }
            if let Some(alignment) = &new_format.alignment {
                target.alignment = Some(alignment.clone());
            }
            target.line_spacing = new_format.line_spacing.or(target.line_spacing);

            touched += 1;
        }

        // Skip the write entirely when nothing matched.
        if touched != 0 {
            self.write_docx(doc_id)?;
        }
        Ok(touched)
    }
    /// Describe every table op of `doc_id` as JSON.
    ///
    /// The result has the shape `{"tables": [...]}`. Each entry carries:
    /// - `index`: the position of the table in the document's op list (not a
    ///   table-only ordinal),
    /// - `rows`: the number of rows,
    /// - `cols`: the length of the first row (`0` for a table without rows),
    /// - `col_widths`, `merges`, `cells`: the table's stored column widths,
    ///   merges and row data, serialized as-is.
    ///
    /// # Errors
    /// Fails when the document has no in-memory ops.
    pub fn get_tables_json(&self, doc_id: &str) -> Result<serde_json::Value> {
        let ops = self.in_memory_ops.get(doc_id)
            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;

        let tables: Vec<serde_json::Value> = ops
            .iter()
            .enumerate()
            .filter_map(|(op_index, op)| match op {
                DocxOp::Table { data } => {
                    let row_count = data.rows.len();
                    let col_count = data.rows.first().map_or(0, |row| row.len());
                    Some(serde_json::json!({
                        "index": op_index,
                        "rows": row_count,
                        "cols": col_count,
                        "col_widths": data.col_widths,
                        "merges": data.merges,
                        "cells": data.rows,
                    }))
                }
                _ => None,
            })
            .collect();

        Ok(serde_json::json!({ "tables": tables }))
    }
    /// Describe every image op of `doc_id` as JSON.
    ///
    /// The result has the shape `{"images": [...]}`; each entry holds the op's
    /// position in the document's op list (`index`) together with its `width`,
    /// `height` and `alt_text`.
    ///
    /// # Errors
    /// Fails when the document has no in-memory ops.
    pub fn list_images(&self, doc_id: &str) -> Result<serde_json::Value> {
        let ops = self.in_memory_ops.get(doc_id)
            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;

        let images: Vec<serde_json::Value> = ops
            .iter()
            .enumerate()
            .filter_map(|(op_index, op)| match op {
                DocxOp::Image { width, height, alt_text, .. } => Some(
                    serde_json::json!({"index": op_index, "width": width, "height": height, "alt_text": alt_text}),
                ),
                _ => None,
            })
            .collect();

        Ok(serde_json::json!({"images": images}))
    }
    /// Describe every hyperlink op of `doc_id` as JSON.
    ///
    /// The result has the shape `{"hyperlinks": [...]}`; each entry holds the op's
    /// position in the document's op list (`index`) together with its `text` and
    /// `url`. Only hyperlinks recorded as in-memory ops are reported.
    ///
    /// # Errors
    /// Fails when the document has no in-memory ops.
    pub fn list_hyperlinks(&self, doc_id: &str) -> Result<serde_json::Value> {
        let ops = self.in_memory_ops.get(doc_id)
            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;

        let links: Vec<serde_json::Value> = ops
            .iter()
            .enumerate()
            .filter_map(|(op_index, op)| match op {
                DocxOp::Hyperlink { text, url } => {
                    Some(serde_json::json!({"index": op_index, "text": text, "url": url}))
                }
                _ => None,
            })
            .collect();

        Ok(serde_json::json!({"hyperlinks": links}))
    }
    /// Summarize field instructions found in the document body and in header/footer parts (best-effort).
    ///
    /// Opens the `.docx` at the document's stored path, scans `word/document.xml`
    /// plus every `word/header*.xml` / `word/footer*.xml` entry for
    /// `<w:instrText>` elements, and returns `{"fields": [...]}` where each entry
    /// has:
    /// - `part`: the archive entry the instruction was found in,
    /// - `instruction`: the trimmed text of the `<w:instrText>` element,
    /// - `kind`: `"TOC"`, `"PAGE"`, `"NUMPAGES"` or `"OTHER"`.
    ///
    /// `kind` is decided by the leading field-name token of the instruction
    /// (compared case-insensitively), so `NUMPAGES` is reported as `NUMPAGES`
    /// and names that merely contain `PAGE` (e.g. `PAGEREF`) fall under `OTHER`.
    ///
    /// Parts that cannot be opened from the archive are skipped silently.
    ///
    /// # Errors
    /// Fails when the document id is unknown, when the file cannot be opened or
    /// is not a readable zip archive, or when a part cannot be read as UTF-8 text.
    pub fn get_fields_summary(&self, doc_id: &str) -> Result<serde_json::Value> {
        use std::io::Read as _;

        let metadata = self.documents.get(doc_id)
            .ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
        let src_file = std::fs::File::open(&metadata.path)?;
        let mut archive = ZipArchive::new(src_file)?;

        // The main body is always probed; headers/footers are discovered by name.
        let mut parts = vec!["word/document.xml".to_string()];
        for i in 0..archive.len() {
            let name = archive.by_index(i)?.name().to_string();
            if (name.starts_with("word/header") || name.starts_with("word/footer")) && name.ends_with(".xml") {
                parts.push(name);
            }
        }

        // Compile once instead of once per scanned part.
        let instr_re = regex::Regex::new(r"<w:instrText[\s\S]*?>([\s\S]*?)</w:instrText>")?;

        let mut fields: Vec<serde_json::Value> = Vec::new();
        for part in parts {
            let mut xml = String::new();
            match archive.by_name(&part) {
                Ok(mut entry) => {
                    entry.read_to_string(&mut xml)?;
                }
                // Best-effort: a missing or unreadable entry is simply not reported.
                Err(_) => continue,
            }

            for cap in instr_re.captures_iter(&xml) {
                let instr = cap.get(1).map_or("", |m| m.as_str().trim());
                // Classify by the field name itself. A substring test cannot tell
                // NUMPAGES from PAGE because the former contains the latter.
                let field_name = instr.split_whitespace().next().unwrap_or("");
                let kind = if field_name.eq_ignore_ascii_case("TOC") {
                    "TOC"
                } else if field_name.eq_ignore_ascii_case("NUMPAGES") {
                    "NUMPAGES"
                } else if field_name.eq_ignore_ascii_case("PAGE") {
                    "PAGE"
                } else {
                    "OTHER"
                };
                fields.push(serde_json::json!({"part": part, "instruction": instr, "kind": kind}));
            }
        }
        Ok(serde_json::json!({"fields": fields}))
    }
    /// Remove personal info (best-effort): clear in-memory metadata and scrub `docProps/core.xml` if present.
    ///
    /// Steps:
    /// 1. Clears `author`, `title` and `subject` on the stored metadata. This
    ///    happens before any file I/O, so it takes effect even if a later step
    ///    fails.
    /// 2. Reads `docProps/core.xml` from the `.docx` at the stored path and
    ///    empties the `dc:creator`, `cp:lastModifiedBy`, `dc:title` and
    ///    `dc:subject` elements. If the entry cannot be opened the function
    ///    returns `Ok(())` without touching the file.
    /// 3. Rewrites the archive into a sibling `*.docx.tmp` file, keeping each
    ///    entry's original compression method, and renames it over the original.
    ///    The temporary file is removed if rewriting or renaming fails.
    ///
    /// The scrub is regex-based rather than a real XML rewrite; other properties
    /// in `core.xml` are left as they are.
    ///
    /// # Errors
    /// Fails when the document id is unknown, or on any I/O / zip error while
    /// reading or rewriting the archive.
    pub fn strip_personal_info(&mut self, doc_id: &str) -> Result<()> {
        use std::io::{Read as _, Write as _};

        const CORE_PART: &str = "docProps/core.xml";
        const SCRUBBED_TAGS: [&str; 4] = ["dc:creator", "cp:lastModifiedBy", "dc:title", "dc:subject"];

        let meta = self.documents.get_mut(doc_id)
            .ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
        meta.author = None;
        meta.title = None;
        meta.subject = None;

        let src_file = std::fs::File::open(&meta.path)?;
        let mut archive = ZipArchive::new(src_file)?;

        // Build the scrubbed core.xml; without that entry there is nothing to rewrite.
        let core_xml = match archive.by_name(CORE_PART) {
            Ok(mut part) => {
                let mut xml = String::new();
                part.read_to_string(&mut xml)?;
                for tag in SCRUBBED_TAGS.iter() {
                    // `(?s)` lets the value span lines; the optional group keeps any
                    // attributes on the opening tag so the element stays well-formed.
                    let pattern = format!(r"(?s)<{0}(\s[^>]*)?>.*?</{0}>", tag);
                    let replacement = format!("<{0}${{1}}></{0}>", tag);
                    xml = regex::Regex::new(&pattern)?
                        .replace_all(&xml, replacement.as_str())
                        .into_owned();
                }
                xml
            }
            Err(_) => return Ok(()),
        };

        // Repack into a temp file next to the original, then swap it in.
        let temp_path = meta.path.with_extension("docx.tmp");
        let repacked = (|| -> Result<()> {
            let dst_file = std::fs::File::create(&temp_path)?;
            let mut writer = ZipWriter::new(dst_file);
            for i in 0..archive.len() {
                let mut entry = archive.by_index(i)?;
                let name = entry.name().to_string();
                let is_core = name == CORE_PART;
                // Keep the entry's own compression method instead of storing
                // everything uncompressed.
                let options = FileOptions::default().compression_method(entry.compression());
                writer.start_file(name, options)?;
                if is_core {
                    writer.write_all(core_xml.as_bytes())?;
                } else {
                    std::io::copy(&mut entry, &mut writer)?;
                }
            }
            writer.finish()?;
            Ok(())
        })();

        // Close the source before replacing it.
        drop(archive);

        let outcome = repacked
            .and_then(|()| std::fs::rename(&temp_path, &meta.path).map_err(Into::into));
        if outcome.is_err() {
            // Do not leave a half-written temp file behind; the original is untouched.
            let _ = std::fs::remove_file(&temp_path);
        }
        outcome
    }
    /// Update document core properties stored in our metadata (best-effort)
    pub fn set_document_properties(
        &mut self,
@@ -440,6 +440,32 @@ impl DocxToolsProvider {
                }),
                annotations: None,
            },
            Tool {
                name: "apply_paragraph_format".to_string(),
                description: Some("Apply paragraph formatting to paragraphs matching a simple selector".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {
                        "document_id": {"type": "string"},
                        "contains": {"type": "string", "description": "Substring to match in paragraph text"},
                        "format": {
                            "type": "object",
                            "properties": {
                                "font_family": {"type": "string"},
                                "font_size": {"type": "integer"},
                                "bold": {"type": "boolean"},
                                "italic": {"type": "boolean"},
                                "underline": {"type": "boolean"},
                                "color": {"type": "string"},
                                "alignment": {"type": "string"},
                                "line_spacing": {"type": "number"}
                            }
                        }
                    },
                    "required": ["document_id", "format"]
                }),
                annotations: None,
            },
            Tool {
                name: "extract_text".to_string(),
                description: Some("Extract all text content from the document".to_string()),
@@ -455,6 +481,56 @@ impl DocxToolsProvider {
                }),
                annotations: None,
            },
            Tool {
                name: "get_tables".to_string(),
                description: Some("List tables with dimensions, merges, and cell content".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {"document_id": {"type": "string"}},
                    "required": ["document_id"]
                }),
                annotations: None,
            },
            Tool {
                name: "list_images".to_string(),
                description: Some("List images with width/height and alt text".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {"document_id": {"type": "string"}},
                    "required": ["document_id"]
                }),
                annotations: None,
            },
            Tool {
                name: "list_hyperlinks".to_string(),
                description: Some("List hyperlinks in the document".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {"document_id": {"type": "string"}},
                    "required": ["document_id"]
                }),
                annotations: None,
            },
            Tool {
                name: "get_fields_summary".to_string(),
                description: Some("Summarize Word fields (PAGE, NUMPAGES, TOC) in document and headers/footers".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {"document_id": {"type": "string"}},
                    "required": ["document_id"]
                }),
                annotations: None,
            },
            Tool {
                name: "strip_personal_info".to_string(),
                description: Some("Remove personal info from metadata and core.xml (best-effort)".to_string()),
                input_schema: json!({
                    "type": "object",
                    "properties": {"document_id": {"type": "string"}},
                    "required": ["document_id"]
                }),
                annotations: None,
            },
            Tool {
                name: "get_metadata".to_string(),
                description: Some("Get document metadata".to_string()),
@@ -1148,6 +1224,26 @@ impl DocxToolsProvider {
                    Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
                }
            },
            // Tool: apply_paragraph_format — builds a DocxStyle from the `format`
            // argument and applies it to the paragraphs selected by `contains`.
            "apply_paragraph_format" => {
                // Falls back to an empty id when `document_id` is absent or not a string.
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                // Optional selector; `None` is passed through when it is absent or not a string.
                let contains = arguments.get("contains").and_then(|v| v.as_str());
                let fmt = &arguments["format"];
                // Each style field is set only when the matching key is present with
                // the expected JSON type; anything else leaves the field as `None`.
                let style = DocxStyle {
                    font_family: fmt.get("font_family").and_then(|v| v.as_str()).map(|s| s.to_string()),
                    // `as_u64` yields `None` for non-u64 numbers; the result is then cast to usize.
                    font_size: fmt.get("font_size").and_then(|v| v.as_u64()).map(|v| v as usize),
                    bold: fmt.get("bold").and_then(|v| v.as_bool()),
                    italic: fmt.get("italic").and_then(|v| v.as_bool()),
                    underline: fmt.get("underline").and_then(|v| v.as_bool()),
                    color: fmt.get("color").and_then(|v| v.as_str()).map(|s| s.to_string()),
                    alignment: fmt.get("alignment").and_then(|v| v.as_str()).map(|s| s.to_string()),
                    // Narrowed from f64 to f32.
                    line_spacing: fmt.get("line_spacing").and_then(|v| v.as_f64()).map(|v| v as f32),
                };
                // Write access is needed because the handler mutates the document.
                // NOTE(review): `unwrap` panics if acquiring the lock fails — confirm that is acceptable here.
                let mut handler = self.handler.write().unwrap();
                match handler.apply_paragraph_format(doc_id, contains, style) {
                    Ok(count) => ToolOutcome::Ok { message: Some(format!("Updated {} paragraph(s)", count)) },
                    // Every handler failure is reported under the ValidationError code.
                    Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
                }
            },
            "extract_text" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
@@ -1158,6 +1254,46 @@ impl DocxToolsProvider {
                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                }
            },
            // Tool: get_tables — returns the handler's table listing as metadata.
            "get_tables" => {
                // Falls back to an empty id when `document_id` is absent or not a string.
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                let handler = self.handler.read().unwrap();
                match handler.get_tables_json(doc_id) {
                    Ok(json) => ToolOutcome::Metadata { metadata: json },
                    // Every handler failure is reported under the DocNotFound code.
                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                }
            },
            // Tool: list_images — returns the handler's image listing as metadata.
            "list_images" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                let handler = self.handler.read().unwrap();
                match handler.list_images(doc_id) {
                    Ok(json) => ToolOutcome::Metadata { metadata: json },
                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                }
            },
            // Tool: list_hyperlinks — returns the handler's hyperlink listing as metadata.
            "list_hyperlinks" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                let handler = self.handler.read().unwrap();
                match handler.list_hyperlinks(doc_id) {
                    Ok(json) => ToolOutcome::Metadata { metadata: json },
                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                }
            },
            // Tool: get_fields_summary — returns the handler's field summary as metadata.
            "get_fields_summary" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                let handler = self.handler.read().unwrap();
                match handler.get_fields_summary(doc_id) {
                    Ok(json) => ToolOutcome::Metadata { metadata: json },
                    // Note: failures to open or read the file also surface as DocNotFound here.
                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                }
            },
            // Tool: strip_personal_info — takes the write lock because the handler
            // mutates metadata and may rewrite the file on disk.
            "strip_personal_info" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");
                let mut handler = self.handler.write().unwrap();
                match handler.strip_personal_info(doc_id) {
                    Ok(_) => ToolOutcome::Ok { message: Some("Personal info stripped".into()) },
                    // Every handler failure (including an unknown document id) is reported as InternalError.
                    Err(e) => ToolOutcome::Error { code: ErrorCode::InternalError, error: e.to_string(), hint: None },
                }
            },
            "get_metadata" => {
                let doc_id = arguments["document_id"].as_str().unwrap_or("");