From 3b05711e920d7974653356e2121d29889b8f863f Mon Sep 17 00:00:00 2001
From: Andy
Date: Tue, 12 Aug 2025 23:48:56 +0800
Subject: [PATCH] feat(mcp): core author/edit/understand APIs and PDF path polish

- Authoring/editing: paragraph formatting tool; expanded list/table ops kept
- Understanding: get_tables, list_images, list_hyperlinks, get_fields_summary
- Safety: strip_personal_info that scrubs core.xml best-effort
- Tests remain green; pure-Rust + hi-fidelity export path preserved
---
Reviewer notes (text in this position is ignored by git-am and is not part of
the commit message):

* get_fields_summary: the NUMPAGES test now runs before the PAGE test.
  "NUMPAGES" contains "PAGE", so with the previous order a NUMPAGES field was
  always reported as PAGE and the NUMPAGES branch could never be taken.
* The generic arguments on Result / Vec / Option in the added handler methods
  were missing from the received copy of this patch. They are restored from
  the values each function builds and returns (Ok(updated) with 0usize,
  Ok(serde_json::json!(..)), fields.push(serde_json::json!(..)),
  core_xml = Some(xml)).
* NOTE(review): the regex literals in get_fields_summary and
  strip_personal_info, and the replacement strings in strip_personal_info,
  look as if XML element names were lost from them in the same way
  (r"([\s\S]*?)", r".*?", ""). As received they match the empty string at
  every position. They are left untouched here; restore them from the
  repository before applying.
* NOTE(review): the From: header carries no e-mail address; confirm the
  author identity before running git-am.
* NOTE(review): leading whitespace on every diff line was rebuilt by
  convention. Apply with `git apply --ignore-whitespace`, or regenerate the
  patch from the commit.

 src/docx_handler.rs | 159 ++++++++++++++++++++++++++++++++++++++++++++
 src/docx_tools.rs   | 136 +++++++++++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+)

diff --git a/src/docx_handler.rs b/src/docx_handler.rs
index cc84d18..63e3184 100644
--- a/src/docx_handler.rs
+++ b/src/docx_handler.rs
@@ -628,6 +628,165 @@ impl DocxHandler {
             .map(|m| m.clone())
     }
 
+    /// Update paragraph formatting for paragraphs matching the selector (currently supports substring match)
+    pub fn apply_paragraph_format(
+        &mut self,
+        doc_id: &str,
+        contains: Option<&str>,
+        new_format: DocxStyle,
+    ) -> Result<usize> {
+        self.ensure_modifiable(doc_id)?;
+        let ops = self.in_memory_ops.get_mut(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let mut updated = 0usize;
+        for op in ops.iter_mut() {
+            if let DocxOp::Paragraph { text, style } = op {
+                if contains.map(|needle| text.contains(needle)).unwrap_or(true) {
+                    // Merge properties; prefer provided values over existing
+                    let mut merged = style.clone().unwrap_or(DocxStyle {
+                        font_family: None, font_size: None, bold: None, italic: None, underline: None,
+                        color: None, alignment: None, line_spacing: None,
+                    });
+                    if new_format.font_family.is_some() { merged.font_family = new_format.font_family.clone(); }
+                    if new_format.font_size.is_some() { merged.font_size = new_format.font_size; }
+                    if new_format.bold.is_some() { merged.bold = new_format.bold; }
+                    if new_format.italic.is_some() { merged.italic = new_format.italic; }
+                    if new_format.underline.is_some() { merged.underline = new_format.underline; }
+                    if new_format.color.is_some() { merged.color = new_format.color.clone(); }
+                    if new_format.alignment.is_some() { merged.alignment = new_format.alignment.clone(); }
+                    if new_format.line_spacing.is_some() { merged.line_spacing = new_format.line_spacing; }
+                    *style = Some(merged);
+                    updated += 1;
+                }
+            }
+        }
+        if updated > 0 { self.write_docx(doc_id)?; }
+        Ok(updated)
+    }
+
+    /// List tables with resolved merges and sizes
+    pub fn get_tables_json(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let mut tables = Vec::new();
+        for (ti, op) in ops.iter().enumerate() {
+            if let DocxOp::Table { data } = op {
+                let rows = data.rows.len();
+                let cols = data.rows.first().map(|r| r.len()).unwrap_or(0);
+                tables.push(serde_json::json!({
+                    "index": ti,
+                    "rows": rows,
+                    "cols": cols,
+                    "col_widths": data.col_widths,
+                    "merges": data.merges,
+                    "cells": data.rows,
+                }));
+            }
+        }
+        Ok(serde_json::json!({ "tables": tables }))
+    }
+
+    /// List images with basic metadata
+    pub fn list_images(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let mut images = Vec::new();
+        for (i, op) in ops.iter().enumerate() {
+            if let DocxOp::Image { width, height, alt_text, .. } = op {
+                images.push(serde_json::json!({"index": i, "width": width, "height": height, "alt_text": alt_text}));
+            }
+        }
+        Ok(serde_json::json!({"images": images}))
+    }
+
+    /// List hyperlinks present in the in-memory ops
+    pub fn list_hyperlinks(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let ops = self.in_memory_ops.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
+        let mut links = Vec::new();
+        for (i, op) in ops.iter().enumerate() {
+            if let DocxOp::Hyperlink { text, url } = op {
+                links.push(serde_json::json!({"index": i, "text": text, "url": url}));
+            }
+        }
+        Ok(serde_json::json!({"hyperlinks": links}))
+    }
+
+    /// Summarize fields from document and header/footer XML (best-effort)
+    pub fn get_fields_summary(&self, doc_id: &str) -> Result<serde_json::Value> {
+        let metadata = self.documents.get(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
+        let src_file = std::fs::File::open(&metadata.path)?;
+        let mut archive = ZipArchive::new(src_file)?;
+        let mut parts = vec!["word/document.xml".to_string()];
+        for i in 0..archive.len() {
+            let name = archive.by_index(i)?.name().to_string();
+            if (name.starts_with("word/header") || name.starts_with("word/footer")) && name.ends_with(".xml") {
+                parts.push(name);
+            }
+        }
+        let mut fields: Vec<serde_json::Value> = Vec::new();
+        for part in parts {
+            if let Ok(mut f) = archive.by_name(&part) {
+                let mut xml = String::new();
+                use std::io::Read as _;
+                f.read_to_string(&mut xml)?;
+                for cap in regex::Regex::new(r"([\s\S]*?)")?.captures_iter(&xml) {
+                    let instr = cap.get(1).map(|m| m.as_str().trim()).unwrap_or("").to_string();
+                    let kind = if instr.contains("TOC") { "TOC" } else if instr.contains("NUMPAGES") { "NUMPAGES" } else if instr.contains("PAGE") { "PAGE" } else { "OTHER" };
+                    fields.push(serde_json::json!({"part": part, "instruction": instr, "kind": kind}));
+                }
+            }
+        }
+        Ok(serde_json::json!({"fields": fields}))
+    }
+
+    /// Remove personal info (best-effort): clear in-memory metadata and scrub core.xml if present
+    pub fn strip_personal_info(&mut self, doc_id: &str) -> Result<()> {
+        let meta = self.documents.get_mut(doc_id)
+            .ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
+        meta.author = None; meta.title = None; meta.subject = None;
+        // Try to scrub docProps/core.xml
+        let src_file = std::fs::File::open(&meta.path)?;
+        let mut archive = ZipArchive::new(src_file)?;
+        let mut core_xml: Option<String> = None;
+        if let Ok(mut f) = archive.by_name("docProps/core.xml") {
+            let mut xml = String::new();
+            use std::io::Read as _;
+            f.read_to_string(&mut xml)?;
+            // crude replacements
+            xml = regex::Regex::new(r".*?")?.replace_all(&xml, "").into_owned();
+            xml = regex::Regex::new(r".*?")?.replace_all(&xml, "").into_owned();
+            xml = regex::Regex::new(r".*?")?.replace_all(&xml, "").into_owned();
+            xml = regex::Regex::new(r".*?")?.replace_all(&xml, "").into_owned();
+            core_xml = Some(xml);
+        }
+        if core_xml.is_none() { return Ok(()); }
+        // Repack archive with updated core.xml
+        let src_file = std::fs::File::open(&meta.path)?;
+        let mut archive = ZipArchive::new(src_file)?;
+        let temp_path = meta.path.with_extension("docx.tmp");
+        let dst_file = std::fs::File::create(&temp_path)?;
+        let mut writer = ZipWriter::new(dst_file);
+        let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
+        for i in 0..archive.len() {
+            let mut file = archive.by_index(i)?;
+            let name = file.name().to_string();
+            use std::io::{Read as _, Write as _};
+            writer.start_file(name.clone(), options)?;
+            if name == "docProps/core.xml" {
+                writer.write_all(core_xml.as_ref().unwrap().as_bytes())?;
+            } else {
+                let mut buf = Vec::new();
+                file.read_to_end(&mut buf)?;
+                writer.write_all(&buf)?;
+            }
+        }
+        writer.finish()?;
+        std::fs::rename(&temp_path, &meta.path)?;
+        Ok(())
+    }
+
     /// Update document core properties stored in our metadata (best-effort)
     pub fn set_document_properties(
         &mut self,
diff --git a/src/docx_tools.rs b/src/docx_tools.rs
index 017ccec..d516b8e 100644
--- a/src/docx_tools.rs
+++ b/src/docx_tools.rs
@@ -440,6 +440,32 @@ impl DocxToolsProvider {
                 }),
                 annotations: None,
             },
+            Tool {
+                name: "apply_paragraph_format".to_string(),
+                description: Some("Apply paragraph formatting to paragraphs matching a simple selector".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {
+                        "document_id": {"type": "string"},
+                        "contains": {"type": "string", "description": "Substring to match in paragraph text"},
+                        "format": {
+                            "type": "object",
+                            "properties": {
+                                "font_family": {"type": "string"},
+                                "font_size": {"type": "integer"},
+                                "bold": {"type": "boolean"},
+                                "italic": {"type": "boolean"},
+                                "underline": {"type": "boolean"},
+                                "color": {"type": "string"},
+                                "alignment": {"type": "string"},
+                                "line_spacing": {"type": "number"}
+                            }
+                        }
+                    },
+                    "required": ["document_id", "format"]
+                }),
+                annotations: None,
+            },
             Tool {
                 name: "extract_text".to_string(),
                 description: Some("Extract all text content from the document".to_string()),
@@ -455,6 +481,56 @@ impl DocxToolsProvider {
                 }),
                 annotations: None,
             },
+            Tool {
+                name: "get_tables".to_string(),
+                description: Some("List tables with dimensions, merges, and cell content".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "list_images".to_string(),
+                description: Some("List images with width/height and alt text".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "list_hyperlinks".to_string(),
+                description: Some("List hyperlinks in the document".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "get_fields_summary".to_string(),
+                description: Some("Summarize Word fields (PAGE, NUMPAGES, TOC) in document and headers/footers".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
+            Tool {
+                name: "strip_personal_info".to_string(),
+                description: Some("Remove personal info from metadata and core.xml (best-effort)".to_string()),
+                input_schema: json!({
+                    "type": "object",
+                    "properties": {"document_id": {"type": "string"}},
+                    "required": ["document_id"]
+                }),
+                annotations: None,
+            },
             Tool {
                 name: "get_metadata".to_string(),
                 description: Some("Get document metadata".to_string()),
@@ -1148,6 +1224,26 @@ impl DocxToolsProvider {
                     Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
                 }
             },
+            "apply_paragraph_format" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let contains = arguments.get("contains").and_then(|v| v.as_str());
+                let fmt = &arguments["format"];
+                let style = DocxStyle {
+                    font_family: fmt.get("font_family").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    font_size: fmt.get("font_size").and_then(|v| v.as_u64()).map(|v| v as usize),
+                    bold: fmt.get("bold").and_then(|v| v.as_bool()),
+                    italic: fmt.get("italic").and_then(|v| v.as_bool()),
+                    underline: fmt.get("underline").and_then(|v| v.as_bool()),
+                    color: fmt.get("color").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    alignment: fmt.get("alignment").and_then(|v| v.as_str()).map(|s| s.to_string()),
+                    line_spacing: fmt.get("line_spacing").and_then(|v| v.as_f64()).map(|v| v as f32),
+                };
+                let mut handler = self.handler.write().unwrap();
+                match handler.apply_paragraph_format(doc_id, contains, style) {
+                    Ok(count) => ToolOutcome::Ok { message: Some(format!("Updated {} paragraph(s)", count)) },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::ValidationError, error: e.to_string(), hint: None },
+                }
+            },
             "extract_text" => {
                 let doc_id = arguments["document_id"].as_str().unwrap_or("");
 
@@ -1158,6 +1254,46 @@ impl DocxToolsProvider {
                     Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
                 }
             },
+            "get_tables" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.get_tables_json(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "list_images" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.list_images(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "list_hyperlinks" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.list_hyperlinks(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "get_fields_summary" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let handler = self.handler.read().unwrap();
+                match handler.get_fields_summary(doc_id) {
+                    Ok(json) => ToolOutcome::Metadata { metadata: json },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::DocNotFound, error: e.to_string(), hint: None },
+                }
+            },
+            "strip_personal_info" => {
+                let doc_id = arguments["document_id"].as_str().unwrap_or("");
+                let mut handler = self.handler.write().unwrap();
+                match handler.strip_personal_info(doc_id) {
+                    Ok(_) => ToolOutcome::Ok { message: Some("Personal info stripped".into()) },
+                    Err(e) => ToolOutcome::Error { code: ErrorCode::InternalError, error: e.to_string(), hint: None },
+                }
+            },
             "get_metadata" => {
                 let doc_id = arguments["document_id"].as_str().unwrap_or("");
 