diff --git a/src/docx_handler.rs b/src/docx_handler.rs index 47cbbec..09f1910 100644 --- a/src/docx_handler.rs +++ b/src/docx_handler.rs @@ -306,6 +306,45 @@ impl DocxHandler { pub fn list_documents(&self) -> Vec { self.documents.values().cloned().collect() } + + pub fn temp_dir_path(&self) -> PathBuf { + self.temp_dir.clone() + } + + pub fn get_storage_info(&self) -> Result { + use std::time::{SystemTime, UNIX_EPOCH}; + let mut total_bytes: u64 = 0; + let mut file_count: u64 = 0; + let mut oldest: Option = None; + let mut newest: Option = None; + if self.temp_dir.exists() { + for entry in walkdir::WalkDir::new(&self.temp_dir).into_iter().filter_map(|e| e.ok()) { + if entry.file_type().is_file() { + file_count += 1; + if let Ok(meta) = entry.metadata() { + total_bytes = total_bytes.saturating_add(meta.len()); + if let Ok(modified) = meta.modified() { + if let Ok(secs) = modified.duration_since(UNIX_EPOCH) { + let ts = secs.as_secs(); + oldest = Some(oldest.map_or(ts, |o| o.min(ts))); + newest = Some(newest.map_or(ts, |n| n.max(ts))); + } + } + } + } + } + } + Ok(serde_json::json!({ + "success": true, + "storage": { + "base_dir": self.temp_dir, + "file_count": file_count, + "total_bytes": total_bytes, + "oldest_modified": oldest, + "newest_modified": newest, + } + })) + } } #[derive(Debug, Clone)] diff --git a/src/docx_tools.rs b/src/docx_tools.rs index 1b24027..8aada73 100644 --- a/src/docx_tools.rs +++ b/src/docx_tools.rs @@ -2,7 +2,7 @@ use mcp_core::types::{Tool, CallToolResponse, ToolResponseContent, TextContent}; // Adapt to latest MCP: we'll integrate via mcp-server Router separately use serde_json::{json, Value}; use std::path::PathBuf; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, RwLock}; use tracing::{debug, info}; use crate::docx_handler::{DocxHandler, DocxStyle, TableData}; @@ -13,7 +13,7 @@ use crate::security::{SecurityConfig, SecurityMiddleware}; #[derive(Clone)] pub struct DocxToolsProvider { - handler: Arc>, + handler: Arc>, converter: Arc, #[cfg(feature = "advanced-docx")] advanced: Arc, @@ -28,7 +28,7 @@ impl DocxToolsProvider { pub fn new_with_security(security_config: SecurityConfig) -> Self { Self { - handler: Arc::new(Mutex::new(DocxHandler::new().expect("Failed to create DocxHandler"))), + handler: Arc::new(RwLock::new(DocxHandler::new().expect("Failed to create DocxHandler"))), converter: Arc::new(DocumentConverter::new()), #[cfg(feature = "advanced-docx")] advanced: Arc::new(AdvancedDocxHandler::new()), @@ -45,7 +45,7 @@ impl DocxToolsProvider { /// Create a provider with a base directory and explicit security config pub fn with_base_dir_and_security>(base_dir: P, security_config: SecurityConfig) -> Self { Self { - handler: Arc::new(Mutex::new(DocxHandler::new_with_base_dir(base_dir).expect("Failed to create DocxHandler"))), + handler: Arc::new(RwLock::new(DocxHandler::new_with_base_dir(base_dir).expect("Failed to create DocxHandler"))), converter: Arc::new(DocumentConverter::new()), #[cfg(feature = "advanced-docx")] advanced: Arc::new(AdvancedDocxHandler::new()), @@ -547,6 +547,16 @@ impl DocxToolsProvider { }), annotations: None, }, + Tool { + name: "get_storage_info".to_string(), + description: Some("Get information about temporary storage usage".to_string()), + input_schema: json!({ + "type": "object", + "properties": {}, + "required": [] + }), + annotations: None, + }, ]; // Filter tools based on security configuration @@ -576,7 +586,7 @@ impl DocxToolsProvider { let result = match name { "create_document" => { - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.create_document() { Ok(doc_id) => json!({ "success": true, @@ -592,7 +602,7 @@ impl DocxToolsProvider { "open_document" => { let path = arguments["path"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.open_document(&PathBuf::from(path)) { Ok(doc_id) => json!({ "success": true, @@ -614,7 +624,7 @@ impl DocxToolsProvider { serde_json::from_value::(s.clone()).ok() }); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.add_paragraph(doc_id, text, style) { Ok(_) => json!({ "success": true, @@ -632,7 +642,7 @@ impl DocxToolsProvider { let text = arguments["text"].as_str().unwrap_or(""); let level = arguments["level"].as_u64().unwrap_or(1) as usize; - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.add_heading(doc_id, text, level) { Ok(_) => json!({ "success": true, @@ -679,7 +689,7 @@ impl DocxToolsProvider { border_style, }; - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.add_table(doc_id, table_data) { Ok(_) => json!({ "success": true, @@ -705,7 +715,7 @@ impl DocxToolsProvider { .and_then(|v| v.as_bool()) .unwrap_or(false); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.add_list(doc_id, items, ordered) { Ok(_) => json!({ "success": true, @@ -722,7 +732,7 @@ impl DocxToolsProvider { "add_page_break" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.add_page_break(doc_id) { Ok(_) => json!({ "success": true, @@ -739,7 +749,7 @@ impl DocxToolsProvider { let doc_id = arguments["document_id"].as_str().unwrap_or(""); let text = arguments["text"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.set_header(doc_id, text) { Ok(_) => json!({ "success": true, @@ -756,7 +766,7 @@ impl DocxToolsProvider { let doc_id = arguments["document_id"].as_str().unwrap_or(""); let text = arguments["text"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.set_footer(doc_id, text) { Ok(_) => json!({ "success": true, @@ -774,7 +784,7 @@ impl DocxToolsProvider { let find_text = arguments["find_text"].as_str().unwrap_or(""); let replace_text = arguments["replace_text"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.find_and_replace(doc_id, find_text, replace_text) { Ok(count) => json!({ "success": true, @@ -791,7 +801,7 @@ impl DocxToolsProvider { "extract_text" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.extract_text(doc_id) { Ok(text) => json!({ "success": true, @@ -807,7 +817,7 @@ impl DocxToolsProvider { "get_metadata" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.get_metadata(doc_id) { Ok(metadata) => json!({ "success": true, @@ -824,7 +834,7 @@ impl DocxToolsProvider { let doc_id = arguments["document_id"].as_str().unwrap_or(""); let output_path = arguments["output_path"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.save_document(doc_id, &PathBuf::from(output_path)) { Ok(_) => json!({ "success": true, @@ -840,7 +850,7 @@ impl DocxToolsProvider { "close_document" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let mut handler = self.handler.lock().unwrap(); + let mut handler = self.handler.write().unwrap(); match handler.close_document(doc_id) { Ok(_) => json!({ "success": true, @@ -854,7 +864,7 @@ impl DocxToolsProvider { }, "list_documents" => { - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); let documents = handler.list_documents(); json!({ "success": true, @@ -866,7 +876,7 @@ impl DocxToolsProvider { let doc_id = arguments["document_id"].as_str().unwrap_or(""); let output_path = arguments["output_path"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); let metadata = match handler.get_metadata(doc_id) { Ok(m) => m, Err(e) => return CallToolResponse { content: vec![ToolResponseContent::Text(TextContent { content_type: "text".into(), text: e.to_string(), annotations: None })], is_error: Some(true), meta: None }, @@ -894,7 +904,7 @@ impl DocxToolsProvider { .and_then(|d| d.as_u64()) .unwrap_or(150) as u32; - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); let metadata = match handler.get_metadata(doc_id) { Ok(m) => m, Err(e) => return CallToolResponse { content: vec![ToolResponseContent::Text(TextContent { content_type: "text".into(), text: e.to_string(), annotations: None })], is_error: Some(true), meta: None }, @@ -927,7 +937,7 @@ impl DocxToolsProvider { "get_document_structure" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.extract_text(doc_id) { Ok(text) => { // Analyze document structure from text @@ -993,7 +1003,7 @@ impl DocxToolsProvider { "get_word_count" => { let doc_id = arguments["document_id"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.extract_text(doc_id) { Ok(text) => { let words: Vec<&str> = text.split_whitespace().collect(); @@ -1028,7 +1038,7 @@ impl DocxToolsProvider { let case_sensitive = arguments.get("case_sensitive").and_then(|v| v.as_bool()).unwrap_or(false); let whole_word = arguments.get("whole_word").and_then(|v| v.as_bool()).unwrap_or(false); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.extract_text(doc_id) { Ok(text) => { let search_text = if case_sensitive { text.clone() } else { text.to_lowercase() }; @@ -1071,7 +1081,7 @@ impl DocxToolsProvider { let doc_id = arguments["document_id"].as_str().unwrap_or(""); let output_path = arguments["output_path"].as_str().unwrap_or(""); - let handler = self.handler.lock().unwrap(); + let handler = self.handler.read().unwrap(); match handler.extract_text(doc_id) { Ok(text) => { // Simple conversion to Markdown - in full implementation would preserve formatting @@ -1132,6 +1142,14 @@ impl DocxToolsProvider { }) }, + "get_storage_info" => { + let handler = self.handler.read().unwrap(); + match handler.get_storage_info() { + Ok(info) => info, + Err(e) => json!({"success": false, "error": e.to_string()}), + } + }, + _ => { json!({ "success": false, diff --git a/src/pure_converter.rs b/src/pure_converter.rs index 1987b16..81d24c1 100644 --- a/src/pure_converter.rs +++ b/src/pure_converter.rs @@ -41,24 +41,53 @@ impl PureRustConverter { anyhow::bail!("No document.xml found in DOCX file"); } - // Parse XML and extract text + // Parse XML and extract text with basic whitespace semantics let doc = roxmltree::Document::parse(&document_xml)?; let mut text = String::new(); - - // Extract text from all w:t elements + let mut last_char: Option = None; + for node in doc.descendants() { - if node.tag_name().name() == "t" { - if let Some(node_text) = node.text() { - text.push_str(node_text); - text.push(' '); + let name = node.tag_name().name(); + match name { + // Paragraph boundary + "p" => { + if !text.ends_with('\n') { + text.push('\n'); + last_char = Some('\n'); + } } - } - // Handle line breaks - if node.tag_name().name() == "br" || node.tag_name().name() == "p" { - text.push('\n'); + // Text run + "t" => { + if let Some(node_text) = node.text() { + // Preserve spaces if xml:space="preserve" + let preserve = node.attribute(("xml", "space")).map(|v| v == "preserve").unwrap_or(false); + let mut content = node_text.to_string(); + if !preserve { + // Collapse internal newlines and excessive spaces + content = content.replace('\n', " "); + } + if !content.is_empty() { + // Insert a space if needed between words + if let Some(c) = last_char { if !c.is_whitespace() && !content.starts_with([' ', '\n', '\t']) { text.push(' '); } } + text.push_str(&content); + last_char = content.chars().rev().next(); + } + } + } + // Line break + "br" => { + text.push('\n'); + last_char = Some('\n'); + } + // Tab + "tab" => { + text.push('\t'); + last_char = Some('\t'); + } + _ => {} } } - + Ok(text.trim().to_string()) } diff --git a/src/security.rs b/src/security.rs index 9fa5a61..7f54638 100644 --- a/src/security.rs +++ b/src/security.rs @@ -311,6 +311,7 @@ impl SecurityConfig { commands.insert("export_to_html"); commands.insert("create_preview"); commands.insert("get_security_info"); + commands.insert("get_storage_info"); commands }