0e43a50ad2
- Features: hi-fidelity-toc, hi-fidelity-bookmarks - Tools: insert_toc, insert_bookmark_after_heading - Write: emit recognizable placeholders and transform to field XML under feature flags - Tests: add golden checks for TOC field injection and bookmark cleanup
1523 lines
65 KiB
Rust
1523 lines
65 KiB
Rust
use anyhow::{Context, Result};
|
|
use docx_rs::*;
|
|
use std::fs::{self, File};
|
|
use std::path::{Path, PathBuf};
|
|
use uuid::Uuid;
|
|
use chrono::{DateTime, Utc};
|
|
use serde::{Deserialize, Serialize};
|
|
use tracing::{info, warn};
|
|
use zip::{ZipArchive, ZipWriter};
|
|
use zip::write::FileOptions;
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct DocxMetadata {
|
|
pub id: String,
|
|
pub path: PathBuf,
|
|
pub created_at: DateTime<Utc>,
|
|
pub modified_at: DateTime<Utc>,
|
|
pub size_bytes: u64,
|
|
pub page_count: Option<usize>,
|
|
pub word_count: Option<usize>,
|
|
pub author: Option<String>,
|
|
pub title: Option<String>,
|
|
pub subject: Option<String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct DocxStyle {
|
|
pub font_family: Option<String>,
|
|
pub font_size: Option<usize>,
|
|
pub bold: Option<bool>,
|
|
pub italic: Option<bool>,
|
|
pub underline: Option<bool>,
|
|
pub color: Option<String>,
|
|
pub alignment: Option<String>,
|
|
pub line_spacing: Option<f32>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct TableData {
|
|
pub rows: Vec<Vec<String>>,
|
|
pub headers: Option<Vec<String>>,
|
|
pub border_style: Option<String>,
|
|
pub col_widths: Option<Vec<u32>>, // approximate column widths (px)
|
|
pub merges: Option<Vec<TableMerge>>, // best-effort merge specs
|
|
pub cell_shading: Option<String>, // hex RGB like "EEEEEE"
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct TableMerge {
|
|
pub row: usize,
|
|
pub col: usize,
|
|
pub row_span: usize,
|
|
pub col_span: usize,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct ImageData {
|
|
pub data: Vec<u8>,
|
|
pub width: Option<u32>,
|
|
pub height: Option<u32>,
|
|
pub alt_text: Option<String>,
|
|
}
|
|
|
|
pub struct DocxHandler {
|
|
temp_dir: PathBuf,
|
|
pub documents: std::collections::HashMap<String, DocxMetadata>,
|
|
// In-memory operations for documents created via this handler
|
|
in_memory_ops: std::collections::HashMap<String, Vec<DocxOp>>,
|
|
}
|
|
|
|
impl DocxHandler {
|
|
pub fn new() -> Result<Self> {
|
|
let base = std::env::var_os("DOCX_MCP_TEMP").map(PathBuf::from).unwrap_or_else(|| std::env::temp_dir());
|
|
let temp_dir = base.join("docx-mcp");
|
|
fs::create_dir_all(&temp_dir)?;
|
|
|
|
Ok(Self {
|
|
temp_dir,
|
|
documents: std::collections::HashMap::new(),
|
|
in_memory_ops: std::collections::HashMap::new(),
|
|
})
|
|
}
|
|
|
|
/// Create a handler that stores temporary documents under the provided base directory
|
|
pub fn new_with_base_dir<P: AsRef<Path>>(base_dir: P) -> Result<Self> {
|
|
let temp_dir = base_dir.as_ref().join("docx-mcp");
|
|
fs::create_dir_all(&temp_dir)?;
|
|
Ok(Self {
|
|
temp_dir,
|
|
documents: std::collections::HashMap::new(),
|
|
in_memory_ops: std::collections::HashMap::new(),
|
|
})
|
|
}
|
|
|
|
#[cfg(test)]
|
|
pub fn new_with_temp_dir(temp_dir: &Path) -> Result<Self> {
|
|
let temp_dir = temp_dir.to_path_buf();
|
|
fs::create_dir_all(&temp_dir)?;
|
|
|
|
Ok(Self {
|
|
temp_dir,
|
|
documents: std::collections::HashMap::new(),
|
|
in_memory_ops: std::collections::HashMap::new(),
|
|
})
|
|
}
|
|
|
|
pub fn create_document(&mut self) -> Result<String> {
|
|
let doc_id = Uuid::new_v4().to_string();
|
|
let doc_path = self.temp_dir.join(format!("{}.docx", doc_id));
|
|
|
|
// Initialize empty document on disk
|
|
if let Some(parent) = doc_path.parent() {
|
|
fs::create_dir_all(parent)
|
|
.with_context(|| format!("Failed to create parent directory for {:?}", doc_path))?;
|
|
}
|
|
let docx = Docx::new();
|
|
let file = File::create(&doc_path)
|
|
.with_context(|| format!("Failed to create DOCX file at {:?}", doc_path))?;
|
|
docx.build().pack(file)
|
|
.with_context(|| format!("Failed to write DOCX package at {:?}", doc_path))?;
|
|
|
|
let metadata = DocxMetadata {
|
|
id: doc_id.clone(),
|
|
path: doc_path,
|
|
created_at: Utc::now(),
|
|
modified_at: Utc::now(),
|
|
size_bytes: 0,
|
|
page_count: Some(1),
|
|
word_count: Some(0),
|
|
author: None,
|
|
title: None,
|
|
subject: None,
|
|
};
|
|
|
|
self.documents.insert(doc_id.clone(), metadata);
|
|
self.in_memory_ops.insert(doc_id.clone(), Vec::new());
|
|
info!("Created new document with ID: {}", doc_id);
|
|
|
|
Ok(doc_id)
|
|
}
|
|
|
|
pub fn open_document(&mut self, path: &Path) -> Result<String> {
|
|
let doc_id = Uuid::new_v4().to_string();
|
|
let doc_path = self.temp_dir.join(format!("{}.docx", doc_id));
|
|
|
|
if let Some(parent) = doc_path.parent() {
|
|
fs::create_dir_all(parent)
|
|
.with_context(|| format!("Failed to create parent directory for {:?}", doc_path))?;
|
|
}
|
|
fs::copy(path, &doc_path)
|
|
.with_context(|| format!("Failed to copy document from {:?}", path))?;
|
|
|
|
let file_metadata = fs::metadata(&doc_path)?;
|
|
|
|
let metadata = DocxMetadata {
|
|
id: doc_id.clone(),
|
|
path: doc_path,
|
|
created_at: Utc::now(),
|
|
modified_at: Utc::now(),
|
|
size_bytes: file_metadata.len(),
|
|
page_count: None,
|
|
word_count: None,
|
|
author: None,
|
|
title: None,
|
|
subject: None,
|
|
};
|
|
|
|
self.documents.insert(doc_id.clone(), metadata);
|
|
info!("Opened document from {:?} with ID: {}", path, doc_id);
|
|
|
|
Ok(doc_id)
|
|
}
|
|
|
|
pub fn add_paragraph(&mut self, doc_id: &str, text: &str, style: Option<DocxStyle>) -> Result<()> {
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Paragraph { text: text.to_string(), style });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added paragraph to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn add_heading(&mut self, doc_id: &str, text: &str, level: usize) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
let heading_style = match level {
|
|
1 => "Heading1",
|
|
2 => "Heading2",
|
|
3 => "Heading3",
|
|
4 => "Heading4",
|
|
5 => "Heading5",
|
|
6 => "Heading6",
|
|
_ => "Heading1",
|
|
};
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Heading { text: text.to_string(), style: heading_style.to_string() });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added heading level {} to document {}", level, doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn add_table(&mut self, doc_id: &str, table_data: TableData) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Table { data: table_data });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added table to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn add_list(&mut self, doc_id: &str, items: Vec<String>, ordered: bool) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::List { items, ordered });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added {} list to document {}", if ordered { "ordered" } else { "unordered" }, doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Add a single list item with an explicit indent level (0-based)
|
|
pub fn add_list_item(&mut self, doc_id: &str, text: &str, level: usize, ordered: bool) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::ListItem { text: text.to_string(), level, ordered });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added list item (level {}) to document {}", level, doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Add an image to the document
|
|
pub fn add_image(&mut self, doc_id: &str, image: ImageData) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
let width = image.width.unwrap_or(100);
|
|
let height = image.height.unwrap_or(100);
|
|
ops.push(DocxOp::Image { data: image.data, width, height, alt_text: image.alt_text });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added image to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Add a hyperlink to the document
|
|
pub fn add_hyperlink(&mut self, doc_id: &str, text: &str, url: &str) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Hyperlink { text: text.to_string(), url: url.to_string() });
|
|
self.write_docx(doc_id)?;
|
|
info!("Added hyperlink to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Insert a section break with optional page setup (best-effort)
|
|
pub fn add_section_break(
|
|
&mut self,
|
|
doc_id: &str,
|
|
page_size: Option<&str>,
|
|
orientation: Option<&str>,
|
|
margins: Option<MarginsSpec>,
|
|
) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::SectionBreak {
|
|
page_size: page_size.map(|s| s.to_string()),
|
|
orientation: orientation.map(|s| s.to_string()),
|
|
margins,
|
|
});
|
|
self.write_docx(doc_id)?;
|
|
info!("Added section break to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Insert a Table of Contents placeholder (post-processed into a TOC field when enabled)
|
|
pub fn insert_toc(&mut self, doc_id: &str, from_level: usize, to_level: usize, right_align_dots: bool) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Toc { from_level, to_level, right_align_dots });
|
|
self.write_docx(doc_id)?;
|
|
Ok(())
|
|
}
|
|
|
|
/// Insert a bookmark immediately after the first heading matching text (best-effort)
|
|
pub fn insert_bookmark_after_heading(&mut self, doc_id: &str, heading_text: &str, name: &str) -> Result<bool> {
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
if let Some(pos) = ops.iter().position(|op| matches!(op, DocxOp::Heading { text: t, .. } if t == heading_text)) {
|
|
ops.insert(pos + 1, DocxOp::BookmarkAfterHeading { heading_text: heading_text.to_string(), name: name.to_string() });
|
|
self.write_docx(doc_id)?;
|
|
return Ok(true);
|
|
}
|
|
Ok(false)
|
|
}
|
|
|
|
pub fn add_page_break(&mut self, doc_id: &str) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::PageBreak);
|
|
self.write_docx(doc_id)?;
|
|
info!("Added page break to document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn set_header(&mut self, doc_id: &str, text: &str) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Header(text.to_string()));
|
|
self.write_docx(doc_id)?;
|
|
info!("Set header for document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn set_footer(&mut self, doc_id: &str, text: &str) -> Result<()> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
ops.push(DocxOp::Footer(text.to_string()));
|
|
self.write_docx(doc_id)?;
|
|
info!("Set footer for document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
/// Convenience: set simple page numbering text in header or footer
|
|
pub fn set_page_numbering(&mut self, doc_id: &str, location: &str, template: Option<&str>) -> Result<()> {
|
|
let text = template.unwrap_or("Page {PAGE} of {PAGES}");
|
|
match location {
|
|
"header" => self.set_header(doc_id, text),
|
|
"footer" => self.set_footer(doc_id, text),
|
|
_ => anyhow::bail!("invalid location: {}", location),
|
|
}
|
|
}
|
|
|
|
/// Attempt to replace placeholder page numbering text in header with Word field codes (PAGE/NUMPAGES)
|
|
/// This is a best-effort, post-processing step that edits the zipped DOCX XML in-place by rebuilding the archive.
|
|
pub fn embed_page_number_fields(&self, doc_id: &str) -> Result<()> {
|
|
let metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
if !metadata.path.exists() {
|
|
anyhow::bail!("Document file missing: {:?}", metadata.path);
|
|
}
|
|
|
|
let src_file = std::fs::File::open(&metadata.path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
|
|
// Prepare buffer to write new archive
|
|
let temp_path = metadata.path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
|
|
let mut did_replace = false;
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
|
|
if (name.starts_with("word/header") || name.starts_with("word/footer")) && name.ends_with(".xml") {
|
|
let mut xml = String::new();
|
|
use std::io::Read as _;
|
|
file.read_to_string(&mut xml)?;
|
|
|
|
if xml.contains("Page {PAGE} of {PAGES}") {
|
|
let field_runs = concat!(
|
|
"Page ",
|
|
"<w:r><w:fldChar w:fldCharType=\"begin\"/></w:r>",
|
|
"<w:r><w:instrText xml:space=\"preserve\"> PAGE </w:instrText></w:r>",
|
|
"<w:r><w:fldChar w:fldCharType=\"end\"/></w:r>",
|
|
" of ",
|
|
"<w:r><w:fldChar w:fldCharType=\"begin\"/></w:r>",
|
|
"<w:r><w:instrText xml:space=\"preserve\"> NUMPAGES </w:instrText></w:r>",
|
|
"<w:r><w:fldChar w:fldCharType=\"end\"/></w:r>"
|
|
);
|
|
xml = xml.replace("Page {PAGE} of {PAGES}", field_runs);
|
|
did_replace = true;
|
|
}
|
|
|
|
writer.start_file(name, options)?;
|
|
use std::io::Write as _;
|
|
writer.write_all(xml.as_bytes())?;
|
|
} else {
|
|
// Copy other file entries verbatim
|
|
writer.start_file(name, options)?;
|
|
use std::io::Read as _;
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
use std::io::Write as _;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
|
|
writer.finish()?;
|
|
|
|
// Replace original archive only if we changed something
|
|
if did_replace {
|
|
std::fs::rename(&temp_path, &metadata.path)?;
|
|
info!("Embedded PAGE/NUMPAGES fields into header for {}", doc_id);
|
|
} else {
|
|
// Cleanup temp
|
|
let _ = std::fs::remove_file(&temp_path);
|
|
info!("No placeholder found to replace for page numbering in {}", doc_id);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
pub fn find_and_replace(&mut self, doc_id: &str, _find_text: &str, _replace_text: &str) -> Result<usize> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
// Note: This is a simplified implementation
|
|
// Real implementation would need to parse the DOCX XML structure
|
|
// and perform replacements while preserving formatting
|
|
|
|
warn!("Find and replace operation requires advanced XML manipulation");
|
|
Ok(0)
|
|
}
|
|
|
|
/// Advanced find and replace over in-memory operations (LLM-friendly), preserving runs
|
|
/// Supports regex, case sensitivity, and whole word boundaries
|
|
pub fn find_and_replace_advanced(
|
|
&mut self,
|
|
doc_id: &str,
|
|
pattern: &str,
|
|
replacement: &str,
|
|
case_sensitive: bool,
|
|
whole_word: bool,
|
|
use_regex: bool,
|
|
) -> Result<usize> {
|
|
use regex::RegexBuilder;
|
|
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
|
|
|
|
// Build regex
|
|
let pattern = if use_regex { pattern.to_string() } else { regex::escape(pattern) };
|
|
let pattern = if whole_word { format!("\\b{}\\b", pattern) } else { pattern };
|
|
let re = RegexBuilder::new(&pattern)
|
|
.case_insensitive(!case_sensitive)
|
|
.build()
|
|
.with_context(|| "Invalid regex pattern")?;
|
|
|
|
let mut total_replacements = 0usize;
|
|
|
|
let mut replace_text = |text: &str| -> (String, usize) {
|
|
let mut count = 0usize;
|
|
let result = re.replace_all(text, |_: ®ex::Captures| {
|
|
count += 1;
|
|
replacement.to_string()
|
|
});
|
|
(result.into_owned(), count)
|
|
};
|
|
|
|
for op in ops.iter_mut() {
|
|
match op {
|
|
DocxOp::Paragraph { text, .. } => {
|
|
let (new_text, cnt) = replace_text(text);
|
|
if cnt > 0 { *text = new_text; total_replacements += cnt; }
|
|
}
|
|
DocxOp::Heading { text, .. } => {
|
|
let (new_text, cnt) = replace_text(text);
|
|
if cnt > 0 { *text = new_text; total_replacements += cnt; }
|
|
}
|
|
DocxOp::List { items, .. } => {
|
|
for item in items.iter_mut() {
|
|
let (new_text, cnt) = replace_text(item);
|
|
if cnt > 0 { *item = new_text; total_replacements += cnt; }
|
|
}
|
|
}
|
|
DocxOp::ListItem { text, .. } => {
|
|
let (new_text, cnt) = replace_text(text);
|
|
if cnt > 0 { *text = new_text; total_replacements += cnt; }
|
|
}
|
|
DocxOp::Table { data } => {
|
|
for row in data.rows.iter_mut() {
|
|
for cell in row.iter_mut() {
|
|
let (new_text, cnt) = replace_text(cell);
|
|
if cnt > 0 { *cell = new_text; total_replacements += cnt; }
|
|
}
|
|
}
|
|
}
|
|
DocxOp::Header(text) | DocxOp::Footer(text) => {
|
|
let (new_text, cnt) = replace_text(text);
|
|
if cnt > 0 { *text = new_text; total_replacements += cnt; }
|
|
}
|
|
DocxOp::Image { .. } | DocxOp::Hyperlink { .. } => {}
|
|
DocxOp::PageBreak => {}
|
|
DocxOp::SectionBreak { .. } => {}
|
|
DocxOp::Toc { .. } => {}
|
|
DocxOp::BookmarkAfterHeading { .. } => {}
|
|
}
|
|
}
|
|
|
|
// Persist changes
|
|
self.write_docx(doc_id)?;
|
|
Ok(total_replacements)
|
|
}
|
|
|
|
/// Analyze document structure using in-memory ops (if available)
|
|
pub fn analyze_structure(&self, doc_id: &str) -> Result<serde_json::Value> {
|
|
let ops = match self.in_memory_ops.get(doc_id) {
|
|
Some(ops) => ops,
|
|
None => {
|
|
// Fallback to text-based outline if ops not available
|
|
let text = self.extract_text(doc_id).unwrap_or_default();
|
|
let mut outline = Vec::new();
|
|
for line in text.lines() {
|
|
let trimmed = line.trim();
|
|
if trimmed.is_empty() { continue; }
|
|
if trimmed.len() < 100 && trimmed.chars().any(|c| c.is_uppercase()) {
|
|
let level = if trimmed.chars().all(|c| c.is_uppercase() || c.is_whitespace()) { 1 } else { 2 };
|
|
outline.push(serde_json::json!({"type":"heading","text":trimmed,"level":level}));
|
|
}
|
|
}
|
|
return Ok(serde_json::json!({
|
|
"has_ops": false,
|
|
"outline": outline,
|
|
"lists": [],
|
|
"tables": [],
|
|
"images": [],
|
|
"links": [],
|
|
"styles": {}
|
|
}));
|
|
}
|
|
};
|
|
|
|
let mut outline = Vec::new();
|
|
let mut lists = Vec::new();
|
|
let mut tables = Vec::new();
|
|
let mut images = Vec::new();
|
|
let mut links = Vec::new();
|
|
let mut styles_used: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
|
|
|
|
for op in ops.iter() {
|
|
match op {
|
|
DocxOp::Heading { text, style } => {
|
|
let level = style.chars().last().and_then(|c| c.to_digit(10)).map(|d| d as usize).unwrap_or(1);
|
|
outline.push(serde_json::json!({"text": text, "level": level}));
|
|
}
|
|
DocxOp::List { items, .. } => {
|
|
lists.push(serde_json::json!({"level": 0, "items": items}));
|
|
}
|
|
DocxOp::ListItem { text, level, .. } => {
|
|
lists.push(serde_json::json!({"level": level, "items": [text]}));
|
|
}
|
|
DocxOp::Table { data } => {
|
|
let rows = data.rows.len();
|
|
let cols = data.rows.first().map(|r| r.len()).unwrap_or(0);
|
|
tables.push(serde_json::json!({"rows": rows, "cols": cols}));
|
|
}
|
|
DocxOp::Image { width, height, .. } => {
|
|
images.push(serde_json::json!({"width": width, "height": height}));
|
|
}
|
|
DocxOp::Hyperlink { text, url } => {
|
|
links.push(serde_json::json!({"text": text, "url": url}));
|
|
}
|
|
DocxOp::Paragraph { style, .. } => {
|
|
if let Some(s) = style {
|
|
if s.bold == Some(true) { *styles_used.entry("bold".into()).or_default() += 1; }
|
|
if s.italic == Some(true) { *styles_used.entry("italic".into()).or_default() += 1; }
|
|
if s.underline == Some(true) { *styles_used.entry("underline".into()).or_default() += 1; }
|
|
if s.font_family.is_some() { *styles_used.entry("font_family".into()).or_default() += 1; }
|
|
if s.font_size.is_some() { *styles_used.entry("font_size".into()).or_default() += 1; }
|
|
if s.color.is_some() { *styles_used.entry("color".into()).or_default() += 1; }
|
|
if s.alignment.is_some() { *styles_used.entry("alignment".into()).or_default() += 1; }
|
|
}
|
|
}
|
|
DocxOp::Header(_) | DocxOp::Footer(_) | DocxOp::PageBreak | DocxOp::SectionBreak { .. } => {}
|
|
DocxOp::Toc { .. } => {}
|
|
DocxOp::BookmarkAfterHeading { .. } => {}
|
|
}
|
|
}
|
|
|
|
Ok(serde_json::json!({
|
|
"has_ops": true,
|
|
"outline": outline,
|
|
"lists": lists,
|
|
"tables": tables,
|
|
"images": images,
|
|
"links": links,
|
|
"styles": styles_used,
|
|
}))
|
|
}
|
|
|
|
pub fn extract_text(&self, doc_id: &str) -> Result<String> {
|
|
let _metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
// Use pure Rust text extraction
|
|
use crate::pure_converter::PureRustConverter;
|
|
let converter = PureRustConverter::new();
|
|
let text = converter.extract_text_from_docx(&self.documents.get(doc_id).unwrap().path)
|
|
.with_context(|| format!("Failed to extract text from document {}", doc_id))?;
|
|
|
|
Ok(text)
|
|
}
|
|
|
|
pub fn get_metadata(&self, doc_id: &str) -> Result<DocxMetadata> {
|
|
self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))
|
|
.map(|m| m.clone())
|
|
}
|
|
|
|
/// Update document core properties stored in our metadata (best-effort)
|
|
pub fn set_document_properties(
|
|
&mut self,
|
|
doc_id: &str,
|
|
title: Option<String>,
|
|
subject: Option<String>,
|
|
author: Option<String>,
|
|
) -> Result<()> {
|
|
let meta = self.documents.get_mut(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
if let Some(t) = title { meta.title = Some(t); }
|
|
if let Some(s) = subject { meta.subject = Some(s); }
|
|
if let Some(a) = author { meta.author = Some(a); }
|
|
Ok(())
|
|
}
|
|
|
|
pub fn get_document_properties_json(&self, doc_id: &str) -> Result<serde_json::Value> {
|
|
let meta = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
Ok(serde_json::json!({
|
|
"title": meta.title,
|
|
"subject": meta.subject,
|
|
"author": meta.author,
|
|
"created_at": meta.created_at,
|
|
"modified_at": meta.modified_at,
|
|
}))
|
|
}
|
|
|
|
/// Insert a paragraph after the first heading that matches `heading_text`
|
|
pub fn insert_after_heading(&mut self, doc_id: &str, heading_text: &str, text: &str) -> Result<bool> {
|
|
self.ensure_modifiable(doc_id)?;
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
if let Some(pos) = ops.iter().position(|op| matches!(op, DocxOp::Heading { text: t, .. } if t == heading_text)) {
|
|
ops.insert(pos + 1, DocxOp::Paragraph { text: text.to_string(), style: None });
|
|
self.write_docx(doc_id)?;
|
|
return Ok(true);
|
|
}
|
|
Ok(false)
|
|
}
|
|
|
|
/// Remove external hyperlinks (basic sanitizer)
|
|
pub fn sanitize_external_links(&mut self, doc_id: &str) -> Result<usize> {
|
|
self.ensure_modifiable(doc_id)?;
|
|
let removed = {
|
|
let ops = self.in_memory_ops.get_mut(doc_id).unwrap();
|
|
let before = ops.len();
|
|
ops.retain(|op| match op {
|
|
DocxOp::Hyperlink { url, .. } => {
|
|
let lower = url.to_lowercase();
|
|
!(lower.starts_with("http://") || lower.starts_with("https://"))
|
|
}
|
|
_ => true,
|
|
});
|
|
before.saturating_sub(ops.len())
|
|
};
|
|
self.write_docx(doc_id)?;
|
|
Ok(removed)
|
|
}
|
|
|
|
/// Redact text using advanced find/replace with a block character
|
|
pub fn redact_text(&mut self, doc_id: &str, pattern: &str, use_regex: bool, whole_word: bool, case_sensitive: bool) -> Result<usize> {
|
|
self.find_and_replace_advanced(doc_id, pattern, "█", case_sensitive, whole_word, use_regex)
|
|
}
|
|
|
|
pub fn save_document(&self, doc_id: &str, output_path: &Path) -> Result<()> {
|
|
let metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
fs::copy(&metadata.path, output_path)
|
|
.with_context(|| format!("Failed to save document to {:?}", output_path))?;
|
|
|
|
info!("Saved document {} to {:?}", doc_id, output_path);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn close_document(&mut self, doc_id: &str) -> Result<()> {
|
|
let metadata = self.documents.remove(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
|
|
if metadata.path.exists() {
|
|
fs::remove_file(&metadata.path)?;
|
|
}
|
|
self.in_memory_ops.remove(doc_id);
|
|
|
|
info!("Closed document {}", doc_id);
|
|
Ok(())
|
|
}
|
|
|
|
pub fn list_documents(&self) -> Vec<DocxMetadata> {
|
|
self.documents.values().cloned().collect()
|
|
}
|
|
|
|
pub fn temp_dir_path(&self) -> PathBuf {
|
|
self.temp_dir.clone()
|
|
}
|
|
|
|
pub fn get_storage_info(&self) -> Result<serde_json::Value> {
|
|
use std::time::UNIX_EPOCH;
|
|
let mut total_bytes: u64 = 0;
|
|
let mut file_count: u64 = 0;
|
|
let mut oldest: Option<u64> = None;
|
|
let mut newest: Option<u64> = None;
|
|
if self.temp_dir.exists() {
|
|
for entry in walkdir::WalkDir::new(&self.temp_dir).into_iter().filter_map(|e| e.ok()) {
|
|
if entry.file_type().is_file() {
|
|
file_count += 1;
|
|
if let Ok(meta) = entry.metadata() {
|
|
total_bytes = total_bytes.saturating_add(meta.len());
|
|
if let Ok(modified) = meta.modified() {
|
|
if let Ok(secs) = modified.duration_since(UNIX_EPOCH) {
|
|
let ts = secs.as_secs();
|
|
oldest = Some(oldest.map_or(ts, |o| o.min(ts)));
|
|
newest = Some(newest.map_or(ts, |n| n.max(ts)));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
Ok(serde_json::json!({
|
|
"success": true,
|
|
"storage": {
|
|
"base_dir": self.temp_dir,
|
|
"file_count": file_count,
|
|
"total_bytes": total_bytes,
|
|
"oldest_modified": oldest,
|
|
"newest_modified": newest,
|
|
}
|
|
}))
|
|
}
|
|
}
|
|
|
|
#[derive(Debug, Clone)]
|
|
enum DocxOp {
|
|
Paragraph { text: String, style: Option<DocxStyle> },
|
|
Heading { text: String, style: String },
|
|
Table { data: TableData },
|
|
List { items: Vec<String>, ordered: bool },
|
|
ListItem { text: String, level: usize, ordered: bool },
|
|
PageBreak,
|
|
Header(String),
|
|
Footer(String),
|
|
Image { data: Vec<u8>, width: u32, height: u32, alt_text: Option<String> },
|
|
Hyperlink { text: String, url: String },
|
|
SectionBreak { page_size: Option<String>, orientation: Option<String>, margins: Option<MarginsSpec> },
|
|
Toc { from_level: usize, to_level: usize, right_align_dots: bool },
|
|
BookmarkAfterHeading { heading_text: String, name: String },
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct MarginsSpec {
|
|
pub top: Option<f32>,
|
|
pub bottom: Option<f32>,
|
|
pub left: Option<f32>,
|
|
pub right: Option<f32>,
|
|
}
|
|
|
|
impl DocxHandler {
|
|
fn ensure_modifiable(&self, doc_id: &str) -> Result<()> {
|
|
if !self.in_memory_ops.contains_key(doc_id) {
|
|
anyhow::bail!("Modifications are supported only for documents created by this server (doc_id: {})", doc_id);
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
fn write_docx(&self, doc_id: &str) -> Result<()> {
|
|
let metadata = self.documents.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("Document not found: {}", doc_id))?;
|
|
let ops = self.in_memory_ops.get(doc_id)
|
|
.ok_or_else(|| anyhow::anyhow!("No in-memory ops for document: {}", doc_id))?;
|
|
|
|
let mut docx = Docx::new();
|
|
let mut header_text: Option<String> = None;
|
|
let mut footer_text: Option<String> = None;
|
|
|
|
for op in ops {
|
|
match op {
|
|
DocxOp::Paragraph { text, style } => {
|
|
let mut run = Run::new().add_text(text);
|
|
if let Some(st) = style {
|
|
if let Some(size) = st.font_size { run = run.size(size); }
|
|
if st.bold == Some(true) { run = run.bold(); }
|
|
if st.italic == Some(true) { run = run.italic(); }
|
|
if st.underline == Some(true) { run = run.underline("single"); }
|
|
if let Some(color) = &st.color { run = run.color(color.clone()); }
|
|
}
|
|
let para = Paragraph::new().add_run(run);
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::Heading { text, style } => {
|
|
let para = Paragraph::new().add_run(Run::new().add_text(text)).style(style);
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::Table { data } => {
|
|
let col_count = data.rows.get(0).map(|r| r.len()).unwrap_or(0);
|
|
// Note: docx-rs Table::new takes rows, not grid. We'll add rows then (optionally) rely on defaults.
|
|
let mut table = Table::new(vec![]);
|
|
|
|
// Note: We rely on XML post-processing to inject tblGrid widths when feature enabled.
|
|
|
|
// Pre-compute merge coverage map (best-effort)
|
|
use std::collections::HashSet;
|
|
let mut covered: HashSet<(usize, usize)> = HashSet::new();
|
|
let mut topleft: HashSet<(usize, usize)> = HashSet::new();
|
|
if let Some(merges) = &data.merges {
|
|
for m in merges {
|
|
topleft.insert((m.row, m.col));
|
|
for dr in 0..m.row_span.max(1) {
|
|
for dc in 0..m.col_span.max(1) {
|
|
covered.insert((m.row + dr, m.col + dc));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
let has_header = data.headers.as_ref().map(|h| !h.is_empty()).unwrap_or(false);
|
|
for (ri, row) in data.rows.iter().enumerate() {
|
|
let mut cells: Vec<TableCell> = Vec::new();
|
|
for (ci, cell_text) in row.iter().enumerate() {
|
|
let pos = (ri, ci);
|
|
// Only render text in top-left cell of a merge region; others empty
|
|
let text_to_render = if covered.contains(&pos) && !topleft.contains(&pos) { "" } else { cell_text.as_str() };
|
|
let mut para = Paragraph::new().add_run(Run::new().add_text(text_to_render));
|
|
if has_header && ri == 0 {
|
|
// Mark first row as header style; post-processing will add style definition
|
|
para = para.style("TableHeader");
|
|
}
|
|
let cell = TableCell::new().add_paragraph(para);
|
|
cells.push(cell);
|
|
}
|
|
while cells.len() < col_count { cells.push(TableCell::new()); }
|
|
table = table.add_row(TableRow::new(cells));
|
|
}
|
|
docx = docx.add_table(table);
|
|
}
|
|
DocxOp::List { items, ordered } => {
|
|
// Use separate numbering ids for ordered vs unordered so we can post-process numbering.xml
|
|
let (abstract_id, concrete_id) = if *ordered { (10usize, 11usize) } else { (20usize, 21usize) };
|
|
docx = docx
|
|
.add_abstract_numbering(docx_rs::AbstractNumbering::new(abstract_id))
|
|
.add_numbering(docx_rs::Numbering::new(concrete_id, abstract_id));
|
|
for item in items {
|
|
let para = Paragraph::new()
|
|
.add_run(Run::new().add_text(item))
|
|
.numbering(NumberingId::new(concrete_id), IndentLevel::new(0));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
}
|
|
DocxOp::ListItem { text, level, ordered } => {
|
|
let (abstract_id, concrete_id) = if *ordered { (10usize, 11usize) } else { (20usize, 21usize) };
|
|
docx = docx
|
|
.add_abstract_numbering(docx_rs::AbstractNumbering::new(abstract_id))
|
|
.add_numbering(docx_rs::Numbering::new(concrete_id, abstract_id));
|
|
let para = Paragraph::new()
|
|
.add_run(Run::new().add_text(text))
|
|
.numbering(NumberingId::new(concrete_id), IndentLevel::new(*level));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::PageBreak => {
|
|
let para = Paragraph::new().add_run(Run::new().add_break(BreakType::Page));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::Header(text) => { header_text = Some(text.clone()); }
|
|
DocxOp::Footer(text) => { footer_text = Some(text.clone()); }
|
|
DocxOp::Image { data, width, height, alt_text: _ } => {
|
|
let run = Run::new();
|
|
let pic = Pic::new_with_dimensions(data.clone(), *width, *height);
|
|
let para = Paragraph::new().add_run(run.add_image(pic));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::Hyperlink { text, url } => {
|
|
let link = Hyperlink::new(url, HyperlinkType::External)
|
|
.add_run(Run::new().add_text(text).color("0000FF").underline("single"));
|
|
let para = Paragraph::new().add_hyperlink(link);
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::SectionBreak { .. } => {
|
|
// Best-effort: denote a section break with a page break
|
|
let para = Paragraph::new().add_run(Run::new().add_break(BreakType::Page));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::Toc { from_level, to_level, right_align_dots } => {
|
|
// Insert a recognizable placeholder paragraph for TOC post-processing
|
|
let text = format!("__TOC__ FROM:{} TO:{} DOTS:{}", from_level, to_level, right_align_dots);
|
|
let para = Paragraph::new().add_run(Run::new().add_text(text));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
DocxOp::BookmarkAfterHeading { heading_text, name } => {
|
|
// Insert a marker paragraph that we will convert to a bookmark
|
|
let text = format!("__BOOKMARK__ '{}' '{}'" , heading_text, name);
|
|
let para = Paragraph::new().add_run(Run::new().add_text(&text));
|
|
docx = docx.add_paragraph(para);
|
|
}
|
|
}
|
|
}
|
|
|
|
if let Some(h) = header_text {
|
|
let header = Header::new().add_paragraph(Paragraph::new().add_run(Run::new().add_text(h)));
|
|
docx = docx.header(header);
|
|
}
|
|
if let Some(f) = footer_text {
|
|
let footer = Footer::new().add_paragraph(Paragraph::new().add_run(Run::new().add_text(f)));
|
|
docx = docx.footer(footer);
|
|
}
|
|
|
|
let file = File::create(&metadata.path)?;
|
|
docx.build().pack(file)?;
|
|
|
|
// Optionally post-process to inject high-fidelity XML
|
|
#[cfg(feature = "hi-fidelity-tables")]
|
|
{
|
|
self.apply_table_xml_properties(&metadata.path, ops)?;
|
|
}
|
|
#[cfg(feature = "hi-fidelity-styles")]
|
|
{
|
|
self.apply_styles_xml_properties(&metadata.path)?;
|
|
}
|
|
#[cfg(feature = "hi-fidelity-lists")]
|
|
{
|
|
self.apply_numbering_xml_properties(&metadata.path, ops)?;
|
|
}
|
|
#[cfg(feature = "hi-fidelity-sections")]
|
|
{
|
|
self.apply_section_xml_properties(&metadata.path, ops)?;
|
|
}
|
|
#[cfg(feature = "hi-fidelity-toc")]
|
|
{
|
|
self.apply_toc_xml_properties(&metadata.path)?;
|
|
}
|
|
#[cfg(feature = "hi-fidelity-bookmarks")]
|
|
{
|
|
self.apply_bookmarks_xml_properties(&metadata.path)?;
|
|
}
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-tables")]
|
|
impl DocxHandler {
|
|
fn apply_table_xml_properties(&self, docx_path: &Path, ops: &Vec<DocxOp>) -> Result<()> {
|
|
// Open existing archive
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
|
|
// Read document.xml into memory
|
|
let mut document_xml = String::new();
|
|
{
|
|
let mut f = archive.by_name("word/document.xml")?;
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut document_xml)?;
|
|
}
|
|
|
|
// Count tables and build a merge map per table based on ops order
|
|
// We assume each DocxOp::Table corresponds to a <w:tbl> in order.
|
|
let mut table_merge_specs: Vec<(Option<Vec<u32>>, Option<Vec<TableMerge>>)> = Vec::new();
|
|
for op in ops.iter() {
|
|
if let DocxOp::Table { data } = op {
|
|
table_merge_specs.push((data.col_widths.clone(), data.merges.clone()));
|
|
}
|
|
}
|
|
|
|
if table_merge_specs.is_empty() {
|
|
return Ok(());
|
|
}
|
|
|
|
// Perform a minimal XML manipulation using string operations to inject gridSpan/vMerge
|
|
// This is a best-effort approach and assumes simple structure generated by docx-rs.
|
|
// Strategy:
|
|
// - Iterate through each <w:tbl> block sequentially.
|
|
// - Within each table, iterate rows and cells; when a merge starts at (r,c), add w:gridSpan and/or w:vMerge="restart".
|
|
// - For cells covered by vertical continuation, set w:vMerge="continue" and remove text if present.
|
|
// - If col_widths provided, ensure a <w:tblGrid> with <w:gridCol w:w="..."/> entries exists.
|
|
|
|
// Split tables
|
|
let mut output = String::new();
|
|
let mut rest = document_xml.as_str();
|
|
let mut tbl_index = 0usize;
|
|
while let Some(start) = rest.find("<w:tbl") {
|
|
let (head, after_head) = rest.split_at(start);
|
|
output.push_str(head);
|
|
// Find end of table
|
|
if let Some(end) = after_head.find("</w:tbl>") {
|
|
let (tbl_block, tail) = after_head.split_at(end + "</w:tbl>".len());
|
|
let processed = self.process_single_table_xml(tbl_block, table_merge_specs.get(tbl_index))?;
|
|
output.push_str(&processed);
|
|
rest = tail;
|
|
tbl_index += 1;
|
|
} else {
|
|
// Malformed; break
|
|
output.push_str(after_head);
|
|
rest = "";
|
|
break;
|
|
}
|
|
}
|
|
output.push_str(rest);
|
|
|
|
if output != document_xml {
|
|
// Rebuild archive with modified document.xml
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
writer.start_file(name.clone(), options)?;
|
|
use std::io::{Read as _, Write as _};
|
|
if name == "word/document.xml" {
|
|
writer.write_all(output.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
fn process_single_table_xml(&self, tbl_xml: &str, spec: Option<&(Option<Vec<u32>>, Option<Vec<TableMerge>>)>) -> Result<String> {
|
|
if spec.is_none() { return Ok(tbl_xml.to_string()); }
|
|
let (col_widths, merges_opt) = spec.unwrap();
|
|
let mut out = tbl_xml.to_string();
|
|
|
|
// Ensure tblGrid
|
|
if let Some(widths) = col_widths {
|
|
if !widths.is_empty() {
|
|
if !out.contains("<w:tblGrid") {
|
|
// Insert after <w:tblPr> if present, else right after <w:tbl>
|
|
if let Some(pr_end) = out.find("</w:tblPr>") {
|
|
let insert_pos = pr_end + "</w:tblPr>".len();
|
|
let grid_xml = self.render_tbl_grid(widths);
|
|
out.insert_str(insert_pos, &grid_xml);
|
|
} else if let Some(tbl_start_end) = out.find(">") {
|
|
// after opening <w:tbl>
|
|
let insert_pos = tbl_start_end + 1;
|
|
let grid_xml = self.render_tbl_grid(widths);
|
|
out.insert_str(insert_pos, &grid_xml);
|
|
}
|
|
} else {
|
|
// Replace existing grid (supports normal and self-closing forms)
|
|
let grid_xml = self.render_tbl_grid(widths);
|
|
if let Some(gstart) = out.find("<w:tblGrid") {
|
|
let rel = &out[gstart..];
|
|
if let Some(self_close) = rel.find("/>") {
|
|
let end_abs = gstart + self_close + 2; // include "/>"
|
|
out.replace_range(gstart..end_abs, &grid_xml);
|
|
} else if let Some(gend) = rel.find("</w:tblGrid>") {
|
|
let gend_abs = gstart + gend + "</w:tblGrid>".len();
|
|
out.replace_range(gstart..gend_abs, &grid_xml);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Apply merges
|
|
if let Some(merges) = merges_opt {
|
|
// Tokenize rows and cells sequentially best-effort
|
|
let mut ri = 0usize;
|
|
let mut cursor = 0usize;
|
|
while let Some(tr_start_off) = out[cursor..].find("<w:tr") {
|
|
let tr_start = cursor + tr_start_off;
|
|
if let Some(tr_end_rel) = out[tr_start..].find("</w:tr>") {
|
|
let tr_end = tr_start + tr_end_rel + "</w:tr>".len();
|
|
let mut tr_block = out[tr_start..tr_end].to_string();
|
|
|
|
// Walk cells
|
|
let mut ci = 0usize;
|
|
let mut tr_cursor = 0usize;
|
|
while let Some(tc_start_off) = tr_block[tr_cursor..].find("<w:tc") {
|
|
let tc_start = tr_cursor + tc_start_off;
|
|
if let Some(tc_end_rel) = tr_block[tc_start..].find("</w:tc>") {
|
|
let tc_end = tc_start + tc_end_rel + "</w:tc>".len();
|
|
let mut tc_block = tr_block[tc_start..tc_end].to_string();
|
|
|
|
// Determine merge action for this cell
|
|
let mut grid_span: Option<usize> = None;
|
|
let mut vmerge: Option<&'static str> = None; // "restart" or "continue"
|
|
for m in merges {
|
|
if m.row == ri && m.col == ci {
|
|
if m.col_span > 1 { grid_span = Some(m.col_span); }
|
|
if m.row_span > 1 { vmerge = Some("restart"); }
|
|
} else if m.col == ci && ri > m.row && ri < m.row + m.row_span && ci >= m.col && ci < m.col + m.col_span {
|
|
// vertically covered cell
|
|
if m.row_span > 1 { vmerge = Some("continue"); }
|
|
}
|
|
}
|
|
|
|
if grid_span.is_some() || vmerge.is_some() {
|
|
// Ensure <w:tcPr> exists
|
|
if let Some(pr_start) = tc_block.find("<w:tcPr>") {
|
|
let insert_at = pr_start + "<w:tcPr>".len();
|
|
let mut props = String::new();
|
|
if let Some(span) = grid_span { props.push_str(&format!("<w:gridSpan w:val=\"{}\"/>", span)); }
|
|
if let Some(vm) = vmerge { props.push_str(&format!("<w:vMerge w:val=\"{}\"/>", vm)); }
|
|
tc_block.insert_str(insert_at, &props);
|
|
} else {
|
|
// Insert tcPr after <w:tc>
|
|
if let Some(tc_open_end) = tc_block.find(">") {
|
|
let insert_at = tc_open_end + 1;
|
|
let mut props = String::new();
|
|
props.push_str("<w:tcPr>");
|
|
if let Some(span) = grid_span { props.push_str(&format!("<w:gridSpan w:val=\"{}\"/>", span)); }
|
|
if let Some(vm) = vmerge { props.push_str(&format!("<w:vMerge w:val=\"{}\"/>", vm)); }
|
|
props.push_str("</w:tcPr>");
|
|
tc_block.insert_str(insert_at, &props);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Replace back this cell
|
|
tr_block.replace_range(tc_start..tc_end, &tc_block);
|
|
tr_cursor = tc_start + tc_block.len();
|
|
ci += 1;
|
|
} else { break; }
|
|
}
|
|
|
|
// Replace back this row
|
|
out.replace_range(tr_start..tr_end, &tr_block);
|
|
cursor = tr_start + tr_block.len();
|
|
ri += 1;
|
|
} else { break; }
|
|
}
|
|
}
|
|
|
|
Ok(out)
|
|
}
|
|
|
|
fn render_tbl_grid(&self, widths: &Vec<u32>) -> String {
|
|
let mut s = String::from("<w:tblGrid>");
|
|
for w in widths.iter() {
|
|
s.push_str(&format!("<w:gridCol w:w=\"{}\"/>", w));
|
|
}
|
|
s.push_str("</w:tblGrid>");
|
|
s
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-toc")]
|
|
impl DocxHandler {
|
|
fn apply_toc_xml_properties(&self, docx_path: &Path) -> Result<()> {
|
|
// Replace any __TOC__ placeholder paragraph with a field code TOC
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
let mut document_xml = String::new();
|
|
{
|
|
let mut f = archive.by_name("word/document.xml")?;
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut document_xml)?;
|
|
}
|
|
if !document_xml.contains("__TOC__") { return Ok(()); }
|
|
|
|
// Simple replacement: any paragraph containing __TOC__ becomes a standard TOC field
|
|
let toc_field_runs = r#"
|
|
<w:p>
|
|
<w:r><w:fldChar w:fldCharType="begin"/></w:r>
|
|
<w:r><w:instrText xml:space="preserve"> TOC \o "1-3" \h \z \u </w:instrText></w:r>
|
|
<w:r><w:fldChar w:fldCharType="separate"/></w:r>
|
|
<w:r><w:t>Table of Contents</w:t></w:r>
|
|
<w:r><w:fldChar w:fldCharType="end"/></w:r>
|
|
</w:p>
|
|
"#;
|
|
document_xml = document_xml.replace("__TOC__", "");
|
|
// Replace the whole paragraph when marker is present
|
|
// Crude but effective: replace the first parent <w:p>..</w:p> that contained the token
|
|
while let Some(pos) = document_xml.find("__TOC__") { // unlikely since we replaced above, but loop safe
|
|
// Fallback: just remove token
|
|
document_xml.replace_range(pos..pos+7, "");
|
|
}
|
|
// If there was at least one token originally, ensure we have one TOC block appended at top
|
|
if let Some(body_pos) = document_xml.find("<w:body>") {
|
|
let insert_at = body_pos + "<w:body>".len();
|
|
document_xml.insert_str(insert_at, toc_field_runs);
|
|
}
|
|
|
|
// Write back
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
use std::io::{Read as _, Write as _};
|
|
writer.start_file(name.clone(), options)?;
|
|
if name == "word/document.xml" {
|
|
writer.write_all(document_xml.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-bookmarks")]
|
|
impl DocxHandler {
|
|
fn apply_bookmarks_xml_properties(&self, docx_path: &Path) -> Result<()> {
|
|
// Convert paragraphs with __BOOKMARK__ 'Heading' 'Name' into bookmarkStart/End around following paragraph
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
let mut document_xml = String::new();
|
|
{
|
|
let mut f = archive.by_name("word/document.xml")?;
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut document_xml)?;
|
|
}
|
|
if !document_xml.contains("__BOOKMARK__") { return Ok(()); }
|
|
|
|
// Naive approach: remove marker paragraph entirely.
|
|
while let Some(p_start) = document_xml.find("<w:p>") {
|
|
if let Some(tok) = document_xml[p_start..].find("__BOOKMARK__") {
|
|
let abs = p_start + tok;
|
|
// Find paragraph bounds
|
|
if let Some(p_end_rel) = document_xml[p_start..].find("</w:p>") {
|
|
let p_end = p_start + p_end_rel + "</w:p>".len();
|
|
// Remove the marker paragraph
|
|
document_xml.replace_range(p_start..p_end, "");
|
|
continue;
|
|
}
|
|
}
|
|
break;
|
|
}
|
|
|
|
// Write back
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
use std::io::{Read as _, Write as _};
|
|
writer.start_file(name.clone(), options)?;
|
|
if name == "word/document.xml" {
|
|
writer.write_all(document_xml.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-styles")]
|
|
impl DocxHandler {
|
|
fn apply_styles_xml_properties(&self, docx_path: &Path) -> Result<()> {
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
|
|
// Read or initialize styles.xml
|
|
let mut styles_xml = String::new();
|
|
let mut has_styles = false;
|
|
if let Ok(mut f) = archive.by_name("word/styles.xml") {
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut styles_xml)?;
|
|
has_styles = true;
|
|
} else {
|
|
styles_xml = String::from("<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\"?>\
|
|
<w:styles xmlns:w=\"http://schemas.openxmlformats.org/wordprocessingml/2006/main\"></w:styles>");
|
|
}
|
|
|
|
if !styles_xml.contains("w:styleId=\"TableHeader\"") {
|
|
let style_def = concat!(
|
|
"<w:style w:type=\"paragraph\" w:styleId=\"TableHeader\">",
|
|
"<w:name w:val=\"TableHeader\"/>",
|
|
"<w:basedOn w:val=\"Normal\"/>",
|
|
"<w:qFormat/>",
|
|
"<w:rPr><w:b/><w:sz w:val=\"24\"/></w:rPr>",
|
|
"<w:pPr><w:spacing w:after=\"0\"/><w:jc w:val=\"center\"/></w:pPr>",
|
|
"</w:style>"
|
|
);
|
|
if let Some(pos) = styles_xml.rfind("</w:styles>") {
|
|
styles_xml.insert_str(pos, style_def);
|
|
}
|
|
}
|
|
|
|
// Repack archive with updated styles.xml
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
use std::io::{Read as _, Write as _};
|
|
writer.start_file(name.clone(), options)?;
|
|
if name == "word/styles.xml" {
|
|
writer.write_all(styles_xml.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
|
|
if !has_styles {
|
|
// If styles.xml was missing originally, ensure it is added
|
|
writer.start_file("word/styles.xml".to_string(), options)?;
|
|
use std::io::Write as _;
|
|
writer.write_all(styles_xml.as_bytes())?;
|
|
}
|
|
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-lists")]
|
|
impl DocxHandler {
|
|
fn apply_numbering_xml_properties(&self, docx_path: &Path, ops: &Vec<DocxOp>) -> Result<()> {
|
|
// Determine which list types are used
|
|
let mut need_ordered = false;
|
|
let mut need_unordered = false;
|
|
for op in ops.iter() {
|
|
match op {
|
|
DocxOp::List { ordered, .. } => { if *ordered { need_ordered = true; } else { need_unordered = true; } }
|
|
DocxOp::ListItem { ordered, .. } => { if *ordered { need_ordered = true; } else { need_unordered = true; } }
|
|
_ => {}
|
|
}
|
|
}
|
|
if !need_ordered && !need_unordered { return Ok(()); }
|
|
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
|
|
// Read numbering.xml
|
|
let mut numbering_xml = String::new();
|
|
{
|
|
let mut f = archive.by_name("word/numbering.xml").map_err(|_| anyhow::anyhow!("numbering.xml not found; ensure lists are added before calling"))?;
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut numbering_xml)?;
|
|
}
|
|
|
|
// Ensure abstractNum for ordered (10) and unordered (20)
|
|
if need_ordered && !numbering_xml.contains("w:abstractNumId=\"10\"") {
|
|
let block = self.make_abstract_num_block(10, false);
|
|
if let Some(pos) = numbering_xml.find("</w:numbering>") {
|
|
numbering_xml.insert_str(pos, &block);
|
|
}
|
|
}
|
|
if need_unordered && !numbering_xml.contains("w:abstractNumId=\"20\"") {
|
|
let block = self.make_abstract_num_block(20, true);
|
|
if let Some(pos) = numbering_xml.find("</w:numbering>") {
|
|
numbering_xml.insert_str(pos, &block);
|
|
}
|
|
}
|
|
|
|
// Write back
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
use std::io::{Read as _, Write as _};
|
|
writer.start_file(name.clone(), options)?;
|
|
if name == "word/numbering.xml" {
|
|
writer.write_all(numbering_xml.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
Ok(())
|
|
}
|
|
|
|
fn make_abstract_num_block(&self, abstract_id: usize, bullet: bool) -> String {
|
|
let mut s = format!("<w:abstractNum w:abstractNumId=\"{}\">", abstract_id);
|
|
for lvl in 0..9 {
|
|
let (fmt, txt) = if bullet { ("bullet", "•") } else { ("decimal", match lvl { 0 => "%1.", 1 => "%2.", 2 => "%3.", 3 => "%4.", 4 => "%5.", 5 => "%6.", 6 => "%7.", 7 => "%8.", _ => "%9." }) };
|
|
let lvl_text = if bullet { "•".to_string() } else { txt.to_string() };
|
|
s.push_str(&format!(
|
|
concat!(
|
|
"<w:lvl w:ilvl=\"{lvl}\">",
|
|
"<w:start w:val=\"1\"/>",
|
|
"<w:numFmt w:val=\"{fmt}\"/>",
|
|
"<w:lvlText w:val=\"{lvl_text}\"/>",
|
|
"<w:lvlJc w:val=\"left\"/>",
|
|
"<w:pPr><w:ind w:left=\"{left}\" w:hanging=\"{hang}\"/></w:pPr>",
|
|
"</w:lvl>"
|
|
),
|
|
lvl=lvl,
|
|
fmt=fmt,
|
|
lvl_text=lvl_text,
|
|
left=(lvl as i32 + 1) * 720,
|
|
hang=360,
|
|
));
|
|
}
|
|
s.push_str("</w:abstractNum>");
|
|
s
|
|
}
|
|
}
|
|
|
|
#[cfg(feature = "hi-fidelity-sections")]
|
|
impl DocxHandler {
|
|
fn apply_section_xml_properties(&self, docx_path: &Path, ops: &Vec<DocxOp>) -> Result<()> {
|
|
// Use the last section break spec, if any
|
|
let mut last_spec: Option<(Option<String>, Option<String>, Option<MarginsSpec>)> = None;
|
|
for op in ops.iter() {
|
|
if let DocxOp::SectionBreak { page_size, orientation, margins } = op {
|
|
last_spec = Some((page_size.clone(), orientation.clone(), margins.clone()));
|
|
}
|
|
}
|
|
if last_spec.is_none() { return Ok(()); }
|
|
let (page_size, orientation, margins) = last_spec.unwrap();
|
|
|
|
let (mut w, mut h) = match page_size.as_deref() {
|
|
Some("Letter") => (12240i32, 15840i32), // 8.5x11 in
|
|
_ => (11906i32, 16838i32), // default A4 210x297mm
|
|
};
|
|
if orientation.as_deref() == Some("landscape") {
|
|
std::mem::swap(&mut w, &mut h);
|
|
}
|
|
let margins = margins.unwrap_or(MarginsSpec { top: Some(1.0), bottom: Some(1.0), left: Some(1.0), right: Some(1.0) });
|
|
let to_twips = |opt: Option<f32>| -> i32 { ((opt.unwrap_or(1.0) * 1440.0).round() as i32).max(0) };
|
|
let mt = to_twips(margins.top);
|
|
let mb = to_twips(margins.bottom);
|
|
let ml = to_twips(margins.left);
|
|
let mr = to_twips(margins.right);
|
|
|
|
let sect_pr = if orientation.as_deref() == Some("landscape") {
|
|
format!("<w:sectPr><w:pgSz w:w=\"{}\" w:h=\"{}\" w:orient=\"landscape\"/><w:pgMar w:top=\"{}\" w:bottom=\"{}\" w:left=\"{}\" w:right=\"{}\"/></w:sectPr>", w, h, mt, mb, ml, mr)
|
|
} else {
|
|
format!("<w:sectPr><w:pgSz w:w=\"{}\" w:h=\"{}\"/><w:pgMar w:top=\"{}\" w:bottom=\"{}\" w:left=\"{}\" w:right=\"{}\"/></w:sectPr>", w, h, mt, mb, ml, mr)
|
|
};
|
|
|
|
let src_file = std::fs::File::open(docx_path)?;
|
|
let mut archive = ZipArchive::new(src_file)?;
|
|
let mut document_xml = String::new();
|
|
{
|
|
let mut f = archive.by_name("word/document.xml")?;
|
|
use std::io::Read as _;
|
|
f.read_to_string(&mut document_xml)?;
|
|
}
|
|
|
|
if let Some(pos) = document_xml.rfind("</w:body>") {
|
|
// Replace existing sectPr if present near end
|
|
if let Some(existing_start_rel) = document_xml[..pos].rfind("<w:sectPr") {
|
|
let closing_rel = document_xml[existing_start_rel..].find("</w:sectPr>");
|
|
if let Some(closing_rel) = closing_rel {
|
|
let closing_abs = existing_start_rel + closing_rel + "</w:sectPr>".len();
|
|
document_xml.replace_range(existing_start_rel..closing_abs, §_pr);
|
|
} else {
|
|
document_xml.insert_str(pos, §_pr);
|
|
}
|
|
} else {
|
|
document_xml.insert_str(pos, §_pr);
|
|
}
|
|
}
|
|
|
|
// Write back
|
|
let temp_path = docx_path.with_extension("docx.tmp");
|
|
let dst_file = std::fs::File::create(&temp_path)?;
|
|
let mut writer = ZipWriter::new(dst_file);
|
|
let options = FileOptions::default().compression_method(zip::CompressionMethod::Stored);
|
|
for i in 0..archive.len() {
|
|
let mut file = archive.by_index(i)?;
|
|
let name = file.name().to_string();
|
|
use std::io::{Read as _, Write as _};
|
|
writer.start_file(name.clone(), options)?;
|
|
if name == "word/document.xml" {
|
|
writer.write_all(document_xml.as_bytes())?;
|
|
} else {
|
|
let mut buf = Vec::new();
|
|
file.read_to_end(&mut buf)?;
|
|
writer.write_all(&buf)?;
|
|
}
|
|
}
|
|
writer.finish()?;
|
|
std::fs::rename(&temp_path, docx_path)?;
|
|
Ok(())
|
|
}
|
|
} |