feat(docx): add hi-fidelity XML injections for tables, styles, lists, and sections; extend tools and tests

- Add feature flags: hi-fidelity-tables, hi-fidelity-styles, hi-fidelity-lists, hi-fidelity-sections
- Tables: inject true w:gridSpan/w:vMerge and w:tblGrid widths via post-build XML when enabled
- Styles: ensure TableHeader style in styles.xml; tag first row when headers present
- Lists: robust numbering.xml for ordered/unordered with multi-level definitions
- Sections: write tail w:sectPr with page size/orientation/margins
- Tools: expose new operations (sections, list items, images, hyperlinks, props, redaction, storage)
- Converters: add preference-aware methods for hi-fidelity export paths; add HTML export tool
- Tests: add golden XML assertions gated by feature flags; keep default build green

This enables high-fidelity DOCX output while keeping pure-Rust paths by default.
This commit is contained in:
Andy
2025-08-12 23:25:29 +08:00
parent c30f55d16d
commit 90305551cc
14 changed files with 1983 additions and 277 deletions
+36 -5
View File
@@ -21,7 +21,7 @@ impl DocumentConverter {
/// Builds a converter backed by a fresh `PureRustConverter`.
///
/// NOTE(review): the two `prefer_external_tools` lines below appear to be the
/// before/after sides of this diff hunk, not two initializers in the real file.
pub fn new() -> Self {
Self {
pure_converter: PureRustConverter::new(),
prefer_external_tools: false, // Default to pure Rust implementation
// NOTE(review): the commit message lists only `hi-fidelity-tables`, `-styles`,
// `-lists` and `-sections`; confirm a plain `hi-fidelity` feature is declared,
// otherwise this `cfg!` always evaluates to `false`.
prefer_external_tools: cfg!(feature = "hi-fidelity"), // Prefer external/hi-fi if feature enabled
}
}
@@ -47,6 +47,24 @@ impl DocumentConverter {
Ok(())
}
/// Converts a DOCX file to PDF, with the caller deciding whether external tools go first.
///
/// With `prefer_external == true`, LibreOffice is attempted first and unoconv second; the
/// first one that succeeds ends the call. An external tool's failure is discarded (its error
/// is not reported) and the next option is tried. With `prefer_external == false`, or when
/// both external tools fail, the pure Rust converter performs the conversion.
///
/// # Errors
/// Propagates the error of the pure Rust converter when that final fallback fails.
pub fn docx_to_pdf_with_preference(&self, docx_path: &Path, pdf_path: &Path, prefer_external: bool) -> Result<()> {
    if prefer_external {
        // First choice among the external tools.
        let libreoffice_succeeded = self.try_libreoffice_conversion(docx_path, pdf_path).is_ok();
        if libreoffice_succeeded {
            info!("Successfully converted DOCX to PDF using LibreOffice (explicit preference)");
            return Ok(());
        }

        // Second choice; only reached when LibreOffice did not succeed.
        let unoconv_succeeded = self.try_unoconv_conversion(docx_path, pdf_path).is_ok();
        if unoconv_succeeded {
            info!("Successfully converted DOCX to PDF using unoconv (explicit preference)");
            return Ok(());
        }
    }

    // Either external tools were not requested or none of them worked.
    self.pure_converter.docx_to_pdf_pure(docx_path, pdf_path)?;
    info!("Successfully converted DOCX to PDF using pure Rust implementation (explicit preference)");
    Ok(())
}
fn try_libreoffice_conversion(&self, docx_path: &Path, pdf_path: &Path) -> Result<()> {
let output = Command::new("libreoffice")
.args(&[
@@ -120,7 +138,7 @@ impl DocumentConverter {
// Create a basic PDF with the extracted text
let (doc, page1, layer1) = PdfDocument::new("Document", Mm(210.0), Mm(297.0), "Layer 1");
let current_layer = doc.get_page(page1).get_layer(layer1);
let _current_layer = doc.get_page(page1).get_layer(layer1);
// Load a basic font
let font = doc.add_builtin_font(BuiltinFont::Helvetica)?;
@@ -130,14 +148,13 @@ impl DocumentConverter {
let mut y_position = Mm(280.0);
let line_height = Mm(5.0);
let mut current_layer = doc.get_page(page1).get_layer(layer1);
for line in lines {
if y_position < Mm(20.0) {
// Add new page if needed
let (page, layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
let current_layer = doc.get_page(page).get_layer(layer);
current_layer = doc.get_page(page).get_layer(layer);
y_position = Mm(280.0);
}
current_layer.use_text(line, 12.0, Mm(10.0), y_position, &font);
y_position -= line_height;
}
@@ -343,6 +360,20 @@ impl DocumentConverter {
Ok(images)
}
/// Renders a DOCX file to images by first converting it to an intermediate PDF.
///
/// The PDF step goes through `docx_to_pdf_with_preference`, so `prefer_external` controls
/// whether external tools are tried before the pure Rust converter. The resulting PDF is
/// then passed to `pdf_to_images` together with `output_dir`, `format` and `dpi`.
///
/// # Errors
/// Fails if the temporary file cannot be created, or if either the PDF conversion or the
/// PDF-to-image step returns an error.
pub fn docx_to_images_with_preference(
    &self,
    docx_path: &Path,
    output_dir: &Path,
    format: ImageFormat,
    dpi: u32,
    prefer_external: bool,
) -> Result<Vec<PathBuf>> {
    // Scratch location for the intermediate PDF; per the `tempfile` crate the path is
    // cleaned up when `intermediate_pdf` goes out of scope at the end of this function.
    let scratch_file = NamedTempFile::new()?;
    let intermediate_pdf = scratch_file.into_temp_path();

    self.docx_to_pdf_with_preference(docx_path, &intermediate_pdf, prefer_external)?;

    let rendered_pages = self.pdf_to_images(&intermediate_pdf, output_dir, format, dpi)?;
    Ok(rendered_pages)
}
pub fn create_thumbnail(
&self,
image_path: &Path,
+929 -17
View File
File diff suppressed because it is too large Load Diff
+584 -251
View File
File diff suppressed because it is too large Load Diff
+1
View File
@@ -1,5 +1,6 @@
pub mod security;
pub mod fonts_cli;
pub mod response;
// Expose primary modules for tests and external use
pub mod docx_tools;
+2 -2
View File
@@ -2,7 +2,7 @@ use anyhow::{Context, Result};
use ::image::{DynamicImage, ImageFormat, Rgba, RgbaImage};
use printpdf::*;
use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Read};
use std::io::{BufWriter, Read};
use std::path::{Path, PathBuf};
use tempfile::NamedTempFile;
use tracing::{info};
@@ -297,7 +297,7 @@ impl PureRustConverter {
/// Merge multiple PDFs using pure Rust
pub fn merge_pdfs_pure(&self, pdf_paths: &[PathBuf], output_path: &Path) -> Result<()> {
use ::lopdf::{Document, Object, ObjectId};
use ::lopdf::{Document, Object};
// Create a new document for merging
let mut merged_doc = Document::with_version("1.5");
+42
View File
@@ -0,0 +1,42 @@
use serde::{Serialize, Deserialize};
/// Result payload produced by a tool invocation.
///
/// Serialized by serde as an internally tagged enum: the JSON object carries a
/// `"type"` field holding the variant name in snake_case (e.g. `"ok"`, `"error"`),
/// alongside the variant's own fields.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ToolOutcome {
/// Plain success with an optional human-readable message.
Ok { message: Option<String> },
/// Success that carries a document id plus an optional message.
Created { document_id: String, message: Option<String> },
/// Success carrying a text payload.
Text { text: String },
/// Success carrying an arbitrary JSON value under `metadata`.
Metadata { metadata: serde_json::Value },
/// Success carrying an arbitrary JSON value under `documents`.
Documents { documents: serde_json::Value },
/// Success carrying a list of strings under `images` (presumably paths or
/// identifiers — confirm against the producers) and an optional message.
Images { images: Vec<String>, message: Option<String> },
/// Success carrying an arbitrary JSON value under `security`.
Security { security: serde_json::Value },
/// Success carrying an arbitrary JSON value under `storage`.
Storage { storage: serde_json::Value },
/// Success carrying an arbitrary JSON value under `statistics`.
Statistics { statistics: serde_json::Value },
/// Success carrying an arbitrary JSON value under `structure`.
Structure { structure: serde_json::Value },
/// Failure: a machine-readable `code`, an `error` message, and an optional `hint`.
/// This is the only variant for which `success()` returns `false`.
Error { code: ErrorCode, error: String, hint: Option<String> },
}
/// Machine-readable error category carried by `ToolOutcome::Error`.
///
/// Serialized by serde in SCREAMING_SNAKE_CASE, e.g. `InternalError` becomes
/// `"INTERNAL_ERROR"`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
pub enum ErrorCode {
DocNotFound,
ValidationError,
SecurityDenied,
LimitExceeded,
UnknownTool,
// Also used by `ToolOutcome::into_json` as the code of its serialization-failure fallback.
InternalError,
}
impl ToolOutcome {
    /// Returns `true` for every variant except [`ToolOutcome::Error`].
    pub fn success(&self) -> bool {
        let is_error = matches!(self, Self::Error { .. });
        !is_error
    }

    /// Consumes the outcome and converts it into a `serde_json::Value`.
    ///
    /// If serialization fails, a hand-built JSON object is returned instead, with
    /// `"type": "error"`, the `InternalError` code, and a message describing the failure.
    pub fn into_json(self) -> serde_json::Value {
        match serde_json::to_value(self) {
            Ok(value) => value,
            Err(e) => serde_json::json!({
                "type": "error",
                "code": ErrorCode::InternalError,
                "error": format!("serialization failed: {}", e),
            }),
        }
    }
}