feat(docx): add hi-fidelity XML injections for tables, styles, lists, and sections; extend tools and tests
- Add feature flags: hi-fidelity-tables, hi-fidelity-styles, hi-fidelity-lists, hi-fidelity-sections
- Tables: inject true w:gridSpan/w:vMerge and w:tblGrid widths via post-build XML when enabled
- Styles: ensure TableHeader style in styles.xml; tag first row when headers present
- Lists: robust numbering.xml for ordered/unordered with multi-level definitions
- Sections: write tail w:sectPr with page size/orientation/margins
- Tools: expose new operations (sections, list items, images, hyperlinks, props, redaction, storage)
- Converters: add preference-aware methods for hi-fidelity export paths; HTML export tool
- Tests: add golden XML assertions gated by feature flags; keep default build green

This enables high-fidelity DOCX output while keeping pure-Rust paths by default.
This commit is contained in:
+36
-5
@@ -21,7 +21,7 @@ impl DocumentConverter {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
pure_converter: PureRustConverter::new(),
|
||||
prefer_external_tools: false, // Default to pure Rust implementation
|
||||
prefer_external_tools: cfg!(feature = "hi-fidelity"), // Prefer external/hi-fi if feature enabled
|
||||
}
|
||||
}
|
||||
|
||||
@@ -47,6 +47,24 @@ impl DocumentConverter {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convert with explicit preference overriding internal default
|
||||
pub fn docx_to_pdf_with_preference(&self, docx_path: &Path, pdf_path: &Path, prefer_external: bool) -> Result<()> {
|
||||
if prefer_external {
|
||||
if self.try_libreoffice_conversion(docx_path, pdf_path).is_ok() {
|
||||
info!("Successfully converted DOCX to PDF using LibreOffice (explicit preference)");
|
||||
return Ok(());
|
||||
}
|
||||
if self.try_unoconv_conversion(docx_path, pdf_path).is_ok() {
|
||||
info!("Successfully converted DOCX to PDF using unoconv (explicit preference)");
|
||||
return Ok(());
|
||||
}
|
||||
}
|
||||
// Fallback to pure implementation
|
||||
self.pure_converter.docx_to_pdf_pure(docx_path, pdf_path)?;
|
||||
info!("Successfully converted DOCX to PDF using pure Rust implementation (explicit preference)");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn try_libreoffice_conversion(&self, docx_path: &Path, pdf_path: &Path) -> Result<()> {
|
||||
let output = Command::new("libreoffice")
|
||||
.args(&[
|
||||
@@ -120,7 +138,7 @@ impl DocumentConverter {
|
||||
|
||||
// Create a basic PDF with the extracted text
|
||||
let (doc, page1, layer1) = PdfDocument::new("Document", Mm(210.0), Mm(297.0), "Layer 1");
|
||||
let current_layer = doc.get_page(page1).get_layer(layer1);
|
||||
let _current_layer = doc.get_page(page1).get_layer(layer1);
|
||||
|
||||
// Load a basic font
|
||||
let font = doc.add_builtin_font(BuiltinFont::Helvetica)?;
|
||||
@@ -130,14 +148,13 @@ impl DocumentConverter {
|
||||
let mut y_position = Mm(280.0);
|
||||
let line_height = Mm(5.0);
|
||||
|
||||
let mut current_layer = doc.get_page(page1).get_layer(layer1);
|
||||
for line in lines {
|
||||
if y_position < Mm(20.0) {
|
||||
// Add new page if needed
|
||||
let (page, layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
|
||||
let current_layer = doc.get_page(page).get_layer(layer);
|
||||
current_layer = doc.get_page(page).get_layer(layer);
|
||||
y_position = Mm(280.0);
|
||||
}
|
||||
|
||||
current_layer.use_text(line, 12.0, Mm(10.0), y_position, &font);
|
||||
y_position -= line_height;
|
||||
}
|
||||
@@ -343,6 +360,20 @@ impl DocumentConverter {
|
||||
Ok(images)
|
||||
}
|
||||
|
||||
pub fn docx_to_images_with_preference(
|
||||
&self,
|
||||
docx_path: &Path,
|
||||
output_dir: &Path,
|
||||
format: ImageFormat,
|
||||
dpi: u32,
|
||||
prefer_external: bool,
|
||||
) -> Result<Vec<PathBuf>> {
|
||||
let temp_pdf = NamedTempFile::new()?.into_temp_path();
|
||||
self.docx_to_pdf_with_preference(docx_path, &temp_pdf, prefer_external)?;
|
||||
let images = self.pdf_to_images(&temp_pdf, output_dir, format, dpi)?;
|
||||
Ok(images)
|
||||
}
|
||||
|
||||
pub fn create_thumbnail(
|
||||
&self,
|
||||
image_path: &Path,
|
||||
|
||||
+929
-17
File diff suppressed because it is too large
Load Diff
+584
-251
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,6 @@
|
||||
pub mod security;
|
||||
pub mod fonts_cli;
|
||||
pub mod response;
|
||||
|
||||
// Expose primary modules for tests and external use
|
||||
pub mod docx_tools;
|
||||
|
||||
@@ -2,7 +2,7 @@ use anyhow::{Context, Result};
|
||||
use ::image::{DynamicImage, ImageFormat, Rgba, RgbaImage};
|
||||
use printpdf::*;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, BufWriter, Read};
|
||||
use std::io::{BufWriter, Read};
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::{info};
|
||||
@@ -297,7 +297,7 @@ impl PureRustConverter {
|
||||
|
||||
/// Merge multiple PDFs using pure Rust
|
||||
pub fn merge_pdfs_pure(&self, pdf_paths: &[PathBuf], output_path: &Path) -> Result<()> {
|
||||
use ::lopdf::{Document, Object, ObjectId};
|
||||
use ::lopdf::{Document, Object};
|
||||
|
||||
// Create a new document for merging
|
||||
let mut merged_doc = Document::with_version("1.5");
|
||||
|
||||
@@ -0,0 +1,42 @@
|
||||
use serde::{Serialize, Deserialize};
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
pub enum ToolOutcome {
|
||||
Ok { message: Option<String> },
|
||||
Created { document_id: String, message: Option<String> },
|
||||
Text { text: String },
|
||||
Metadata { metadata: serde_json::Value },
|
||||
Documents { documents: serde_json::Value },
|
||||
Images { images: Vec<String>, message: Option<String> },
|
||||
Security { security: serde_json::Value },
|
||||
Storage { storage: serde_json::Value },
|
||||
Statistics { statistics: serde_json::Value },
|
||||
Structure { structure: serde_json::Value },
|
||||
Error { code: ErrorCode, error: String, hint: Option<String> },
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||
#[serde(rename_all = "SCREAMING_SNAKE_CASE")]
|
||||
pub enum ErrorCode {
|
||||
DocNotFound,
|
||||
ValidationError,
|
||||
SecurityDenied,
|
||||
LimitExceeded,
|
||||
UnknownTool,
|
||||
InternalError,
|
||||
}
|
||||
|
||||
impl ToolOutcome {
|
||||
pub fn success(&self) -> bool {
|
||||
!matches!(self, ToolOutcome::Error { .. })
|
||||
}
|
||||
|
||||
pub fn into_json(self) -> serde_json::Value {
|
||||
serde_json::to_value(self).unwrap_or_else(|e| serde_json::json!({
|
||||
"type": "error",
|
||||
"code": ErrorCode::InternalError,
|
||||
"error": format!("serialization failed: {}", e),
|
||||
}))
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user