Files
mcp-docx/tests/converter_tests.rs
T

497 lines
17 KiB
Rust

use anyhow::Result;
use docx_mcp::docx_handler::{DocxHandler, DocxStyle, TableData};
use docx_mcp::pure_converter::PureRustConverter;
use tempfile::TempDir;
use std::path::{Path, PathBuf};
use std::fs;
use pretty_assertions::assert_eq;
use rstest::*;
/// Builds the shared fixture used by the conversion tests.
///
/// Creates a fresh document through [`DocxHandler`] and fills it with a
/// level-1 title, an intro paragraph, a table (three rows of three cells, the
/// first row repeating the header cells), a three-item list and a closing
/// paragraph.
///
/// Returns `(handler, document id, temp dir)`. The [`TempDir`] is not used by
/// the fixture itself; it is handed back to the caller.
///
/// # Panics
///
/// Panics if the temp dir cannot be created or any handler call fails.
fn setup_test_handler_with_content() -> (DocxHandler, String, TempDir) {
    // Turns three string literals into a vector of owned strings.
    let owned = |cells: [&str; 3]| -> Vec<String> {
        cells.iter().map(|cell| (*cell).to_string()).collect()
    };

    let scratch = TempDir::new().unwrap();
    let mut docx = DocxHandler::new().unwrap();
    let id = docx.create_document().unwrap();

    // Title and introduction.
    docx.add_heading(&id, "Test Document Title", 1).unwrap();
    docx.add_paragraph(
        &id,
        "This is a comprehensive test document with various content types.",
        None,
    )
    .unwrap();

    // Table section: the header cells appear both as the first row and as
    // the explicit `headers` value.
    docx.add_heading(&id, "Table Example", 2).unwrap();
    let header = owned(["Product", "Price", "Quantity"]);
    let table = TableData {
        rows: vec![
            header.clone(),
            owned(["Widget A", "$10.00", "5"]),
            owned(["Widget B", "$15.00", "3"]),
        ],
        headers: Some(header),
        border_style: Some(String::from("single")),
    };
    docx.add_table(&id, table).unwrap();

    // List section.
    docx.add_heading(&id, "List Example", 2).unwrap();
    let bullets = owned([
        "First important point",
        "Second key feature",
        "Third critical aspect",
    ]);
    docx.add_list(&id, bullets, false).unwrap();

    // Closing paragraph.
    docx.add_paragraph(
        &id,
        "Conclusion: This document demonstrates various formatting capabilities.",
        None,
    )
    .unwrap();

    (docx, id, scratch)
}
/// Smoke test: constructing a [`PureRustConverter`] must not panic.
#[test]
fn test_pure_converter_creation() {
    // Reaching the end of the function is the success condition, so no
    // assertion is needed; the value itself is intentionally unused.
    let _converter = PureRustConverter::new();
}
/// Text extraction must surface every kind of content placed in the fixture
/// document: headings, paragraphs, table cells and list items.
#[test]
fn test_extract_text_from_docx() -> Result<()> {
    let (docx, id, _scratch) = setup_test_handler_with_content();
    let meta = docx.get_metadata(&id)?;

    let converter = PureRustConverter::new();
    let text = converter.extract_text_from_docx(&meta.path)?;

    // One probe per piece of fixture content.
    let has = |needle: &str| text.contains(needle);
    assert!(has("Test Document Title"));
    assert!(has("comprehensive test document"));
    assert!(has("Table Example"));
    assert!(has("Widget A"));
    assert!(has("First important point"));
    assert!(has("Conclusion"));

    Ok(())
}
/// Extracting text from a freshly created document with no content must
/// yield nothing but (at most) whitespace.
#[test]
fn test_extract_text_empty_document() -> Result<()> {
    // Setup failures are propagated with `?` so they are reported as test
    // errors rather than panics, matching the rest of this function.
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;
    let metadata = handler.get_metadata(&doc_id)?;

    let converter = PureRustConverter::new();
    let extracted_text = converter.extract_text_from_docx(&metadata.path)?;

    assert!(
        extracted_text.trim().is_empty(),
        "expected no text from an empty document, got: {:?}",
        extracted_text
    );
    Ok(())
}
/// Converting the fixture document must produce a file on disk that is
/// larger than 1000 bytes and begins with the `%PDF` magic bytes.
#[test]
fn test_convert_docx_to_pdf_basic() -> Result<()> {
    let (docx, id, scratch) = setup_test_handler_with_content();
    let meta = docx.get_metadata(&id)?;
    let converter = PureRustConverter::new();

    let pdf_target = scratch.path().join("test_output.pdf");
    converter.convert_docx_to_pdf(&meta.path, &pdf_target)?;

    // The output file must exist ...
    assert!(pdf_target.exists());

    // ... be bigger than a trivially small file ...
    let byte_len = fs::metadata(&pdf_target)?.len();
    assert!(byte_len > 1000);

    // ... and carry the PDF header signature.
    let bytes = fs::read(&pdf_target)?;
    assert!(bytes.starts_with(b"%PDF"));

    Ok(())
}
/// Converts a document containing non-ASCII text, an explicitly styled
/// paragraph and headings at levels 1 through 3, and checks that a PDF of
/// more than 500 bytes is written.
#[test]
fn test_convert_docx_to_pdf_with_complex_content() -> Result<()> {
    // Setup errors are propagated with `?`, consistent with the handler
    // calls below.
    let temp_dir = TempDir::new()?;
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    // Text spanning several scripts (Latin with diacritics, CJK, Cyrillic, Arabic).
    handler.add_paragraph(&doc_id, "Special characters: éñüñ, 中文, русский, العربية", None)?;

    // A paragraph with every style field populated.
    let style = DocxStyle {
        font_family: Some("Arial".to_string()),
        font_size: Some(16),
        bold: Some(true),
        italic: Some(false),
        underline: Some(true),
        color: Some("#FF0000".to_string()),
        alignment: Some("center".to_string()),
        line_spacing: Some(1.5),
    };
    handler.add_paragraph(&doc_id, "Bold and underlined text", Some(style))?;

    // One heading per level.
    for level in 1..=3 {
        handler.add_heading(&doc_id, &format!("Heading Level {}", level), level)?;
    }

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();
    let output_path = temp_dir.path().join("complex_output.pdf");
    converter.convert_docx_to_pdf(&metadata.path, &output_path)?;

    assert!(output_path.exists());
    let file_size = fs::metadata(&output_path)?.len();
    // Minimum-size sanity check only; the bound is a floor, not a comparison
    // against any other test's output.
    assert!(file_size > 500);
    Ok(())
}
/// Rendering the fixture document to images must yield at least one file;
/// every file must exist and exceed 100 bytes, and any file with a `png`
/// extension must start with the PNG magic bytes.
#[test]
fn test_convert_docx_to_images() -> Result<()> {
    let (docx, id, scratch) = setup_test_handler_with_content();
    let meta = docx.get_metadata(&id)?;
    let converter = PureRustConverter::new();

    let images_dir = scratch.path().join("images");
    fs::create_dir_all(&images_dir)?;

    let rendered = converter.convert_docx_to_images(&meta.path, &images_dir)?;
    assert!(!rendered.is_empty());

    for page in &rendered {
        assert!(page.exists(), "Generated image should exist: {:?}", page);

        let byte_len = fs::metadata(page)?.len();
        assert!(byte_len > 100, "Image file should have reasonable size");

        // The signature is only checked for files that claim to be PNG.
        let is_png = matches!(page.extension().and_then(|ext| ext.to_str()), Some("png"));
        if !is_png {
            continue;
        }
        let bytes = fs::read(page)?;
        assert!(bytes.starts_with(&[0x89, 0x50, 0x4E, 0x47]), "Should be valid PNG");
    }

    Ok(())
}
/// Requests JPEG output at 150 DPI and checks that every produced file
/// exists and starts with the JPEG magic bytes.
#[test]
fn test_convert_docx_to_images_custom_format() -> Result<()> {
    let (handler, doc_id, temp_dir) = setup_test_handler_with_content();
    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    let output_dir = temp_dir.path().join("jpeg_images");
    fs::create_dir_all(&output_dir)?;

    let image_paths =
        converter.convert_docx_to_images_with_format(&metadata.path, &output_dir, "jpeg", 150)?;
    assert!(!image_paths.is_empty());

    for image_path in &image_paths {
        assert!(image_path.exists());

        // Check the signature unconditionally: gating it on the file
        // extension would let the test pass without verifying anything if
        // the converter emitted files with a different (or no) extension.
        let image_content = fs::read(image_path)?;
        assert!(
            image_content.starts_with(&[0xFF, 0xD8, 0xFF]),
            "Should be valid JPEG"
        );
    }
    Ok(())
}
/// Converts a document containing ASCII text, accented Latin characters and
/// math symbols, and checks that a PDF of more than 1000 bytes is written.
#[test]
fn test_pdf_generation_with_embedded_fonts() -> Result<()> {
    // Setup errors are propagated with `?`, consistent with the handler
    // calls below.
    let temp_dir = TempDir::new()?;
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    // Text that might require different fonts.
    handler.add_paragraph(&doc_id, "Regular ASCII text", None)?;
    handler.add_paragraph(&doc_id, "Unicode: àáâãäå çèéêë ìíîï ñòóôõö ùúûü ýÿ", None)?;
    handler.add_paragraph(&doc_id, "Math symbols: ∑ ∏ ∫ √ ≤ ≥ ≠ ± ∞", None)?;

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();
    let output_path = temp_dir.path().join("embedded_fonts.pdf");
    converter.convert_docx_to_pdf(&metadata.path, &output_path)?;

    assert!(output_path.exists());
    let file_size = fs::metadata(&output_path)?.len();
    // Size floor only: this does not inspect the PDF for font objects, so it
    // cannot by itself prove that fonts were embedded.
    assert!(file_size > 1000);
    Ok(())
}
/// Creates three single-paragraph documents, converts each to a PDF in one
/// shared output directory, and checks that exactly three `.pdf` files exist
/// there afterwards.
#[test]
fn test_batch_conversion() -> Result<()> {
    // Setup errors are propagated with `?` rather than unwrapped, since the
    // test already returns `Result`.
    let temp_dir = TempDir::new()?;
    let mut handler = DocxHandler::new()?;

    // Create multiple documents and remember where each one lives.
    let mut doc_paths = Vec::new();
    for i in 0..3 {
        let doc_id = handler.create_document()?;
        handler.add_paragraph(&doc_id, &format!("Document {} content", i), None)?;
        let metadata = handler.get_metadata(&doc_id)?;
        doc_paths.push(metadata.path);
    }

    let converter = PureRustConverter::new();
    let output_dir = temp_dir.path().join("batch_output");
    fs::create_dir_all(&output_dir)?;

    // Convert all documents to PDF.
    for (i, doc_path) in doc_paths.iter().enumerate() {
        let output_path = output_dir.join(format!("document_{}.pdf", i));
        converter.convert_docx_to_pdf(doc_path, &output_path)?;
        assert!(output_path.exists());
    }

    // Count the PDFs in the output directory. Entry-level I/O errors are
    // propagated instead of being silently skipped, so a failed directory
    // read is not mistaken for a missing file.
    let mut pdf_count = 0usize;
    for entry in fs::read_dir(&output_dir)? {
        let path = entry?.path();
        if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
            pdf_count += 1;
        }
    }
    assert_eq!(pdf_count, 3);
    Ok(())
}
/// A file that has a `.docx` name but plain-text content must be rejected by
/// both text extraction and PDF conversion with an `Err`.
#[test]
fn test_error_handling_invalid_docx() {
    let scratch = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // Plain text stored under a .docx name.
    let bogus = scratch.path().join("fake.docx");
    fs::write(&bogus, "This is not a DOCX file").unwrap();

    // Text extraction must report an error ...
    assert!(converter.extract_text_from_docx(&bogus).is_err());

    // ... and so must PDF conversion.
    let pdf_target = scratch.path().join("output.pdf");
    assert!(converter.convert_docx_to_pdf(&bogus, &pdf_target).is_err());
}
/// Pointing the converter at a path that does not exist must produce an
/// `Err` from both text extraction and PDF conversion.
#[test]
fn test_error_handling_nonexistent_file() {
    let scratch = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // Never created on disk.
    let missing = scratch.path().join("nonexistent.docx");

    assert!(converter.extract_text_from_docx(&missing).is_err());

    let pdf_target = scratch.path().join("output.pdf");
    assert!(converter.convert_docx_to_pdf(&missing, &pdf_target).is_err());
}
/// Builds a document of 50 sections with 10 paragraphs each (plus a page
/// break after every tenth section) and checks that it converts to a PDF
/// larger than 50000 bytes and to at least two page images.
#[test]
fn test_large_document_conversion() -> Result<()> {
    // Setup errors are propagated with `?`, consistent with the handler
    // calls below.
    let temp_dir = TempDir::new()?;
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    for i in 0..50 {
        handler.add_heading(&doc_id, &format!("Section {}", i + 1), 1)?;
        for j in 0..10 {
            let content = format!("This is paragraph {} in section {}. It contains enough text to make the document substantial and test the conversion capabilities with larger files.", j + 1, i + 1);
            handler.add_paragraph(&doc_id, &content, None)?;
        }
        // Explicit page break after sections 10, 20, 30, 40 and 50.
        if i % 10 == 9 {
            handler.add_page_break(&doc_id)?;
        }
    }

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    // PDF conversion.
    let pdf_path = temp_dir.path().join("large_document.pdf");
    converter.convert_docx_to_pdf(&metadata.path, &pdf_path)?;
    assert!(pdf_path.exists());
    let pdf_size = fs::metadata(&pdf_path)?.len();
    assert!(pdf_size > 50000); // Should be a substantial PDF

    // Image conversion. Note that this converts the whole document; no page
    // limit is passed to the converter.
    let images_dir = temp_dir.path().join("large_images");
    fs::create_dir_all(&images_dir)?;
    let image_paths = converter.convert_docx_to_images(&metadata.path, &images_dir)?;

    // A multi-page document should produce more than one image (this also
    // covers the non-empty case).
    assert!(
        image_paths.len() >= 2,
        "expected at least 2 page images, got {}",
        image_paths.len()
    );
    Ok(())
}
/// Writes four known sentences as paragraphs and checks that extraction
/// returns each of them verbatim and that the total whitespace-separated
/// word count is within 5 of the expected count.
#[test]
fn test_text_extraction_accuracy() -> Result<()> {
    // Setup errors are propagated with `?`, consistent with the handler
    // calls below.
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    // Add specific test content.
    let test_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Pack my box with five dozen liquor jugs.",
        "How vexingly quick daft zebras jump!",
        "Sphinx of black quartz, judge my vow.",
    ];
    for sentence in &test_sentences {
        handler.add_paragraph(&doc_id, sentence, None)?;
    }

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();
    let extracted_text = converter.extract_text_from_docx(&metadata.path)?;

    // Verify all sentences are present in the extracted text.
    for sentence in &test_sentences {
        assert!(
            extracted_text.contains(sentence),
            "Extracted text should contain: '{}'",
            sentence
        );
    }

    // Check word count accuracy.
    let expected_words: usize = test_sentences
        .iter()
        .map(|s| s.split_whitespace().count())
        .sum();
    let extracted_words = extracted_text.split_whitespace().count();

    // Should be approximately equal (allowing for minor differences).
    // `abs_diff` gives the unsigned distance without risking underflow.
    let word_diff = extracted_words.abs_diff(expected_words);
    assert!(
        word_diff <= 5,
        "Word count difference too large: expected ~{}, got {}",
        expected_words,
        extracted_words
    );
    Ok(())
}
/// Converts the same one-paragraph document to three differently named PDFs
/// and checks that each output exists and exceeds 500 bytes.
///
/// NOTE(review): despite the name, the page-size label of each case is not
/// passed to the converter, so all three conversions run with identical
/// settings. Wire the label through once the converter accepts a page size.
#[test]
fn test_conversion_with_different_page_sizes() -> Result<()> {
    // Setup errors are propagated with `?`, consistent with the handler
    // calls below.
    let temp_dir = TempDir::new()?;
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;
    handler.add_paragraph(&doc_id, "This document tests page size handling during conversion.", None)?;

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    // (output file name, intended page size - currently unused)
    let test_cases = vec![
        ("a4.pdf", "A4"),
        ("letter.pdf", "Letter"),
        ("legal.pdf", "Legal"),
    ];
    for (filename, _page_size) in test_cases {
        let output_path = temp_dir.path().join(filename);
        // Note: In a full implementation, you'd pass page_size to the converter
        converter.convert_docx_to_pdf(&metadata.path, &output_path)?;
        assert!(output_path.exists());
        let file_size = fs::metadata(&output_path)?.len();
        assert!(file_size > 500); // Reasonable minimum size
    }
    Ok(())
}
// Parametrized over image formats: each case pairs the format name handed to
// the converter with the magic bytes every produced file must start with.
#[rstest]
#[case("png", &[0x89, 0x50, 0x4E, 0x47])]
#[case("jpeg", &[0xFF, 0xD8, 0xFF])]
fn test_image_format_conversion(#[case] format: &str, #[case] signature: &[u8]) -> Result<()> {
    let (docx, id, scratch) = setup_test_handler_with_content();
    let meta = docx.get_metadata(&id)?;
    let converter = PureRustConverter::new();

    // One output directory per format, e.g. "png_images".
    let dir_name = format!("{}_images", format);
    let images_dir = scratch.path().join(dir_name);
    fs::create_dir_all(&images_dir)?;

    let rendered =
        converter.convert_docx_to_images_with_format(&meta.path, &images_dir, format, 100)?;
    assert!(!rendered.is_empty());

    for page in &rendered {
        assert!(page.exists());

        let bytes = fs::read(page)?;
        assert!(
            bytes.starts_with(signature),
            "Image should have correct format signature for {}",
            format
        );
    }

    Ok(())
}
/// Runs three conversions on separate threads and checks that each one
/// writes its PDF into the shared output directory.
///
/// Each worker builds its own `DocxHandler` and `PureRustConverter`; the only
/// thing shared between threads is the output directory path.
#[test]
fn test_conversion_thread_safety() -> Result<()> {
    use std::sync::Arc;
    use std::thread;

    // Propagate setup errors with `?`, like the rest of the function.
    let temp_dir = TempDir::new()?;
    let temp_path = Arc::new(temp_dir.path().to_path_buf());

    let handles: Vec<_> = (0..3)
        .map(|i| {
            let temp_path = Arc::clone(&temp_path);
            thread::spawn(move || -> Result<()> {
                let mut handler = DocxHandler::new()?;
                let doc_id = handler.create_document()?;
                handler.add_paragraph(&doc_id, &format!("Thread {} test content", i), None)?;
                let metadata = handler.get_metadata(&doc_id)?;
                let converter = PureRustConverter::new();
                let pdf_path = temp_path.join(format!("thread_{}.pdf", i));
                converter.convert_docx_to_pdf(&metadata.path, &pdf_path)?;
                assert!(pdf_path.exists());
                Ok(())
            })
        })
        .collect();

    // Wait for all threads to complete. A join error means the worker
    // panicked (it cannot be converted with `?`), so re-raise it with
    // context; the worker's own `Result` is then propagated normally.
    for handle in handles {
        handle.join().expect("conversion worker thread panicked")?;
    }

    // Count the PDFs that were written. Entry-level I/O errors are
    // propagated instead of being silently skipped.
    let mut pdf_count = 0usize;
    for entry in fs::read_dir(temp_dir.path())? {
        let path = entry?.path();
        if path.extension().and_then(|s| s.to_str()) == Some("pdf") {
            pdf_count += 1;
        }
    }
    assert_eq!(pdf_count, 3);
    Ok(())
}