Initial Commit
This commit is contained in:
@@ -0,0 +1,497 @@
|
||||
use anyhow::Result;
|
||||
use docx_mcp::docx_handler::{DocxHandler, DocxStyle, TableData};
|
||||
use docx_mcp::pure_converter::PureRustConverter;
|
||||
use tempfile::TempDir;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::fs;
|
||||
use pretty_assertions::assert_eq;
|
||||
use rstest::*;
|
||||
|
||||
fn setup_test_handler_with_content() -> (DocxHandler, String, TempDir) {
|
||||
let temp_dir = TempDir::new().unwrap();
|
||||
let mut handler = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
|
||||
let doc_id = handler.create_document().unwrap();
|
||||
|
||||
// Add comprehensive content for testing
|
||||
handler.add_heading(&doc_id, "Test Document Title", 1).unwrap();
|
||||
handler.add_paragraph(&doc_id, "This is a comprehensive test document with various content types.", None).unwrap();
|
||||
|
||||
handler.add_heading(&doc_id, "Table Example", 2).unwrap();
|
||||
let table_data = TableData {
|
||||
rows: vec![
|
||||
vec!["Product".to_string(), "Price".to_string(), "Quantity".to_string()],
|
||||
vec!["Widget A".to_string(), "$10.00".to_string(), "5".to_string()],
|
||||
vec!["Widget B".to_string(), "$15.00".to_string(), "3".to_string()],
|
||||
],
|
||||
headers: Some(vec!["Product".to_string(), "Price".to_string(), "Quantity".to_string()]),
|
||||
border_style: Some("single".to_string()),
|
||||
};
|
||||
handler.add_table(&doc_id, table_data).unwrap();
|
||||
|
||||
handler.add_heading(&doc_id, "List Example", 2).unwrap();
|
||||
let list_items = vec![
|
||||
"First important point".to_string(),
|
||||
"Second key feature".to_string(),
|
||||
"Third critical aspect".to_string(),
|
||||
];
|
||||
handler.add_list(&doc_id, list_items, false).unwrap();
|
||||
|
||||
handler.add_paragraph(&doc_id, "Conclusion: This document demonstrates various formatting capabilities.", None).unwrap();
|
||||
|
||||
(handler, doc_id, temp_dir)
|
||||
}
|
||||
|
||||
/// Smoke test: constructing a `PureRustConverter` must not panic.
#[test]
fn test_pure_converter_creation() {
    // Reaching the end of the function is the pass condition. The leading
    // underscore marks the binding as intentionally unused.
    let _converter = PureRustConverter::new();
}
|
||||
|
||||
/// Extracts text from the populated fixture document and checks that a
/// fragment from each content type (heading, paragraph, table cell, list item)
/// appears in the result.
#[test]
fn test_extract_text_from_docx() -> Result<()> {
    let (handler, doc_id, _temp_dir) = setup_test_handler_with_content();
    let source = handler.get_metadata(&doc_id)?;

    let converter = PureRustConverter::new();
    let extracted = converter.extract_text_from_docx(&source.path)?;

    // One assertion per fragment, so a failure names the missing text.
    let has = |fragment: &str| extracted.contains(fragment);
    assert!(has("Test Document Title"));
    assert!(has("comprehensive test document"));
    assert!(has("Table Example"));
    assert!(has("Widget A"));
    assert!(has("First important point"));
    assert!(has("Conclusion"));

    Ok(())
}
|
||||
|
||||
/// A freshly created document with no added content must yield text that is
/// empty once surrounding whitespace is trimmed.
#[test]
fn test_extract_text_empty_document() -> Result<()> {
    let scratch = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(scratch.path()).unwrap();
    let blank_id = docs.create_document().unwrap();

    let blank = docs.get_metadata(&blank_id)?;
    let converter = PureRustConverter::new();
    let text = converter.extract_text_from_docx(&blank.path)?;

    // Whitespace-only output is accepted as "empty".
    assert!(text.trim().is_empty());

    Ok(())
}
|
||||
|
||||
/// Converts the populated fixture document to PDF and sanity-checks the
/// output: the file exists, is larger than 1000 bytes, and begins with the
/// `%PDF` signature.
#[test]
fn test_convert_docx_to_pdf_basic() -> Result<()> {
    let (handler, doc_id, temp_dir) = setup_test_handler_with_content();
    let source = handler.get_metadata(&doc_id)?;
    let pdf_file = temp_dir.path().join("test_output.pdf");

    let converter = PureRustConverter::new();
    converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

    // The converter must have produced a file on disk.
    assert!(pdf_file.exists());

    // A PDF carrying the fixture content should exceed 1 KB.
    let byte_len = fs::metadata(&pdf_file)?.len();
    assert!(byte_len > 1000);

    // The file must begin with the PDF magic bytes.
    let raw = fs::read(&pdf_file)?;
    assert!(raw.starts_with(b"%PDF"));

    Ok(())
}
|
||||
|
||||
/// Converts a document containing non-ASCII text, a fully styled paragraph
/// and headings at levels 1 through 3, then checks that the resulting PDF
/// exists and is larger than 2000 bytes.
#[test]
fn test_convert_docx_to_pdf_with_complex_content() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
    let doc_id = docs.create_document().unwrap();

    // Text mixing Latin diacritics, CJK, Cyrillic and Arabic scripts.
    docs.add_paragraph(&doc_id, "Special characters: éñüñ, 中文, русский, العربية", None)?;

    // A paragraph with every style field set explicitly.
    let emphasized = DocxStyle {
        font_family: Some(String::from("Arial")),
        font_size: Some(16),
        bold: Some(true),
        italic: Some(false),
        underline: Some(true),
        color: Some(String::from("#FF0000")),
        alignment: Some(String::from("center")),
        line_spacing: Some(1.5),
    };
    docs.add_paragraph(&doc_id, "Bold and underlined text", Some(emphasized))?;

    // One heading per level, 1 through 3.
    for level in 1..=3 {
        let title = format!("Heading Level {}", level);
        docs.add_heading(&doc_id, &title, level)?;
    }

    let source = docs.get_metadata(&doc_id)?;
    let pdf_file = temp_dir.path().join("complex_output.pdf");
    let converter = PureRustConverter::new();
    converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

    assert!(pdf_file.exists());
    // The extra content is expected to push the output past 2000 bytes.
    let byte_len = fs::metadata(&pdf_file)?.len();
    assert!(byte_len > 2000);

    Ok(())
}
|
||||
|
||||
/// Renders the fixture document to images with the default settings and
/// checks every returned path: the file exists, is larger than 100 bytes and,
/// when its extension is `png`, starts with the PNG signature.
#[test]
fn test_convert_docx_to_images() -> Result<()> {
    // First four bytes of a PNG file: 0x89 followed by ASCII "PNG".
    const PNG_MAGIC: [u8; 4] = [0x89, 0x50, 0x4E, 0x47];

    let (handler, doc_id, temp_dir) = setup_test_handler_with_content();
    let source = handler.get_metadata(&doc_id)?;

    let images_dir = temp_dir.path().join("images");
    fs::create_dir_all(&images_dir)?;

    let converter = PureRustConverter::new();
    let rendered_pages = converter.convert_docx_to_images(&source.path, &images_dir)?;

    // At least one image must come back.
    assert!(!rendered_pages.is_empty());

    for rendered in &rendered_pages {
        assert!(rendered.exists(), "Generated image should exist: {:?}", rendered);

        let byte_len = fs::metadata(rendered)?.len();
        assert!(byte_len > 100, "Image file should have reasonable size");

        // The signature is only checked for files that claim to be PNG.
        let is_png = rendered.extension().and_then(|ext| ext.to_str()) == Some("png");
        if is_png {
            let raw = fs::read(rendered)?;
            assert!(raw.starts_with(&PNG_MAGIC), "Should be valid PNG");
        }
    }

    Ok(())
}
|
||||
|
||||
/// Renders the fixture document with the explicit format `"jpeg"` and the
/// numeric argument `150`, then checks that every returned file exists and,
/// when its extension is `jpg` or `jpeg`, starts with the JPEG signature.
#[test]
fn test_convert_docx_to_images_custom_format() -> Result<()> {
    // Leading bytes shared by JPEG files.
    const JPEG_MAGIC: [u8; 3] = [0xFF, 0xD8, 0xFF];

    let (handler, doc_id, temp_dir) = setup_test_handler_with_content();
    let source = handler.get_metadata(&doc_id)?;

    let images_dir = temp_dir.path().join("jpeg_images");
    fs::create_dir_all(&images_dir)?;

    let converter = PureRustConverter::new();
    let rendered_pages =
        converter.convert_docx_to_images_with_format(&source.path, &images_dir, "jpeg", 150)?;

    assert!(!rendered_pages.is_empty());

    for rendered in &rendered_pages {
        assert!(rendered.exists());

        // The signature is only checked for files with a JPEG extension.
        let extension = rendered.extension().and_then(|ext| ext.to_str());
        if matches!(extension, Some("jpg") | Some("jpeg")) {
            let raw = fs::read(rendered)?;
            assert!(raw.starts_with(&JPEG_MAGIC), "Should be valid JPEG");
        }
    }

    Ok(())
}
|
||||
|
||||
/// Converts a document holding ASCII, accented Latin and mathematical-symbol
/// paragraphs to PDF, then checks that the output exists and is larger than
/// 5000 bytes.
#[test]
fn test_pdf_generation_with_embedded_fonts() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
    let doc_id = docs.create_document().unwrap();

    // Paragraphs chosen to exercise glyphs outside plain ASCII.
    let samples = [
        "Regular ASCII text",
        "Unicode: àáâãäå çèéêë ìíîï ñòóôõö ùúûü ýÿ",
        "Math symbols: ∑ ∏ ∫ √ ≤ ≥ ≠ ± ∞",
    ];
    for sample in samples {
        docs.add_paragraph(&doc_id, sample, None)?;
    }

    let source = docs.get_metadata(&doc_id)?;
    let pdf_file = temp_dir.path().join("embedded_fonts.pdf");
    let converter = PureRustConverter::new();
    converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

    assert!(pdf_file.exists());
    // The threshold assumes font data is embedded in the output.
    let byte_len = fs::metadata(&pdf_file)?.len();
    assert!(byte_len > 5000);

    Ok(())
}
|
||||
|
||||
/// Creates three single-paragraph documents, converts each to PDF inside a
/// dedicated output directory, and checks that the directory ends up holding
/// exactly three `.pdf` files.
#[test]
fn test_batch_conversion() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();

    // Build the source documents and remember where each one lives.
    let mut sources = Vec::new();
    for index in 0..3 {
        let doc_id = docs.create_document().unwrap();
        let body = format!("Document {} content", index);
        docs.add_paragraph(&doc_id, &body, None)?;

        let metadata = docs.get_metadata(&doc_id)?;
        sources.push(metadata.path);
    }

    let batch_dir = temp_dir.path().join("batch_output");
    fs::create_dir_all(&batch_dir)?;
    let converter = PureRustConverter::new();

    // Convert each document and confirm its PDF appears immediately.
    for (index, source) in sources.iter().enumerate() {
        let pdf_file = batch_dir.join(format!("document_{}.pdf", index));
        converter.convert_docx_to_pdf(source, &pdf_file)?;

        assert!(pdf_file.exists());
    }

    // Count the `.pdf` entries in the output directory.
    let pdf_total = fs::read_dir(&batch_dir)?
        .filter_map(Result::ok)
        .filter(|entry| entry.path().extension().and_then(|ext| ext.to_str()) == Some("pdf"))
        .count();

    assert_eq!(pdf_total, 3);

    Ok(())
}
|
||||
|
||||
/// A file with a `.docx` name but plain-text content must make both text
/// extraction and PDF conversion return an error.
#[test]
fn test_error_handling_invalid_docx() {
    let temp_dir = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // A plain-text file with a `.docx` extension.
    let bogus = temp_dir.path().join("fake.docx");
    fs::write(&bogus, "This is not a DOCX file").unwrap();

    // Text extraction must report failure.
    assert!(converter.extract_text_from_docx(&bogus).is_err());

    // PDF conversion must report failure as well.
    let pdf_file = temp_dir.path().join("output.pdf");
    assert!(converter.convert_docx_to_pdf(&bogus, &pdf_file).is_err());
}
|
||||
|
||||
/// Pointing the converter at a path that was never created must make both
/// text extraction and PDF conversion return an error.
#[test]
fn test_error_handling_nonexistent_file() {
    let temp_dir = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // This path is never written to.
    let missing = temp_dir.path().join("nonexistent.docx");

    assert!(converter.extract_text_from_docx(&missing).is_err());

    let pdf_file = temp_dir.path().join("output.pdf");
    assert!(converter.convert_docx_to_pdf(&missing, &pdf_file).is_err());
}
|
||||
|
||||
/// Builds a 50-section document (10 paragraphs per section, a page break
/// after every tenth section) and checks that PDF conversion yields more than
/// 50000 bytes and that image conversion yields at least two images.
#[test]
fn test_large_document_conversion() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
    let doc_id = docs.create_document().unwrap();

    // Sections and paragraphs are numbered from 1, matching the generated text.
    for section in 1..=50 {
        docs.add_heading(&doc_id, &format!("Section {}", section), 1)?;

        for paragraph in 1..=10 {
            let body = format!("This is paragraph {} in section {}. It contains enough text to make the document substantial and test the conversion capabilities with larger files.", paragraph, section);
            docs.add_paragraph(&doc_id, &body, None)?;
        }

        // Force a new page after sections 10, 20, 30, 40 and 50.
        if section % 10 == 0 {
            docs.add_page_break(&doc_id)?;
        }
    }

    let source = docs.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    // PDF conversion of the whole document.
    let pdf_file = temp_dir.path().join("large_document.pdf");
    converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

    assert!(pdf_file.exists());
    let pdf_len = fs::metadata(&pdf_file)?.len();
    assert!(pdf_len > 50000);

    // Image conversion of the same document.
    let images_dir = temp_dir.path().join("large_images");
    fs::create_dir_all(&images_dir)?;

    let rendered_pages = converter.convert_docx_to_images(&source.path, &images_dir)?;
    assert!(!rendered_pages.is_empty());

    // A multi-page document should yield more than one image.
    assert!(rendered_pages.len() >= 2);

    Ok(())
}
|
||||
|
||||
/// Writes four known sentences into a document, extracts the text, and checks
/// that every sentence is present and that the whitespace-separated word
/// count differs from the expected total by at most five.
#[test]
fn test_text_extraction_accuracy() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
    let doc_id = docs.create_document().unwrap();

    // Known input text, one paragraph per sentence.
    let pangrams = [
        "The quick brown fox jumps over the lazy dog.",
        "Pack my box with five dozen liquor jugs.",
        "How vexingly quick daft zebras jump!",
        "Sphinx of black quartz, judge my vow.",
    ];
    for pangram in &pangrams {
        docs.add_paragraph(&doc_id, pangram, None)?;
    }

    let source = docs.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();
    let extracted = converter.extract_text_from_docx(&source.path)?;

    // Every sentence must survive extraction verbatim.
    for pangram in &pangrams {
        assert!(
            extracted.contains(pangram),
            "Extracted text should contain: '{}'",
            pangram
        );
    }

    // Compare word counts, tolerating a small amount of drift.
    let expected_words: usize = pangrams
        .iter()
        .map(|pangram| pangram.split_whitespace().count())
        .sum();
    let extracted_words = extracted.split_whitespace().count();
    let drift = extracted_words.abs_diff(expected_words);
    assert!(
        drift <= 5,
        "Word count difference too large: expected ~{}, got {}",
        expected_words,
        extracted_words
    );

    Ok(())
}
|
||||
|
||||
/// Converts the same one-paragraph document to three differently named PDFs
/// and checks that each exists and is larger than 500 bytes.
///
/// The page-size label paired with each file name is not passed to the
/// converter, so every iteration runs the same conversion.
#[test]
fn test_conversion_with_different_page_sizes() -> Result<()> {
    let temp_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new_with_temp_dir(temp_dir.path()).unwrap();
    let doc_id = docs.create_document().unwrap();

    docs.add_paragraph(
        &doc_id,
        "This document tests page size handling during conversion.",
        None,
    )?;

    let source = docs.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    // (output file name, intended page size)
    let scenarios = [
        ("a4.pdf", "A4"),
        ("letter.pdf", "Letter"),
        ("legal.pdf", "Legal"),
    ];

    for (file_name, _page_size) in scenarios {
        let pdf_file = temp_dir.path().join(file_name);

        // A fuller implementation would hand `_page_size` to the converter here.
        converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

        assert!(pdf_file.exists());
        // Lower bound for a PDF holding a single paragraph.
        let byte_len = fs::metadata(&pdf_file)?.len();
        assert!(byte_len > 500);
    }

    Ok(())
}
|
||||
|
||||
// Parametrized test for different image formats
|
||||
#[rstest]
|
||||
#[case("png", &[0x89, 0x50, 0x4E, 0x47])]
|
||||
#[case("jpeg", &[0xFF, 0xD8, 0xFF])]
|
||||
fn test_image_format_conversion(#[case] format: &str, #[case] signature: &[u8]) -> Result<()> {
|
||||
let (handler, doc_id, temp_dir) = setup_test_handler_with_content();
|
||||
|
||||
let metadata = handler.get_metadata(&doc_id)?;
|
||||
let converter = PureRustConverter::new();
|
||||
|
||||
let output_dir = temp_dir.path().join(format!("{}_images", format));
|
||||
fs::create_dir_all(&output_dir)?;
|
||||
|
||||
let image_paths = converter.convert_docx_to_images_with_format(&metadata.path, &output_dir, format, 100)?;
|
||||
|
||||
assert!(!image_paths.is_empty());
|
||||
|
||||
for image_path in &image_paths {
|
||||
assert!(image_path.exists());
|
||||
|
||||
let image_content = fs::read(image_path)?;
|
||||
assert!(image_content.starts_with(signature),
|
||||
"Image should have correct format signature for {}", format);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Runs three conversions on separate threads. Each thread creates its own
/// handler and converter, all rooted in one shared temporary directory. The
/// test then checks that the directory holds exactly three `.pdf` files.
#[test]
fn test_conversion_thread_safety() -> Result<()> {
    use std::sync::Arc;
    use std::thread;

    let temp_dir = TempDir::new().unwrap();
    let shared_dir = Arc::new(temp_dir.path().to_path_buf());

    // Spawn all workers before joining any of them.
    let mut workers = Vec::with_capacity(3);
    for worker_id in 0..3 {
        let shared_dir = Arc::clone(&shared_dir);
        let worker = thread::spawn(move || -> Result<()> {
            let mut docs = DocxHandler::new_with_temp_dir(&shared_dir)?;
            let doc_id = docs.create_document()?;

            let body = format!("Thread {} test content", worker_id);
            docs.add_paragraph(&doc_id, &body, None)?;

            let source = docs.get_metadata(&doc_id)?;
            let converter = PureRustConverter::new();

            let pdf_file = shared_dir.join(format!("thread_{}.pdf", worker_id));
            converter.convert_docx_to_pdf(&source.path, &pdf_file)?;

            assert!(pdf_file.exists());
            Ok(())
        });
        workers.push(worker);
    }

    // A panicking worker fails the test via `unwrap`; a worker error is propagated via `?`.
    for worker in workers {
        worker.join().unwrap()?;
    }

    // Count the PDFs written directly into the shared directory.
    let pdf_total = fs::read_dir(temp_dir.path())?
        .filter_map(Result::ok)
        .filter(|entry| entry.path().extension().and_then(|ext| ext.to_str()) == Some("pdf"))
        .count();

    assert_eq!(pdf_total, 3);

    Ok(())
}
|
||||
Reference in New Issue
Block a user