// mcp-docx/tests/converter_tests.rs

use anyhow::Result;
use docx_mcp::docx_handler::{DocxHandler, DocxStyle, TableData};
use docx_mcp::pure_converter::PureRustConverter;
use tempfile::TempDir;
use std::path::{Path, PathBuf};
use std::fs;
use pretty_assertions::assert_eq;
use rstest::*;

/// Creates a document pre-filled with headings, paragraphs, a table, and a
/// list so the converter tests have varied content to work with.
///
/// Returns the handler owning the document, the new document's id, and a
/// `TempDir` that tests use as a scratch location for converter output.
///
/// Panics if any step of the setup fails.
fn setup_test_handler_with_content() -> (DocxHandler, String, TempDir) {
    /// Turns a row of string literals into the owned cells `TableData` stores.
    fn to_row(cells: &[&str]) -> Vec<String> {
        cells.iter().map(|cell| cell.to_string()).collect()
    }

    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();
    let id = docs.create_document().unwrap();

    // Title and introduction.
    docs.add_heading(&id, "Test Document Title", 1).unwrap();
    docs.add_paragraph(
        &id,
        "This is a comprehensive test document with various content types.",
        None,
    )
    .unwrap();

    // Table section: the header names are also repeated as the first row.
    docs.add_heading(&id, "Table Example", 2).unwrap();
    let column_names = ["Product", "Price", "Quantity"];
    let table = TableData {
        rows: vec![
            to_row(&column_names),
            to_row(&["Widget A", "$10.00", "5"]),
            to_row(&["Widget B", "$15.00", "3"]),
        ],
        headers: Some(to_row(&column_names)),
        border_style: Some("single".to_string()),
    };
    docs.add_table(&id, table).unwrap();

    // List section.
    docs.add_heading(&id, "List Example", 2).unwrap();
    let bullet_points = to_row(&[
        "First important point",
        "Second key feature",
        "Third critical aspect",
    ]);
    docs.add_list(&id, bullet_points, false).unwrap();

    // Closing paragraph.
    docs.add_paragraph(
        &id,
        "Conclusion: This document demonstrates various formatting capabilities.",
        None,
    )
    .unwrap();

    (docs, id, scratch_dir)
}

/// Smoke test: constructing a `PureRustConverter` must not panic.
#[test]
fn test_pure_converter_creation() {
    // Construction is the whole test. The underscore prefix marks the binding
    // as intentionally unused; no further assertion is needed because a panic
    // in `new()` already fails the test.
    let _converter = PureRustConverter::new();
}

/// Text extraction should surface every kind of content written by
/// `setup_test_handler_with_content`: headings, paragraphs, table cells,
/// and list items.
#[test]
fn test_extract_text_from_docx() -> Result<()> {
    let (docs, id, _scratch_dir) = setup_test_handler_with_content();

    let source = docs.get_metadata(&id)?;
    let pdf_free_converter = PureRustConverter::new();
    let text = pdf_free_converter.extract_text_from_docx(&source.path)?;

    // Shorthand so each expectation below reads as a single fragment.
    let has = |needle: &str| text.contains(needle);

    assert!(has("Test Document Title"));
    assert!(has("comprehensive test document"));
    assert!(has("Table Example"));
    assert!(has("Widget A"));
    assert!(has("First important point"));
    assert!(has("Conclusion"));

    Ok(())
}

/// A freshly created document with no added content should yield no text
/// (whitespace aside).
///
/// Errors from the handler or converter are propagated with `?` so the test
/// fails with the underlying error instead of an `unwrap` panic.
#[test]
fn test_extract_text_empty_document() -> Result<()> {
    // No scratch directory is created here: this test writes no output files.
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    let extracted_text = converter.extract_text_from_docx(&metadata.path)?;

    // Empty document should return empty or whitespace-only text.
    assert!(
        extracted_text.trim().is_empty(),
        "Expected no text from an empty document, got: {:?}",
        extracted_text
    );

    Ok(())
}

/// Converting the standard fixture document should produce a file on disk
/// that is over 1000 bytes and begins with the `%PDF` signature.
#[test]
fn test_convert_docx_to_pdf_basic() -> Result<()> {
    let (docs, id, scratch_dir) = setup_test_handler_with_content();

    let source = docs.get_metadata(&id)?;
    let pdf_writer = PureRustConverter::new();

    let pdf_path = scratch_dir.path().join("test_output.pdf");
    pdf_writer.convert_docx_to_pdf(&source.path, &pdf_path)?;

    // The output file must exist.
    assert!(pdf_path.exists());

    // It must be more than a trivially small file (over 1000 bytes).
    let byte_len = fs::metadata(&pdf_path)?.len();
    assert!(byte_len > 1000);

    // It must begin with the PDF signature.
    let bytes = fs::read(&pdf_path)?;
    assert!(bytes.starts_with(b"%PDF"));

    Ok(())
}

/// PDF conversion should succeed for a document containing non-ASCII text,
/// an explicitly styled paragraph, and headings at levels 1 through 3.
#[test]
fn test_convert_docx_to_pdf_with_complex_content() -> Result<()> {
    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();
    let id = docs.create_document().unwrap();

    // Text mixing accented Latin, CJK, Cyrillic, and Arabic characters.
    docs.add_paragraph(&id, "Special characters: éñüñ, 中文, русский, العربية", None)?;

    // A paragraph with every style field set explicitly.
    let emphasis = DocxStyle {
        font_family: Some("Arial".to_string()),
        font_size: Some(16),
        bold: Some(true),
        italic: Some(false),
        underline: Some(true),
        color: Some("#FF0000".to_string()),
        alignment: Some("center".to_string()),
        line_spacing: Some(1.5),
    };
    docs.add_paragraph(&id, "Bold and underlined text", Some(emphasis))?;

    // One heading per level, 1 through 3.
    for depth in 1..=3 {
        let title = format!("Heading Level {}", depth);
        docs.add_heading(&id, &title, depth)?;
    }

    let source = docs.get_metadata(&id)?;
    let pdf_writer = PureRustConverter::new();

    let pdf_path = scratch_dir.path().join("complex_output.pdf");
    pdf_writer.convert_docx_to_pdf(&source.path, &pdf_path)?;

    assert!(pdf_path.exists());

    // Sanity floor only: the output must be more than 500 bytes.
    let byte_len = fs::metadata(&pdf_path)?.len();
    assert!(byte_len > 500);

    Ok(())
}

/// Image conversion should produce at least one image; every reported path
/// must exist and be larger than 100 bytes. Files with a `.png` extension are
/// additionally checked for the PNG signature.
#[test]
fn test_convert_docx_to_images() -> Result<()> {
    // First four bytes of the PNG signature: 0x89 followed by "PNG".
    const PNG_MAGIC: [u8; 4] = [0x89, b'P', b'N', b'G'];

    let (docs, id, scratch_dir) = setup_test_handler_with_content();

    let source = docs.get_metadata(&id)?;
    let renderer = PureRustConverter::new();

    let images_dir = scratch_dir.path().join("images");
    fs::create_dir_all(&images_dir)?;

    let pages = renderer.convert_docx_to_images(&source.path, &images_dir)?;

    // At least one image must be produced.
    assert!(!pages.is_empty());

    for page in pages.iter() {
        assert!(page.exists(), "Generated image should exist: {:?}", page);

        let byte_len = fs::metadata(page)?.len();
        assert!(byte_len > 100, "Image file should have reasonable size");

        // The signature check applies only to files named `*.png`.
        let is_png = page.extension().and_then(|ext| ext.to_str()) == Some("png");
        if is_png {
            let bytes = fs::read(page)?;
            assert!(bytes.starts_with(&PNG_MAGIC), "Should be valid PNG");
        }
    }

    Ok(())
}

/// Requesting `"jpeg"` output (with a numeric setting of 150) should produce
/// at least one existing file. Files with a `.jpg`/`.jpeg` extension are
/// checked for the JPEG start-of-image bytes.
#[test]
fn test_convert_docx_to_images_custom_format() -> Result<()> {
    // Leading bytes every JPEG file starts with.
    const JPEG_MAGIC: [u8; 3] = [0xFF, 0xD8, 0xFF];

    let (docs, id, scratch_dir) = setup_test_handler_with_content();

    let source = docs.get_metadata(&id)?;
    let renderer = PureRustConverter::new();

    let jpeg_dir = scratch_dir.path().join("jpeg_images");
    fs::create_dir_all(&jpeg_dir)?;

    let pages =
        renderer.convert_docx_to_images_with_format(&source.path, &jpeg_dir, "jpeg", 150)?;

    assert!(!pages.is_empty());

    for page in pages.iter() {
        assert!(page.exists());

        // The signature check applies only to files named `*.jpg` / `*.jpeg`.
        let extension = page.extension().and_then(|ext| ext.to_str());
        if matches!(extension, Some("jpg") | Some("jpeg")) {
            let bytes = fs::read(page)?;
            assert!(bytes.starts_with(&JPEG_MAGIC), "Should be valid JPEG");
        }
    }

    Ok(())
}

/// PDF conversion should succeed for text spanning plain ASCII, accented
/// Latin characters, and mathematical symbols, and produce a file over
/// 1000 bytes.
#[test]
fn test_pdf_generation_with_embedded_fonts() -> Result<()> {
    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();
    let id = docs.create_document().unwrap();

    // Paragraphs that may need glyphs from different fonts.
    let samples = [
        "Regular ASCII text",
        "Unicode: àáâãäå çèéêë ìíîï ñòóôõö ùúûü ýÿ",
        "Math symbols: ∑ ∏ ∫ √ ≤ ≥ ≠ ± ∞",
    ];
    for &sample in samples.iter() {
        docs.add_paragraph(&id, sample, None)?;
    }

    let source = docs.get_metadata(&id)?;
    let pdf_writer = PureRustConverter::new();

    let pdf_path = scratch_dir.path().join("embedded_fonts.pdf");
    pdf_writer.convert_docx_to_pdf(&source.path, &pdf_path)?;

    assert!(pdf_path.exists());

    // Size floor: the output must exceed 1000 bytes.
    let byte_len = fs::metadata(&pdf_path)?.len();
    assert!(byte_len > 1000);

    Ok(())
}

/// Converting three separately created documents one after another should
/// leave exactly three `.pdf` files in the output directory.
#[test]
fn test_batch_conversion() -> Result<()> {
    const DOC_COUNT: usize = 3;

    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();

    // Create the source documents and remember where each one lives.
    let mut sources = Vec::with_capacity(DOC_COUNT);
    for index in 0..DOC_COUNT {
        let id = docs.create_document().unwrap();
        let body = format!("Document {} content", index);
        docs.add_paragraph(&id, &body, None)?;

        sources.push(docs.get_metadata(&id)?.path);
    }

    let pdf_writer = PureRustConverter::new();
    let batch_dir = scratch_dir.path().join("batch_output");
    fs::create_dir_all(&batch_dir)?;

    // Convert each document into its own numbered PDF.
    for (index, source) in sources.iter().enumerate() {
        let pdf_path = batch_dir.join(format!("document_{}.pdf", index));
        pdf_writer.convert_docx_to_pdf(source, &pdf_path)?;

        assert!(pdf_path.exists());
    }

    // Count the `.pdf` entries actually present in the output directory.
    let has_pdf_extension = |entry: &fs::DirEntry| {
        entry.path().extension().and_then(|ext| ext.to_str()) == Some("pdf")
    };
    let pdf_total = fs::read_dir(&batch_dir)?
        .filter_map(|entry| entry.ok())
        .filter(has_pdf_extension)
        .count();

    assert_eq!(pdf_total, DOC_COUNT);

    Ok(())
}

/// A plain-text file with a `.docx` extension must be rejected with an error
/// by both text extraction and PDF conversion.
#[test]
fn test_error_handling_invalid_docx() {
    let scratch_dir = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // Write a file that is named like a DOCX but contains only plain text.
    let bogus_docx = scratch_dir.path().join("fake.docx");
    fs::write(&bogus_docx, "This is not a DOCX file").unwrap();

    // Text extraction must report an error.
    let extraction = converter.extract_text_from_docx(&bogus_docx);
    assert!(extraction.is_err());

    // PDF conversion must report an error as well.
    let pdf_path = scratch_dir.path().join("output.pdf");
    let conversion = converter.convert_docx_to_pdf(&bogus_docx, &pdf_path);
    assert!(conversion.is_err());
}

/// A path that does not exist must produce an error from both text
/// extraction and PDF conversion.
#[test]
fn test_error_handling_nonexistent_file() {
    let scratch_dir = TempDir::new().unwrap();
    let converter = PureRustConverter::new();

    // This path is never created.
    let missing_docx = scratch_dir.path().join("nonexistent.docx");

    let extraction = converter.extract_text_from_docx(&missing_docx);
    assert!(extraction.is_err());

    let pdf_path = scratch_dir.path().join("output.pdf");
    let conversion = converter.convert_docx_to_pdf(&missing_docx, &pdf_path);
    assert!(conversion.is_err());
}

/// Builds a 50-section document (10 paragraphs per section, with a page break
/// after every 10th section) and checks that both PDF and image conversion
/// cope with it.
#[test]
fn test_large_document_conversion() -> Result<()> {
    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();
    let id = docs.create_document().unwrap();

    // Sections and paragraphs are numbered from 1.
    for section in 1..=50 {
        docs.add_heading(&id, &format!("Section {}", section), 1)?;

        for paragraph in 1..=10 {
            let body = format!("This is paragraph {} in section {}. It contains enough text to make the document substantial and test the conversion capabilities with larger files.", paragraph, section);
            docs.add_paragraph(&id, &body, None)?;
        }

        // Explicit page break after sections 10, 20, 30, 40, and 50.
        if section % 10 == 0 {
            docs.add_page_break(&id)?;
        }
    }

    let source = docs.get_metadata(&id)?;
    let converter = PureRustConverter::new();

    // PDF conversion: the output must exist and exceed 50000 bytes.
    let pdf_path = scratch_dir.path().join("large_document.pdf");
    converter.convert_docx_to_pdf(&source.path, &pdf_path)?;

    assert!(pdf_path.exists());
    let pdf_len = fs::metadata(&pdf_path)?.len();
    assert!(pdf_len > 50000);

    // Image conversion is run on the same full document.
    let images_dir = scratch_dir.path().join("large_images");
    fs::create_dir_all(&images_dir)?;

    let pages = converter.convert_docx_to_images(&source.path, &images_dir)?;
    assert!(!pages.is_empty());

    // A document this long is expected to yield at least two images.
    assert!(pages.len() >= 2);

    Ok(())
}

/// Extracted text should contain every sentence written to the document, and
/// its whitespace-separated word count should be within 5 words of the
/// combined word count of those sentences.
///
/// Errors from the handler or converter are propagated with `?` so the test
/// fails with the underlying error instead of an `unwrap` panic.
#[test]
fn test_text_extraction_accuracy() -> Result<()> {
    // No scratch directory is created here: this test writes no output files.
    let mut handler = DocxHandler::new()?;
    let doc_id = handler.create_document()?;

    // Add specific test content, one paragraph per sentence.
    let test_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "Pack my box with five dozen liquor jugs.",
        "How vexingly quick daft zebras jump!",
        "Sphinx of black quartz, judge my vow.",
    ];

    for &sentence in &test_sentences {
        handler.add_paragraph(&doc_id, sentence, None)?;
    }

    let metadata = handler.get_metadata(&doc_id)?;
    let converter = PureRustConverter::new();

    let extracted_text = converter.extract_text_from_docx(&metadata.path)?;

    // Verify all sentences are present in the extracted text.
    for &sentence in &test_sentences {
        assert!(
            extracted_text.contains(sentence),
            "Extracted text should contain: '{}'",
            sentence
        );
    }

    // Check word count accuracy.
    let expected_words: usize = test_sentences
        .iter()
        .map(|s| s.split_whitespace().count())
        .sum();
    let extracted_words = extracted_text.split_whitespace().count();

    // Should be approximately equal (allowing for minor differences).
    // `abs_diff` gives the unsigned distance without risking underflow.
    let word_diff = extracted_words.abs_diff(expected_words);
    assert!(
        word_diff <= 5,
        "Word count difference too large: expected ~{}, got {}",
        expected_words,
        extracted_words
    );

    Ok(())
}

/// Converts the same one-paragraph document to three differently named PDFs.
///
/// The page-size labels are carried alongside each filename but are not
/// passed to the converter, so every case exercises the same conversion.
#[test]
fn test_conversion_with_different_page_sizes() -> Result<()> {
    let scratch_dir = TempDir::new().unwrap();
    let mut docs = DocxHandler::new().unwrap();
    let id = docs.create_document().unwrap();

    docs.add_paragraph(
        &id,
        "This document tests page size handling during conversion.",
        None,
    )?;

    let source = docs.get_metadata(&id)?;
    let pdf_writer = PureRustConverter::new();

    // (output filename, page-size label) pairs.
    let cases = [("a4.pdf", "A4"), ("letter.pdf", "Letter"), ("legal.pdf", "Legal")];

    for &(file_name, _page_size) in cases.iter() {
        let pdf_path = scratch_dir.path().join(file_name);

        // Note: In a full implementation, you'd pass page_size to the converter
        pdf_writer.convert_docx_to_pdf(&source.path, &pdf_path)?;

        assert!(pdf_path.exists());

        // Sanity floor: the output must exceed 500 bytes.
        let byte_len = fs::metadata(&pdf_path)?.len();
        assert!(byte_len > 500);
    }

    Ok(())
}

/// Parametrized over image formats: for each case, every generated file must
/// exist and begin with the byte signature given for that format.
#[rstest]
#[case("png", &[0x89, 0x50, 0x4E, 0x47])]
#[case("jpeg", &[0xFF, 0xD8, 0xFF])]
fn test_image_format_conversion(#[case] format: &str, #[case] signature: &[u8]) -> Result<()> {
    let (docs, id, scratch_dir) = setup_test_handler_with_content();

    let source = docs.get_metadata(&id)?;
    let renderer = PureRustConverter::new();

    // Each format gets its own output directory, e.g. `png_images`.
    let images_dir = scratch_dir.path().join(format!("{}_images", format));
    fs::create_dir_all(&images_dir)?;

    let pages =
        renderer.convert_docx_to_images_with_format(&source.path, &images_dir, format, 100)?;

    assert!(!pages.is_empty());

    for page in pages.iter() {
        assert!(page.exists());

        // Unlike the extension-gated checks elsewhere, every file is checked.
        let bytes = fs::read(page)?;
        assert!(
            bytes.starts_with(signature),
            "Image should have correct format signature for {}",
            format
        );
    }

    Ok(())
}

/// Runs three conversions on separate threads, each with its own handler and
/// converter, writing into one shared directory; afterwards exactly three
/// `.pdf` files must be present there.
#[test]
fn test_conversion_thread_safety() -> Result<()> {
    use std::sync::Arc;
    use std::thread;

    let scratch_dir = TempDir::new().unwrap();
    let shared_dir = Arc::new(scratch_dir.path().to_path_buf());

    // Start every worker before joining any of them.
    let mut workers = Vec::new();
    for worker_id in 0..3 {
        let out_dir = Arc::clone(&shared_dir);
        let worker = thread::spawn(move || -> Result<()> {
            let mut docs = DocxHandler::new()?;
            let id = docs.create_document()?;

            let body = format!("Thread {} test content", worker_id);
            docs.add_paragraph(&id, &body, None)?;

            let source = docs.get_metadata(&id)?;
            let pdf_writer = PureRustConverter::new();

            let pdf_path = out_dir.join(format!("thread_{}.pdf", worker_id));
            pdf_writer.convert_docx_to_pdf(&source.path, &pdf_path)?;

            assert!(pdf_path.exists());
            Ok(())
        });
        workers.push(worker);
    }

    // A panic in a worker surfaces via `unwrap`; a returned error via `?`.
    for worker in workers {
        worker.join().unwrap()?;
    }

    // Count the `.pdf` entries left in the shared directory.
    let is_pdf = |entry: &fs::DirEntry| {
        entry.path().extension().and_then(|ext| ext.to_str()) == Some("pdf")
    };
    let pdf_total = fs::read_dir(scratch_dir.path())?
        .filter_map(|entry| entry.ok())
        .filter(is_pdf)
        .count();

    assert_eq!(pdf_total, 3);

    Ok(())
}