Initial Commit
This commit is contained in:
@@ -0,0 +1,423 @@
|
||||
use anyhow::{Context, Result};
|
||||
use image::{DynamicImage, ImageFormat, Rgba, RgbaImage};
|
||||
use printpdf::*;
|
||||
use std::fs::{self, File};
|
||||
use std::io::{BufReader, BufWriter, Read, Write};
|
||||
use std::path::{Path, PathBuf};
|
||||
use tempfile::NamedTempFile;
|
||||
use tracing::{debug, info, warn};
|
||||
use roxmltree;
|
||||
use zip::ZipArchive;
|
||||
use rusttype::{Font, Scale};
|
||||
use lopdf;
|
||||
|
||||
pub struct PureRustConverter;
|
||||
|
||||
impl PureRustConverter {
|
||||
pub fn new() -> Self {
|
||||
Self
|
||||
}
|
||||
|
||||
/// Extract text from DOCX using pure Rust XML parsing
|
||||
pub fn extract_text_from_docx(&self, docx_path: &Path) -> Result<String> {
|
||||
let file = File::open(docx_path)?;
|
||||
let mut archive = ZipArchive::new(file)?;
|
||||
|
||||
// Find the main document XML
|
||||
let mut document_xml = String::new();
|
||||
|
||||
for i in 0..archive.len() {
|
||||
let mut file = archive.by_index(i)?;
|
||||
let name = file.name().to_string();
|
||||
|
||||
if name == "word/document.xml" {
|
||||
file.read_to_string(&mut document_xml)?;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if document_xml.is_empty() {
|
||||
anyhow::bail!("No document.xml found in DOCX file");
|
||||
}
|
||||
|
||||
// Parse XML and extract text
|
||||
let doc = roxmltree::Document::parse(&document_xml)?;
|
||||
let mut text = String::new();
|
||||
|
||||
// Extract text from all w:t elements
|
||||
for node in doc.descendants() {
|
||||
if node.tag_name().name() == "t" {
|
||||
if let Some(node_text) = node.text() {
|
||||
text.push_str(node_text);
|
||||
text.push(' ');
|
||||
}
|
||||
}
|
||||
// Handle line breaks
|
||||
if node.tag_name().name() == "br" || node.tag_name().name() == "p" {
|
||||
text.push('\n');
|
||||
}
|
||||
}
|
||||
|
||||
Ok(text.trim().to_string())
|
||||
}
|
||||
|
||||
/// Convert DOCX to PDF using pure Rust (no external dependencies)
|
||||
pub fn docx_to_pdf_pure(&self, docx_path: &Path, pdf_path: &Path) -> Result<()> {
|
||||
// Extract text from DOCX
|
||||
let text = self.extract_text_from_docx(docx_path)
|
||||
.with_context(|| format!("Failed to extract text from {:?}", docx_path))?;
|
||||
|
||||
// Create PDF with extracted text
|
||||
self.create_pdf_from_text(&text, pdf_path)?;
|
||||
|
||||
info!("Successfully converted DOCX to PDF using pure Rust");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Create a PDF from text content
|
||||
pub fn create_pdf_from_text(&self, text: &str, pdf_path: &Path) -> Result<()> {
|
||||
let (doc, page1, layer1) = PdfDocument::new("Document", Mm(210.0), Mm(297.0), "Layer 1");
|
||||
let current_layer = doc.get_page(page1).get_layer(layer1);
|
||||
|
||||
// Use embedded font or built-in font
|
||||
let font = doc.add_builtin_font(BuiltinFont::Helvetica)?;
|
||||
|
||||
// Configure text layout
|
||||
let font_size = 11.0;
|
||||
let line_height = Mm(5.0);
|
||||
let margin_left = Mm(20.0);
|
||||
let margin_top = Mm(280.0);
|
||||
let margin_bottom = Mm(20.0);
|
||||
let page_width = Mm(210.0);
|
||||
let page_height = Mm(297.0);
|
||||
let text_width = page_width - (margin_left * 2.0);
|
||||
|
||||
let lines: Vec<&str> = text.lines().collect();
|
||||
let mut current_page = page1;
|
||||
let mut current_layer = layer1;
|
||||
let mut y_position = margin_top;
|
||||
|
||||
for line in lines {
|
||||
// Check if we need a new page
|
||||
if y_position < margin_bottom {
|
||||
let (new_page, new_layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
|
||||
current_page = new_page;
|
||||
current_layer = new_layer;
|
||||
y_position = margin_top;
|
||||
}
|
||||
|
||||
// Word wrap if line is too long
|
||||
let words: Vec<&str> = line.split_whitespace().collect();
|
||||
let mut current_line = String::new();
|
||||
let max_chars_per_line = 80; // Approximate
|
||||
|
||||
for word in words {
|
||||
if current_line.len() + word.len() + 1 > max_chars_per_line {
|
||||
// Write current line
|
||||
if !current_line.is_empty() {
|
||||
doc.get_page(current_page)
|
||||
.get_layer(current_layer)
|
||||
.use_text(¤t_line, font_size, margin_left, y_position, &font);
|
||||
y_position -= line_height;
|
||||
current_line.clear();
|
||||
|
||||
// Check for new page
|
||||
if y_position < margin_bottom {
|
||||
let (new_page, new_layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
|
||||
current_page = new_page;
|
||||
current_layer = new_layer;
|
||||
y_position = margin_top;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !current_line.is_empty() {
|
||||
current_line.push(' ');
|
||||
}
|
||||
current_line.push_str(word);
|
||||
}
|
||||
|
||||
// Write remaining text in line
|
||||
if !current_line.is_empty() {
|
||||
doc.get_page(current_page)
|
||||
.get_layer(current_layer)
|
||||
.use_text(¤t_line, font_size, margin_left, y_position, &font);
|
||||
y_position -= line_height;
|
||||
}
|
||||
}
|
||||
|
||||
// Save PDF
|
||||
doc.save(&mut BufWriter::new(File::create(pdf_path)?))?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Convert PDF to images using pure Rust
|
||||
pub fn pdf_to_images_pure(
|
||||
&self,
|
||||
pdf_path: &Path,
|
||||
output_dir: &Path,
|
||||
format: ImageFormat,
|
||||
) -> Result<Vec<PathBuf>> {
|
||||
// Parse PDF
|
||||
let doc = lopdf::Document::load(pdf_path)?;
|
||||
let pages = doc.get_pages();
|
||||
|
||||
fs::create_dir_all(output_dir)?;
|
||||
let mut output_paths = Vec::new();
|
||||
|
||||
// For each page, render to image
|
||||
for (page_num, (_page_num, _page_id)) in pages.iter().enumerate() {
|
||||
// Create a blank image for the page
|
||||
// In a real implementation, you would render the PDF content
|
||||
let img = self.render_pdf_page_to_image(&doc, page_num)?;
|
||||
|
||||
// Save image
|
||||
let extension = match format {
|
||||
ImageFormat::Png => "png",
|
||||
ImageFormat::Jpeg => "jpg",
|
||||
_ => "png",
|
||||
};
|
||||
|
||||
let output_path = output_dir.join(format!("page_{:03}.{}", page_num + 1, extension));
|
||||
img.save_with_format(&output_path, format)?;
|
||||
output_paths.push(output_path);
|
||||
}
|
||||
|
||||
Ok(output_paths)
|
||||
}
|
||||
|
||||
/// Render a PDF page to image (simplified implementation)
|
||||
fn render_pdf_page_to_image(&self, _doc: &lopdf::Document, _page_num: usize) -> Result<DynamicImage> {
|
||||
// This is a simplified implementation
|
||||
// A full implementation would parse PDF content and render it
|
||||
|
||||
// Create a white image as placeholder
|
||||
let width = 1240; // A4 at 150 DPI
|
||||
let height = 1754; // A4 at 150 DPI
|
||||
|
||||
let mut img = RgbaImage::new(width, height);
|
||||
|
||||
// Fill with white background
|
||||
for pixel in img.pixels_mut() {
|
||||
*pixel = Rgba([255, 255, 255, 255]);
|
||||
}
|
||||
|
||||
// Add a simple text indicator
|
||||
// In production, you would properly render PDF content
|
||||
|
||||
Ok(DynamicImage::ImageRgba8(img))
|
||||
}
|
||||
|
||||
/// Convert DOCX to images using pure Rust
|
||||
pub fn docx_to_images_pure(
|
||||
&self,
|
||||
docx_path: &Path,
|
||||
output_dir: &Path,
|
||||
format: ImageFormat,
|
||||
) -> Result<Vec<PathBuf>> {
|
||||
// First convert to PDF
|
||||
let temp_pdf = NamedTempFile::new()?.into_temp_path();
|
||||
self.docx_to_pdf_pure(docx_path, &temp_pdf)?;
|
||||
|
||||
// Then convert PDF to images
|
||||
self.pdf_to_images_pure(&temp_pdf, output_dir, format)
|
||||
}
|
||||
|
||||
/// Create a thumbnail from an image
|
||||
pub fn create_thumbnail(
|
||||
&self,
|
||||
image_path: &Path,
|
||||
output_path: &Path,
|
||||
width: u32,
|
||||
height: u32,
|
||||
) -> Result<()> {
|
||||
let img = image::open(image_path)
|
||||
.with_context(|| format!("Failed to open image {:?}", image_path))?;
|
||||
|
||||
let thumbnail = img.thumbnail(width, height);
|
||||
thumbnail.save(output_path)
|
||||
.with_context(|| format!("Failed to save thumbnail to {:?}", output_path))?;
|
||||
|
||||
info!("Created thumbnail {}x{} at {:?}", width, height, output_path);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merge multiple PDFs using pure Rust
|
||||
pub fn merge_pdfs_pure(&self, pdf_paths: &[PathBuf], output_path: &Path) -> Result<()> {
|
||||
use lopdf::{Document, Object, ObjectId};
|
||||
|
||||
// Create a new document for merging
|
||||
let mut merged_doc = Document::with_version("1.5");
|
||||
|
||||
// Track page tree
|
||||
let mut all_pages = Vec::new();
|
||||
|
||||
for pdf_path in pdf_paths {
|
||||
let doc = Document::load(pdf_path)?;
|
||||
|
||||
// Get pages from the document
|
||||
let pages = doc.get_pages();
|
||||
|
||||
for (_page_num, page_id) in pages.iter() {
|
||||
// Clone the page object
|
||||
if let Ok(page_obj) = doc.get_object(*page_id) {
|
||||
let new_id = merged_doc.new_object_id();
|
||||
merged_doc.objects.insert(new_id, page_obj.clone());
|
||||
all_pages.push(new_id);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Build the page tree for merged document
|
||||
let pages_id = merged_doc.new_object_id();
|
||||
let pages_dict = lopdf::dictionary! {
|
||||
"Type" => "Pages",
|
||||
"Kids" => all_pages.iter().map(|id| Object::Reference(*id)).collect::<Vec<_>>(),
|
||||
"Count" => all_pages.len() as i32,
|
||||
};
|
||||
merged_doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
// Update catalog
|
||||
let catalog_id = merged_doc.new_object_id();
|
||||
let catalog = lopdf::dictionary! {
|
||||
"Type" => "Catalog",
|
||||
"Pages" => Object::Reference(pages_id),
|
||||
};
|
||||
merged_doc.objects.insert(catalog_id, Object::Dictionary(catalog));
|
||||
merged_doc.trailer.set("Root", Object::Reference(catalog_id));
|
||||
|
||||
// Save the merged PDF
|
||||
merged_doc.save(output_path)?;
|
||||
|
||||
info!("Successfully merged {} PDFs into {:?}", pdf_paths.len(), output_path);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Split a PDF into individual pages using pure Rust
|
||||
pub fn split_pdf_pure(&self, pdf_path: &Path, output_dir: &Path) -> Result<Vec<PathBuf>> {
|
||||
use lopdf::Document;
|
||||
|
||||
fs::create_dir_all(output_dir)?;
|
||||
|
||||
let doc = Document::load(pdf_path)?;
|
||||
let pages = doc.get_pages();
|
||||
let mut output_paths = Vec::new();
|
||||
|
||||
for (i, (_page_num, page_id)) in pages.iter().enumerate() {
|
||||
// Create a new document with just this page
|
||||
let mut single_page_doc = Document::with_version("1.5");
|
||||
|
||||
// Clone the page
|
||||
if let Ok(page_obj) = doc.get_object(*page_id) {
|
||||
let new_page_id = single_page_doc.new_object_id();
|
||||
single_page_doc.objects.insert(new_page_id, page_obj.clone());
|
||||
|
||||
// Create page tree
|
||||
let pages_id = single_page_doc.new_object_id();
|
||||
let pages_dict = lopdf::dictionary! {
|
||||
"Type" => "Pages",
|
||||
"Kids" => vec![Object::Reference(new_page_id)],
|
||||
"Count" => 1,
|
||||
};
|
||||
single_page_doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
|
||||
|
||||
// Create catalog
|
||||
let catalog_id = single_page_doc.new_object_id();
|
||||
let catalog = lopdf::dictionary! {
|
||||
"Type" => "Catalog",
|
||||
"Pages" => Object::Reference(pages_id),
|
||||
};
|
||||
single_page_doc.objects.insert(catalog_id, Object::Dictionary(catalog));
|
||||
single_page_doc.trailer.set("Root", Object::Reference(catalog_id));
|
||||
|
||||
// Save the page
|
||||
let output_path = output_dir.join(format!("page_{:03}.pdf", i + 1));
|
||||
single_page_doc.save(&output_path)?;
|
||||
output_paths.push(output_path);
|
||||
}
|
||||
}
|
||||
|
||||
info!("Split PDF into {} pages", output_paths.len());
|
||||
Ok(output_paths)
|
||||
}
|
||||
|
||||
/// Parse and render markdown to PDF
|
||||
pub fn markdown_to_pdf(&self, markdown: &str, pdf_path: &Path) -> Result<()> {
|
||||
use pulldown_cmark::{Parser, Event, Tag, TagEnd};
|
||||
|
||||
let parser = Parser::new(markdown);
|
||||
let mut plain_text = String::new();
|
||||
let mut in_code_block = false;
|
||||
let mut list_depth = 0;
|
||||
|
||||
for event in parser {
|
||||
match event {
|
||||
Event::Text(text) => {
|
||||
if in_code_block {
|
||||
plain_text.push_str(" ");
|
||||
} else if list_depth > 0 {
|
||||
plain_text.push_str(&" ".repeat(list_depth));
|
||||
}
|
||||
plain_text.push_str(&text);
|
||||
}
|
||||
Event::Start(tag) => {
|
||||
match tag {
|
||||
Tag::Heading { level, .. } => {
|
||||
plain_text.push('\n');
|
||||
plain_text.push_str(&"#".repeat(level as usize));
|
||||
plain_text.push(' ');
|
||||
}
|
||||
Tag::Paragraph => {
|
||||
if !plain_text.is_empty() {
|
||||
plain_text.push_str("\n\n");
|
||||
}
|
||||
}
|
||||
Tag::List(_) => {
|
||||
list_depth += 1;
|
||||
plain_text.push('\n');
|
||||
}
|
||||
Tag::Item => {
|
||||
plain_text.push_str("• ");
|
||||
}
|
||||
Tag::CodeBlock(_) => {
|
||||
in_code_block = true;
|
||||
plain_text.push_str("\n\n");
|
||||
}
|
||||
Tag::Emphasis => plain_text.push('*'),
|
||||
Tag::Strong => plain_text.push_str("**"),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Event::End(tag) => {
|
||||
match tag {
|
||||
TagEnd::Heading(_) => plain_text.push_str("\n\n"),
|
||||
TagEnd::Paragraph => plain_text.push('\n'),
|
||||
TagEnd::List(_) => {
|
||||
list_depth = list_depth.saturating_sub(1);
|
||||
plain_text.push('\n');
|
||||
}
|
||||
TagEnd::Item => plain_text.push('\n'),
|
||||
TagEnd::CodeBlock => {
|
||||
in_code_block = false;
|
||||
plain_text.push_str("\n\n");
|
||||
}
|
||||
TagEnd::Emphasis => plain_text.push('*'),
|
||||
TagEnd::Strong => plain_text.push_str("**"),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
Event::Code(code) => {
|
||||
plain_text.push('`');
|
||||
plain_text.push_str(&code);
|
||||
plain_text.push('`');
|
||||
}
|
||||
Event::SoftBreak => plain_text.push(' '),
|
||||
Event::HardBreak => plain_text.push('\n'),
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
|
||||
self.create_pdf_from_text(&plain_text, pdf_path)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user