Initial Commit

This commit is contained in:
Andy
2025-08-11 14:31:51 +08:00
commit 39e94c1b13
36 changed files with 12517 additions and 0 deletions
+423
View File
@@ -0,0 +1,423 @@
use anyhow::{Context, Result};
use image::{DynamicImage, ImageFormat, Rgba, RgbaImage};
use printpdf::*;
use std::fs::{self, File};
use std::io::{BufReader, BufWriter, Read, Write};
use std::path::{Path, PathBuf};
use tempfile::NamedTempFile;
use tracing::{debug, info, warn};
use roxmltree;
use zip::ZipArchive;
use rusttype::{Font, Scale};
use lopdf;
/// Unit struct that groups the document-conversion routines.
///
/// It holds no state, so it is cheap to copy and can be created with either
/// `PureRustConverter::new()` or `PureRustConverter::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PureRustConverter;
impl PureRustConverter {
pub fn new() -> Self {
Self
}
/// Extract text from DOCX using pure Rust XML parsing
pub fn extract_text_from_docx(&self, docx_path: &Path) -> Result<String> {
let file = File::open(docx_path)?;
let mut archive = ZipArchive::new(file)?;
// Find the main document XML
let mut document_xml = String::new();
for i in 0..archive.len() {
let mut file = archive.by_index(i)?;
let name = file.name().to_string();
if name == "word/document.xml" {
file.read_to_string(&mut document_xml)?;
break;
}
}
if document_xml.is_empty() {
anyhow::bail!("No document.xml found in DOCX file");
}
// Parse XML and extract text
let doc = roxmltree::Document::parse(&document_xml)?;
let mut text = String::new();
// Extract text from all w:t elements
for node in doc.descendants() {
if node.tag_name().name() == "t" {
if let Some(node_text) = node.text() {
text.push_str(node_text);
text.push(' ');
}
}
// Handle line breaks
if node.tag_name().name() == "br" || node.tag_name().name() == "p" {
text.push('\n');
}
}
Ok(text.trim().to_string())
}
/// Converts the DOCX at `docx_path` into a text-only PDF written to `pdf_path`.
///
/// The text is pulled out with `extract_text_from_docx` and laid out with
/// `create_pdf_from_text`; a success message is logged at `info` level.
///
/// # Errors
///
/// Returns the extraction error (with the source path attached as context) or
/// the PDF-writing error.
pub fn docx_to_pdf_pure(&self, docx_path: &Path, pdf_path: &Path) -> Result<()> {
    self.extract_text_from_docx(docx_path)
        .with_context(|| format!("Failed to extract text from {:?}", docx_path))
        .and_then(|body| self.create_pdf_from_text(&body, pdf_path))
        .map(|()| {
            info!("Successfully converted DOCX to PDF using pure Rust");
        })
}
/// Create a PDF from text content
pub fn create_pdf_from_text(&self, text: &str, pdf_path: &Path) -> Result<()> {
let (doc, page1, layer1) = PdfDocument::new("Document", Mm(210.0), Mm(297.0), "Layer 1");
let current_layer = doc.get_page(page1).get_layer(layer1);
// Use embedded font or built-in font
let font = doc.add_builtin_font(BuiltinFont::Helvetica)?;
// Configure text layout
let font_size = 11.0;
let line_height = Mm(5.0);
let margin_left = Mm(20.0);
let margin_top = Mm(280.0);
let margin_bottom = Mm(20.0);
let page_width = Mm(210.0);
let page_height = Mm(297.0);
let text_width = page_width - (margin_left * 2.0);
let lines: Vec<&str> = text.lines().collect();
let mut current_page = page1;
let mut current_layer = layer1;
let mut y_position = margin_top;
for line in lines {
// Check if we need a new page
if y_position < margin_bottom {
let (new_page, new_layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
current_page = new_page;
current_layer = new_layer;
y_position = margin_top;
}
// Word wrap if line is too long
let words: Vec<&str> = line.split_whitespace().collect();
let mut current_line = String::new();
let max_chars_per_line = 80; // Approximate
for word in words {
if current_line.len() + word.len() + 1 > max_chars_per_line {
// Write current line
if !current_line.is_empty() {
doc.get_page(current_page)
.get_layer(current_layer)
.use_text(&current_line, font_size, margin_left, y_position, &font);
y_position -= line_height;
current_line.clear();
// Check for new page
if y_position < margin_bottom {
let (new_page, new_layer) = doc.add_page(Mm(210.0), Mm(297.0), "Page layer");
current_page = new_page;
current_layer = new_layer;
y_position = margin_top;
}
}
}
if !current_line.is_empty() {
current_line.push(' ');
}
current_line.push_str(word);
}
// Write remaining text in line
if !current_line.is_empty() {
doc.get_page(current_page)
.get_layer(current_layer)
.use_text(&current_line, font_size, margin_left, y_position, &font);
y_position -= line_height;
}
}
// Save PDF
doc.save(&mut BufWriter::new(File::create(pdf_path)?))?;
Ok(())
}
/// Convert PDF to images using pure Rust
pub fn pdf_to_images_pure(
&self,
pdf_path: &Path,
output_dir: &Path,
format: ImageFormat,
) -> Result<Vec<PathBuf>> {
// Parse PDF
let doc = lopdf::Document::load(pdf_path)?;
let pages = doc.get_pages();
fs::create_dir_all(output_dir)?;
let mut output_paths = Vec::new();
// For each page, render to image
for (page_num, (_page_num, _page_id)) in pages.iter().enumerate() {
// Create a blank image for the page
// In a real implementation, you would render the PDF content
let img = self.render_pdf_page_to_image(&doc, page_num)?;
// Save image
let extension = match format {
ImageFormat::Png => "png",
ImageFormat::Jpeg => "jpg",
_ => "png",
};
let output_path = output_dir.join(format!("page_{:03}.{}", page_num + 1, extension));
img.save_with_format(&output_path, format)?;
output_paths.push(output_path);
}
Ok(output_paths)
}
/// Placeholder renderer: returns a blank, fully opaque white canvas.
///
/// Neither argument is read and no PDF content is drawn. The canvas is
/// 1240 x 1754 pixels, i.e. A4 at 150 DPI.
fn render_pdf_page_to_image(&self, _doc: &lopdf::Document, _page_num: usize) -> Result<DynamicImage> {
    const A4_WIDTH_PX: u32 = 1240;
    const A4_HEIGHT_PX: u32 = 1754;

    let opaque_white = Rgba([255, 255, 255, 255]);
    let canvas = RgbaImage::from_pixel(A4_WIDTH_PX, A4_HEIGHT_PX, opaque_white);

    Ok(DynamicImage::ImageRgba8(canvas))
}
/// Converts a DOCX file to per-page images by going through a temporary PDF.
///
/// The DOCX is first written as a PDF to a temporary path via
/// `docx_to_pdf_pure`; that PDF is then handed to `pdf_to_images_pure`, whose
/// result (the list of written image paths) is returned unchanged.
///
/// # Errors
///
/// Fails when the temporary file cannot be created or either conversion step
/// fails.
pub fn docx_to_images_pure(
    &self,
    docx_path: &Path,
    output_dir: &Path,
    format: ImageFormat,
) -> Result<Vec<PathBuf>> {
    // NOTE(review): the temp-path guard is expected to remove the file when it
    // goes out of scope at the end of this function - confirm in tempfile docs.
    let scratch_guard = NamedTempFile::new()?.into_temp_path();
    let scratch_pdf: &Path = &scratch_guard;

    self.docx_to_pdf_pure(docx_path, scratch_pdf)?;

    let image_paths = self.pdf_to_images_pure(scratch_pdf, output_dir, format);
    image_paths
}
/// Loads the image at `image_path`, scales it with `DynamicImage::thumbnail`
/// using `width` and `height` as bounds, and saves the result to
/// `output_path`.
///
/// The `info` log line reports the requested bounds, not the dimensions of the
/// image that was written.
///
/// # Errors
///
/// Fails, with the offending path attached as context, when the source image
/// cannot be opened or the thumbnail cannot be saved.
pub fn create_thumbnail(
    &self,
    image_path: &Path,
    output_path: &Path,
    width: u32,
    height: u32,
) -> Result<()> {
    image::open(image_path)
        .with_context(|| format!("Failed to open image {:?}", image_path))?
        .thumbnail(width, height)
        .save(output_path)
        .with_context(|| format!("Failed to save thumbnail to {:?}", output_path))?;

    info!("Created thumbnail {}x{} at {:?}", width, height, output_path);
    Ok(())
}
/// Concatenates the pages of `pdf_paths`, in order, into one PDF at `output_path`.
///
/// Every object of every source document is copied into the merged document,
/// so the content streams, resources and fonts that pages refer to stay
/// resolvable. Source object ids are shifted first so that documents cannot
/// collide. Pages are attached to a single new page tree; attributes a page
/// inherited from its old ancestors (`Resources`, `MediaBox`, `CropBox`,
/// `Rotate`) are copied onto the page itself before it is re-parented.
/// Objects that end up unreachable from the new catalog are dropped.
///
/// Catalog-level data of the sources is not carried over, because the merged
/// file gets a fresh catalog containing only `Type` and `Pages`.
///
/// # Errors
///
/// Fails when a source PDF cannot be loaded or the merged PDF cannot be written.
pub fn merge_pdfs_pure(&self, pdf_paths: &[PathBuf], output_path: &Path) -> Result<()> {
    use lopdf::{Dictionary, Document, Object, ObjectId};

    /// Returns a copy of the page dictionary `page_id` with inheritable
    /// attributes copied down from its ancestors, or `None` if `page_id`
    /// does not name a dictionary.
    fn flattened_page(doc: &lopdf::Document, page_id: lopdf::ObjectId) -> Option<lopdf::Dictionary> {
        const INHERITABLE_KEYS: [&[u8]; 4] = [b"Resources", b"MediaBox", b"CropBox", b"Rotate"];
        // Bounds the walk up the tree so a cyclic `Parent` chain cannot hang us.
        const MAX_TREE_DEPTH: usize = 64;

        let mut page = match doc.objects.get(&page_id) {
            Some(lopdf::Object::Dictionary(dict)) => dict.clone(),
            _ => return None,
        };
        for key in INHERITABLE_KEYS.iter().copied() {
            if page.has(key) {
                continue;
            }
            let mut ancestor = page.get(b"Parent").and_then(lopdf::Object::as_reference).ok();
            for _ in 0..MAX_TREE_DEPTH {
                let node = match ancestor.and_then(|id| doc.objects.get(&id)) {
                    Some(lopdf::Object::Dictionary(node)) => node,
                    _ => break,
                };
                if let Ok(value) = node.get(key) {
                    page.set(key, value.clone());
                    break;
                }
                ancestor = node.get(b"Parent").and_then(lopdf::Object::as_reference).ok();
            }
        }
        Some(page)
    }

    let mut merged_doc = Document::with_version("1.5");
    let mut merged_pages: Vec<(ObjectId, Dictionary)> = Vec::new();

    for pdf_path in pdf_paths {
        let mut doc = Document::load(pdf_path)
            .with_context(|| format!("Failed to load PDF {:?}", pdf_path))?;

        // Move this document's ids past everything already merged; references
        // inside the document are rewritten along with the ids.
        doc.renumber_objects_with(merged_doc.max_id + 1);

        for (_page_num, page_id) in doc.get_pages() {
            if let Some(page) = flattened_page(&doc, page_id) {
                merged_pages.push((page_id, page));
            }
        }

        let highest_id = doc
            .objects
            .keys()
            .next_back()
            .map_or(doc.max_id, |id| id.0.max(doc.max_id));
        merged_doc.max_id = merged_doc.max_id.max(highest_id);
        merged_doc.objects.extend(doc.objects);
    }

    // Build one flat page tree that owns every collected page.
    let pages_id = merged_doc.new_object_id();
    let mut kids = Vec::with_capacity(merged_pages.len());
    for (page_id, mut page) in merged_pages {
        page.set("Parent", Object::Reference(pages_id));
        merged_doc.objects.insert(page_id, Object::Dictionary(page));
        kids.push(Object::Reference(page_id));
    }
    let page_count = i64::try_from(kids.len())
        .context("Page count does not fit in a PDF integer")?;
    let pages_dict = lopdf::dictionary! {
        "Type" => "Pages",
        "Kids" => kids,
        "Count" => page_count,
    };
    merged_doc.objects.insert(pages_id, Object::Dictionary(pages_dict));

    let catalog_id = merged_doc.new_object_id();
    let catalog = lopdf::dictionary! {
        "Type" => "Catalog",
        "Pages" => Object::Reference(pages_id),
    };
    merged_doc.objects.insert(catalog_id, Object::Dictionary(catalog));
    merged_doc.trailer.set("Root", Object::Reference(catalog_id));

    // The source catalogs and page-tree nodes are no longer referenced.
    let _pruned = merged_doc.prune_objects();

    merged_doc
        .save(output_path)
        .with_context(|| format!("Failed to write merged PDF {:?}", output_path))?;
    info!("Successfully merged {} PDFs into {:?}", pdf_paths.len(), output_path);
    Ok(())
}
/// Split a PDF into individual pages using pure Rust
pub fn split_pdf_pure(&self, pdf_path: &Path, output_dir: &Path) -> Result<Vec<PathBuf>> {
use lopdf::Document;
fs::create_dir_all(output_dir)?;
let doc = Document::load(pdf_path)?;
let pages = doc.get_pages();
let mut output_paths = Vec::new();
for (i, (_page_num, page_id)) in pages.iter().enumerate() {
// Create a new document with just this page
let mut single_page_doc = Document::with_version("1.5");
// Clone the page
if let Ok(page_obj) = doc.get_object(*page_id) {
let new_page_id = single_page_doc.new_object_id();
single_page_doc.objects.insert(new_page_id, page_obj.clone());
// Create page tree
let pages_id = single_page_doc.new_object_id();
let pages_dict = lopdf::dictionary! {
"Type" => "Pages",
"Kids" => vec![Object::Reference(new_page_id)],
"Count" => 1,
};
single_page_doc.objects.insert(pages_id, Object::Dictionary(pages_dict));
// Create catalog
let catalog_id = single_page_doc.new_object_id();
let catalog = lopdf::dictionary! {
"Type" => "Catalog",
"Pages" => Object::Reference(pages_id),
};
single_page_doc.objects.insert(catalog_id, Object::Dictionary(catalog));
single_page_doc.trailer.set("Root", Object::Reference(catalog_id));
// Save the page
let output_path = output_dir.join(format!("page_{:03}.pdf", i + 1));
single_page_doc.save(&output_path)?;
output_paths.push(output_path);
}
}
info!("Split PDF into {} pages", output_paths.len());
Ok(output_paths)
}
/// Flattens `markdown` into plain text and writes it to a PDF at `pdf_path`.
///
/// The Markdown event stream is turned into text as follows: headings are
/// prefixed with `#` repeated once per level, emphasis and strong text keep
/// their `*` / `**` markers, inline code keeps its backticks, and paragraphs,
/// lists, items and code blocks are separated by newlines. Text inside a code
/// block, or inside a list, is prefixed per text event. All other events are
/// ignored. The resulting text is handed to `create_pdf_from_text`.
///
/// # Errors
///
/// Returns whatever error `create_pdf_from_text` reports.
pub fn markdown_to_pdf(&self, markdown: &str, pdf_path: &Path) -> Result<()> {
    use pulldown_cmark::{Parser, Event, Tag, TagEnd};

    let mut rendered = String::new();
    let mut inside_code = false;
    let mut nesting: usize = 0;

    for event in Parser::new(markdown) {
        match event {
            // Inline content
            Event::Text(chunk) => {
                if inside_code {
                    rendered.push_str(" ");
                } else if nesting > 0 {
                    rendered.push_str(&" ".repeat(nesting));
                }
                rendered.push_str(&chunk);
            }
            Event::Code(snippet) => {
                rendered.push('`');
                rendered.push_str(&snippet);
                rendered.push('`');
            }
            Event::SoftBreak => rendered.push(' '),
            Event::HardBreak => rendered.push('\n'),

            // Opening tags
            Event::Start(Tag::Heading { level, .. }) => {
                rendered.push('\n');
                rendered.push_str(&"#".repeat(level as usize));
                rendered.push(' ');
            }
            Event::Start(Tag::Paragraph) => {
                if !rendered.is_empty() {
                    rendered.push_str("\n\n");
                }
            }
            Event::Start(Tag::List(_)) => {
                nesting += 1;
                rendered.push('\n');
            }
            // NOTE(review): appends an empty string, i.e. list items get no
            // marker - confirm whether a bullet was intended here.
            Event::Start(Tag::Item) => rendered.push_str(""),
            Event::Start(Tag::CodeBlock(_)) => {
                inside_code = true;
                rendered.push_str("\n\n");
            }
            Event::Start(Tag::Emphasis) => rendered.push('*'),
            Event::Start(Tag::Strong) => rendered.push_str("**"),
            Event::Start(_) => {}

            // Closing tags
            Event::End(TagEnd::Heading(_)) => rendered.push_str("\n\n"),
            Event::End(TagEnd::Paragraph) => rendered.push('\n'),
            Event::End(TagEnd::List(_)) => {
                nesting = nesting.saturating_sub(1);
                rendered.push('\n');
            }
            Event::End(TagEnd::Item) => rendered.push('\n'),
            Event::End(TagEnd::CodeBlock) => {
                inside_code = false;
                rendered.push_str("\n\n");
            }
            Event::End(TagEnd::Emphasis) => rendered.push('*'),
            Event::End(TagEnd::Strong) => rendered.push_str("**"),
            Event::End(_) => {}

            _ => {}
        }
    }

    self.create_pdf_from_text(&rendered, pdf_path)
}
}