#!/usr/bin/env python3
"""
PDF to Markdown Converter

Converts PDF files in a folder to Markdown format, extracting text and metadata.
Handles errors gracefully and provides detailed logging.
"""

import argparse
import json
import logging
import sys
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple, Dict, Any

from pypdf import PdfReader
from dateutil import parser as date_parser

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class PDFToMarkdownConverter:
    """Converts PDF files to Markdown format."""

    def __init__(self, input_dir: Path, output_dir: Path, verbose: bool = False, quiet: bool = False):
        """
        Initialize the converter.

        The output directory is NOT created here: it is created right before
        the first write, so that constructing a converter (or doing a dry
        run) never touches the filesystem and a missing input directory is
        not created as a side effect of creating ``input_dir/converted``.

        Args:
            input_dir: Directory containing PDF files
            output_dir: Directory to save Markdown files
            verbose: Enable verbose logging
            quiet: Suppress all output except errors
        """
        self.input_dir = Path(input_dir).resolve()
        self.output_dir = Path(output_dir).resolve()
        self.verbose = verbose
        self.quiet = quiet

        # Configure logging based on verbosity (quiet wins over verbose)
        if quiet:
            logger.setLevel(logging.ERROR)
        elif verbose:
            logger.setLevel(logging.DEBUG)

        # Statistics
        self.stats = {
            'total': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'errors': []
        }

    def _ensure_output_dir(self) -> None:
        """Create the output directory (and parents) if it doesn't exist."""
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def _clean_text(value: Any) -> Optional[str]:
        """Return *value* as a stripped string, or None if missing or blank."""
        if value is None:
            return None
        text = value if isinstance(value, str) else str(value)
        text = text.strip()
        return text or None

    @staticmethod
    def _parse_pdf_date(raw: Any) -> Optional[str]:
        """
        Convert a PDF date string to ``YYYY-MM-DD``.

        Args:
            raw: Raw metadata value, e.g. ``D:20230115120000+01'00'``

        Returns:
            The date as ``YYYY-MM-DD``, or None if *raw* is not a string

        Raises:
            Exception: Whatever ``dateutil`` raises when the fallback parse
                of a non-standard date string fails
        """
        if not isinstance(raw, str):
            return None

        value = raw.strip()
        if value.startswith('D:'):
            value = value[2:]

        # Only the calendar date is reported, so read the leading YYYYMMDD
        # directly. This also covers values carrying a timezone suffix such
        # as +01'00', which would otherwise be handed to dateutil verbatim.
        leading = value[:8]
        if len(leading) == 8 and leading.isdigit():
            try:
                return datetime.strptime(leading, '%Y%m%d').strftime('%Y-%m-%d')
            except ValueError:
                pass  # Not a valid calendar date; try the generic parser

        return date_parser.parse(value).strftime('%Y-%m-%d')

    @staticmethod
    def _yaml_string(value: Any) -> str:
        """
        Render *value* as a double-quoted scalar for the YAML front matter.

        Quoting keeps the front matter parseable when the value contains
        characters such as ``: ``, ``#``, quotes or newlines.
        """
        return json.dumps(str(value), ensure_ascii=False)

    def extract_metadata(self, reader: PdfReader, pdf_path: Path) -> Dict[str, Any]:
        """
        Extract metadata from PDF.

        Extraction is best-effort: any error is logged and the fields that
        could not be read stay None. The title falls back to the file stem.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file

        Returns:
            Dictionary with the keys 'title', 'author', 'created', 'source'
        """
        metadata = {
            'title': None,
            'author': None,
            'created': None,
            'source': pdf_path.name
        }

        try:
            info = reader.metadata
            if info:
                metadata['title'] = self._clean_text(info.get('/Title'))
                metadata['author'] = self._clean_text(info.get('/Author'))

                if '/CreationDate' in info:
                    try:
                        metadata['created'] = self._parse_pdf_date(info.get('/CreationDate'))
                    except Exception as e:
                        logger.debug(f"Could not parse creation date: {e}")
        except Exception as e:
            logger.warning(f"Error extracting metadata from {pdf_path.name}: {e}")

        # Use filename as title if not found in metadata
        if not metadata['title']:
            metadata['title'] = pdf_path.stem

        return metadata

    def extract_text(self, reader: PdfReader, pdf_path: Path) -> str:
        """
        Extract text from PDF.

        Pages that yield no text, or that raise during extraction, are
        logged and skipped; the remaining pages are still returned.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file

        Returns:
            Extracted text with a "## Page N" header before each page, or an
            empty string if nothing could be extracted
        """
        text_parts = []
        total_pages = len(reader.pages)

        if total_pages == 0:
            logger.warning(f"{pdf_path.name}: No pages found")
            return ""

        for page_num, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text()
                if text and text.strip():
                    # Add page header
                    text_parts.append(f"\n## Page {page_num}\n")
                    text_parts.append(text)
                else:
                    logger.debug(f"{pdf_path.name}: Page {page_num} has no extractable text")
            except Exception as e:
                logger.warning(f"{pdf_path.name}: Error extracting text from page {page_num}: {e}")

        if not text_parts:
            logger.warning(f"{pdf_path.name}: No text could be extracted from any pages")
            return ""

        return "".join(text_parts)

    def create_markdown(self, metadata: Dict[str, Any], text: str) -> str:
        """
        Create Markdown content with metadata front matter.

        Free-text fields (title, author, source) are written as quoted YAML
        strings; the date fields are written bare.

        Args:
            metadata: Dictionary containing document metadata
            text: Extracted text content

        Returns:
            Markdown formatted content
        """
        # Build YAML front matter
        front_matter = ["---"]

        if metadata.get('title'):
            front_matter.append(f"title: {self._yaml_string(metadata['title'])}")
        if metadata.get('author'):
            front_matter.append(f"author: {self._yaml_string(metadata['author'])}")
        if metadata.get('created'):
            front_matter.append(f"created: {metadata['created']}")

        # Add conversion timestamp
        converted_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        front_matter.append(f"converted: {converted_time}")

        if metadata.get('source'):
            front_matter.append(f"source: {self._yaml_string(metadata['source'])}")

        front_matter.append("---\n")

        # Combine front matter with content
        content = "\n".join(front_matter)

        if text:
            # Add main heading if we have a title
            if metadata.get('title'):
                content += f"# {metadata['title']}\n\n"
            content += text
        else:
            content += "\n*No text content could be extracted from this PDF.*\n"

        return content

    def convert_pdf(self, pdf_path: Path) -> bool:
        """
        Convert a single PDF file to Markdown.

        Any exception is caught, logged and recorded in ``self.stats``; it
        is never propagated to the caller.

        Args:
            pdf_path: Path to PDF file

        Returns:
            True if successful, False otherwise
        """
        try:
            if not self.quiet:
                logger.info(f"Processing: {pdf_path.name}")

            # Read PDF
            reader = PdfReader(pdf_path)

            # Extract metadata and text
            metadata = self.extract_metadata(reader, pdf_path)
            text = self.extract_text(reader, pdf_path)

            # Create Markdown content
            markdown_content = self.create_markdown(metadata, text)

            # Generate output path
            output_path = self.output_dir / pdf_path.with_suffix('.md').name

            # Write Markdown file (the directory is created lazily, see __init__)
            self._ensure_output_dir()
            output_path.write_text(markdown_content, encoding='utf-8')

            if not self.quiet:
                logger.info(f"✓ Successfully converted: {pdf_path.name} → {output_path.name}")

            self.stats['successful'] += 1
            return True

        except Exception as e:
            error_msg = f"✗ Error converting {pdf_path.name}: {str(e)}"
            logger.error(error_msg)
            self.stats['failed'] += 1
            self.stats['errors'].append({'file': pdf_path.name, 'error': str(e)})
            return False

    def convert_folder(self, dry_run: bool = False) -> None:
        """
        Convert all PDF files in input folder.

        Exits the process with status 1 if the input directory does not
        exist. Files are matched by a case-insensitive ".pdf" suffix and
        processed in sorted order.

        Args:
            dry_run: If True, don't write files, just report what would be done
        """
        if not self.input_dir.is_dir():
            logger.error(f"Input directory not found: {self.input_dir}")
            sys.exit(1)

        # Find all PDF files (regular files only, any letter case of ".pdf")
        pdf_files = sorted(
            path for path in self.input_dir.iterdir()
            if path.is_file() and path.suffix.lower() == '.pdf'
        )

        if not pdf_files:
            logger.warning(f"No PDF files found in {self.input_dir}")
            return

        self.stats['total'] = len(pdf_files)

        if not self.quiet:
            logger.info(f"Found {len(pdf_files)} PDF file(s) in {self.input_dir}")
            if dry_run:
                logger.info("DRY RUN: No files will be written")

        if not dry_run:
            # Fail once, up front, if the output directory cannot be created,
            # rather than once per file.
            self._ensure_output_dir()

        # Convert each PDF
        for pdf_path in pdf_files:
            if dry_run:
                logger.info(f"[DRY RUN] Would convert: {pdf_path.name}")
                self.stats['successful'] += 1
            else:
                self.convert_pdf(pdf_path)

        # Print summary
        self.print_summary()

    def print_summary(self) -> None:
        """Print conversion summary, followed by any recorded errors."""
        summary = f"""
{'='*60}
CONVERSION SUMMARY
{'='*60}
Total PDFs: {self.stats['total']}
Successful: {self.stats['successful']}
Failed: {self.stats['failed']}
Output directory: {self.output_dir}
{'='*60}
"""
        if not self.quiet:
            print(summary)

        if self.stats['errors']:
            logger.error("Errors encountered:")
            for error in self.stats['errors']:
                logger.error(f"  - {error['file']}: {error['error']}")


def main():
    """Main entry point: parse command-line arguments and run the conversion."""
    parser = argparse.ArgumentParser(
        description='Convert PDF files to Markdown format.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pdf_to_markdown.py                       # Uses default folders
  python pdf_to_markdown.py ./artikel             # Custom input folder
  python pdf_to_markdown.py ./artikel ./output    # Custom input and output
  python pdf_to_markdown.py -v ./artikel          # Verbose mode
  python pdf_to_markdown.py --dry-run ./input     # Preview without writing
"""
    )

    parser.add_argument(
        'input_dir',
        nargs='?',
        default='./artikel',
        help='Input folder containing PDFs (default: ./artikel)'
    )
    parser.add_argument(
        'output_dir',
        nargs='?',
        default=None,
        help='Output folder for Markdown files (default: input_dir/converted)'
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )
    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='Suppress all output except errors'
    )
    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Test run without writing files'
    )

    args = parser.parse_args()

    # Set default output directory if not provided
    if args.output_dir is None:
        args.output_dir = str(Path(args.input_dir) / 'converted')

    # Create converter and run
    converter = PDFToMarkdownConverter(
        input_dir=args.input_dir,
        output_dir=args.output_dir,
        verbose=args.verbose,
        quiet=args.quiet
    )

    try:
        converter.convert_folder(dry_run=args.dry_run)
    except KeyboardInterrupt:
        logger.info("\nConversion interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)


if __name__ == '__main__':
    main()