maturaarbeit/pdf_to_markdown.py
MM4go c7ff6a8a29 Add PDF to Markdown converter with mise task runner
- Implement pdf_to_markdown.py script with pypdf for text extraction
- Extract metadata (title, author, creation date) from PDFs
- Generate clean Markdown files with YAML front matter
- Add comprehensive error handling and logging
- Create mise.toml with 10+ convenient tasks for conversion
- Provide detailed documentation (4 guides + quick reference)
- Successfully convert all 18 PDF files in artikel/ folder to Markdown
- Include .gitignore for Python cache and local config
2026-02-23 14:58:58 +01:00

374 lines
11 KiB
Python

#!/usr/bin/env python3
"""
PDF to Markdown Converter
Converts PDF files in a folder to Markdown format, extracting text and metadata.
Handles errors gracefully and provides detailed logging.
"""
import argparse
import logging
import sys
from pathlib import Path
from datetime import datetime
from typing import Optional, Tuple, Dict, Any
import json
from pypdf import PdfReader
from dateutil import parser as date_parser
# Root logging setup: INFO threshold, records rendered as
# "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
class PDFToMarkdownConverter:
    """Converts the PDF files of one folder to Markdown files.

    Each PDF becomes ``<stem>.md`` in the output directory, consisting of a
    YAML front matter block (title, author, created, converted, source)
    followed by the extracted text, one ``## Page N`` section per page.
    The output directory is created lazily, right before the first file is
    written, so constructing the converter or doing a dry run never touches
    the filesystem.
    """

    def __init__(self, input_dir: Path, output_dir: Path, verbose: bool = False, quiet: bool = False):
        """
        Initialize the converter.

        Args:
            input_dir: Directory containing PDF files
            output_dir: Directory to save Markdown files
            verbose: Enable verbose logging
            quiet: Suppress all output except errors
        """
        self.input_dir = Path(input_dir).resolve()
        self.output_dir = Path(output_dir).resolve()
        self.verbose = verbose
        self.quiet = quiet

        # Configure logging based on verbosity (quiet wins over verbose)
        if quiet:
            logger.setLevel(logging.ERROR)
        elif verbose:
            logger.setLevel(logging.DEBUG)

        # The output directory is deliberately NOT created here: with the
        # default output of "<input_dir>/converted", mkdir(parents=True) would
        # create a missing input directory and defeat the existence check in
        # convert_folder(); it would also write to disk during a dry run.

        # Statistics
        self.stats = {
            'total': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'errors': []
        }

    @staticmethod
    def _clean_text(value: Any) -> Optional[str]:
        """Return value as a single-line, whitespace-normalised string, or None if empty."""
        if value is None:
            return None
        # split()/join collapses newlines and runs of spaces into single spaces
        text = " ".join(str(value).split())
        return text or None

    @staticmethod
    def _parse_pdf_date(date_str: str) -> str:
        """Convert a PDF date string (D:YYYYMMDDHHmmSS+HH'mm') to 'YYYY-MM-DD'.

        Raises whatever the underlying parser raises (e.g. ValueError) if the
        string cannot be interpreted as a date.
        """
        value = date_str.strip()
        # Remove 'D:' prefix if present
        if value.startswith('D:'):
            value = value[2:]
        # Only the calendar date is needed, so read the leading YYYYMMDD
        # directly; the apostrophe-delimited UTC offset that usually follows
        # is not understood by dateutil.
        digits = value[:8]
        if len(digits) == 8 and digits.isascii() and digits.isdigit():
            return datetime.strptime(digits, '%Y%m%d').strftime('%Y-%m-%d')
        # Non-standard date strings: fall back to the lenient parser
        return date_parser.parse(value).strftime('%Y-%m-%d')

    @staticmethod
    def _yaml_scalar(value: Any) -> str:
        """Render value as a double-quoted YAML scalar (JSON string syntax)."""
        return json.dumps(str(value), ensure_ascii=False)

    def extract_metadata(self, reader: "PdfReader", pdf_path: Path) -> Dict[str, Any]:
        """
        Extract metadata from PDF.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file

        Returns:
            Dictionary with the keys 'title', 'author', 'created' and
            'source'. Missing values are None, except 'title', which falls
            back to the file name without extension.
        """
        metadata = {
            'title': None,
            'author': None,
            'created': None,
            'source': pdf_path.name
        }
        try:
            info = reader.metadata
            if info:
                # Null or blank entries are treated as missing instead of
                # being turned into the literal string "None".
                metadata['title'] = self._clean_text(info.get('/Title'))
                metadata['author'] = self._clean_text(info.get('/Author'))

                raw_date = info.get('/CreationDate')
                if isinstance(raw_date, str):
                    try:
                        metadata['created'] = self._parse_pdf_date(raw_date)
                    except Exception as e:
                        # A bad date must not discard title/author
                        logger.debug(f"Could not parse creation date: {e}")
        except Exception as e:
            logger.warning(f"Error extracting metadata from {pdf_path.name}: {e}")

        # Use filename as title if not found in metadata
        if not metadata['title']:
            metadata['title'] = pdf_path.stem
        return metadata

    def extract_text(self, reader: "PdfReader", pdf_path: Path) -> str:
        """
        Extract text from PDF.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file (used for log messages only)

        Returns:
            Extracted text with a '## Page N' header before each page that
            yielded text, or an empty string if nothing could be extracted.
        """
        pages = reader.pages
        if len(pages) == 0:
            logger.warning(f"{pdf_path.name}: No pages found")
            return ""

        text_parts = []
        for page_num, page in enumerate(pages, start=1):
            try:
                text = page.extract_text()
            except Exception as e:
                # One unreadable page should not abort the whole document
                logger.warning(f"{pdf_path.name}: Error extracting text from page {page_num}: {e}")
                continue
            if text and text.strip():
                # Add page header
                text_parts.append(f"\n## Page {page_num}\n")
                text_parts.append(text)
            else:
                logger.debug(f"{pdf_path.name}: Page {page_num} has no extractable text")

        if not text_parts:
            logger.warning(f"{pdf_path.name}: No text could be extracted from any pages")
            return ""
        return "".join(text_parts)

    def create_markdown(self, metadata: Dict[str, Any], text: str) -> str:
        """
        Create Markdown content with metadata front matter.

        Args:
            metadata: Dictionary containing document metadata
            text: Extracted text content

        Returns:
            Markdown formatted content
        """
        # Build YAML front matter. Free-text values are emitted as quoted
        # scalars so that ':', '#', quotes or newlines cannot break the YAML.
        front_matter = ["---"]
        for key in ('title', 'author'):
            if metadata.get(key):
                front_matter.append(f"{key}: {self._yaml_scalar(metadata[key])}")
        if metadata.get('created'):
            front_matter.append(f"created: {metadata['created']}")
        # Add conversion timestamp
        converted_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        front_matter.append(f"converted: {converted_time}")
        if metadata.get('source'):
            front_matter.append(f"source: {self._yaml_scalar(metadata['source'])}")
        front_matter.append("---\n")

        # Combine front matter with content
        content = "\n".join(front_matter)
        if text:
            # Add main heading if we have a title
            if metadata.get('title'):
                content += f"# {metadata['title']}\n\n"
            content += text
        else:
            content += "\n*No text content could be extracted from this PDF.*\n"
        return content

    def convert_pdf(self, pdf_path: Path) -> bool:
        """
        Convert a single PDF file to Markdown.

        Updates self.stats ('successful', or 'failed' plus 'errors').

        Args:
            pdf_path: Path to PDF file

        Returns:
            True if successful, False otherwise
        """
        try:
            if not self.quiet:
                logger.info(f"Processing: {pdf_path.name}")
            # Read PDF
            reader = PdfReader(pdf_path)
            # Extract metadata and text
            metadata = self.extract_metadata(reader, pdf_path)
            text = self.extract_text(reader, pdf_path)
            # Create Markdown content
            markdown_content = self.create_markdown(metadata, text)
            # Generate output path
            output_path = self.output_dir / pdf_path.with_suffix('.md').name
            # Create the output directory on demand (see __init__)
            self.output_dir.mkdir(parents=True, exist_ok=True)
            # Write Markdown file
            output_path.write_text(markdown_content, encoding='utf-8')
            if not self.quiet:
                logger.info(f"✓ Successfully converted: {pdf_path.name} → {output_path.name}")
            self.stats['successful'] += 1
            return True
        except Exception as e:
            error_msg = f"✗ Error converting {pdf_path.name}: {str(e)}"
            logger.error(error_msg)
            self.stats['failed'] += 1
            self.stats['errors'].append({'file': pdf_path.name, 'error': str(e)})
            return False

    def convert_folder(self, dry_run: bool = False) -> None:
        """
        Convert all PDF files in input folder.

        Exits the process with status 1 if the input directory is missing.

        Args:
            dry_run: If True, don't write files, just report what would be done
        """
        if not self.input_dir.is_dir():
            logger.error(f"Input directory not found: {self.input_dir}")
            sys.exit(1)

        # Find all PDF files; the suffix test is case-insensitive so that
        # "REPORT.PDF" is picked up too, and directories are ignored.
        pdf_files = sorted(
            entry for entry in self.input_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() == '.pdf'
        )
        if not pdf_files:
            logger.warning(f"No PDF files found in {self.input_dir}")
            return

        self.stats['total'] = len(pdf_files)
        if not self.quiet:
            logger.info(f"Found {len(pdf_files)} PDF file(s) in {self.input_dir}")
        if dry_run:
            logger.info("DRY RUN: No files will be written")

        # Convert each PDF
        for pdf_path in pdf_files:
            if dry_run:
                logger.info(f"[DRY RUN] Would convert: {pdf_path.name}")
                self.stats['successful'] += 1
            else:
                self.convert_pdf(pdf_path)

        # Print summary
        self.print_summary()

    def print_summary(self) -> None:
        """Print conversion summary and log any collected errors."""
        rule = '=' * 60
        summary = "\n".join([
            "",
            rule,
            "CONVERSION SUMMARY",
            rule,
            f"Total PDFs: {self.stats['total']}",
            f"Successful: {self.stats['successful']}",
            f"Failed: {self.stats['failed']}",
            f"Output directory: {self.output_dir}",
            rule,
            "",
        ])
        if not self.quiet:
            print(summary)
        # Errors are reported even in quiet mode
        if self.stats['errors']:
            logger.error("Errors encountered:")
            for error in self.stats['errors']:
                logger.error(f" - {error['file']}: {error['error']}")
def main():
    """Command-line entry point: parse the arguments and convert the folder."""
    cli = argparse.ArgumentParser(
        description='Convert PDF files to Markdown format.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python pdf_to_markdown.py # Uses default folders
python pdf_to_markdown.py ./artikel # Custom input folder
python pdf_to_markdown.py ./artikel ./output # Custom input and output
python pdf_to_markdown.py -v ./artikel # Verbose mode
python pdf_to_markdown.py --dry-run ./input # Preview without writing
"""
    )

    # Optional positionals: input folder, then output folder
    cli.add_argument('input_dir', nargs='?', default='./artikel',
                     help='Input folder containing PDFs (default: ./artikel)')
    cli.add_argument('output_dir', nargs='?', default=None,
                     help='Output folder for Markdown files (default: input_dir/converted)')

    # Boolean switches
    cli.add_argument('-v', '--verbose', action='store_true',
                     help='Enable verbose logging')
    cli.add_argument('-q', '--quiet', action='store_true',
                     help='Suppress all output except errors')
    cli.add_argument('--dry-run', action='store_true',
                     help='Test run without writing files')

    options = cli.parse_args()

    # Without an explicit output folder, write into "<input_dir>/converted"
    if options.output_dir is None:
        options.output_dir = str(Path(options.input_dir) / 'converted')

    converter = PDFToMarkdownConverter(
        input_dir=options.input_dir,
        output_dir=options.output_dir,
        verbose=options.verbose,
        quiet=options.quiet,
    )

    # Ctrl-C and any unexpected failure both end the process with status 1
    try:
        converter.convert_folder(dry_run=options.dry_run)
    except KeyboardInterrupt:
        logger.info("\nConversion interrupted by user")
        sys.exit(1)
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()