- Implement pdf_to_markdown.py script with pypdf for text extraction - Extract metadata (title, author, creation date) from PDFs - Generate clean Markdown files with YAML front matter - Add comprehensive error handling and logging - Create mise.toml with 10+ convenient tasks for conversion - Provide detailed documentation (4 guides + quick reference) - Successfully convert all 18 PDF files in artikel/ folder to Markdown - Include .gitignore for Python cache and local config
374 lines
11 KiB
Python
374 lines
11 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
PDF to Markdown Converter
|
|
|
|
Converts PDF files in a folder to Markdown format, extracting text and metadata.
|
|
Handles errors gracefully and provides detailed logging.
|
|
"""
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional, Tuple, Dict, Any
|
|
import json
|
|
|
|
from pypdf import PdfReader
|
|
from dateutil import parser as date_parser
|
|
|
|
|
|
# Process-wide logging defaults for the script: INFO and above, each record
# rendered as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger shared by the converter class and main().
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDFToMarkdownConverter:
    """Converts the PDF files of one folder into Markdown files.

    Each PDF becomes a ``<stem>.md`` file in the output directory, consisting
    of a YAML front matter block (title, author, created, converted, source)
    followed by the extracted text, one ``## Page N`` section per page that
    yielded text.

    Running totals are kept in ``self.stats`` with the keys ``total``,
    ``successful``, ``failed``, ``skipped`` and ``errors``. Nothing in this
    class increments ``skipped``; it is kept so the dictionary shape stays
    stable for callers.
    """

    def __init__(self, input_dir: Path, output_dir: Path, verbose: bool = False, quiet: bool = False):
        """
        Initialize the converter.

        Args:
            input_dir: Directory containing PDF files
            output_dir: Directory to save Markdown files
            verbose: Enable verbose (DEBUG) logging
            quiet: Suppress all output except errors; wins over ``verbose``
                when both are set
        """
        self.input_dir = Path(input_dir).resolve()
        self.output_dir = Path(output_dir).resolve()
        self.verbose = verbose
        self.quiet = quiet

        # Configure logging based on verbosity
        if quiet:
            logger.setLevel(logging.ERROR)
        elif verbose:
            logger.setLevel(logging.DEBUG)

        # The output directory is deliberately NOT created here. Creating it
        # with parents=True would also create a missing input directory when
        # the output lives inside it (the CLI default is input_dir/converted),
        # which hides the "input directory not found" error, and it would
        # touch the disk during a dry run. convert_pdf() creates it on demand.

        # Statistics
        self.stats = {
            'total': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'errors': [],
        }

    @staticmethod
    def _clean_field(value: Any) -> Optional[str]:
        """Return *value* as single-line text, or None when missing or blank."""
        if value is None:
            return None
        # Collapse runs of whitespace (including newlines) so the value is
        # safe to use on one line, e.g. in a Markdown heading.
        text = ' '.join(str(value).split())
        return text or None

    @staticmethod
    def _parse_pdf_date(raw: str) -> str:
        """Convert a PDF date string to ``YYYY-MM-DD``.

        Handles the PDF form ``D:YYYYMMDDHHmmSS`` followed by an optional
        offset such as ``+01'00'`` by reading only the leading date digits,
        so the apostrophe-delimited offset never reaches a generic parser.
        Bare ``D:YYYY`` and ``D:YYYYMM`` values default the missing month
        and day to 01. Any other string is handed to dateutil.

        Raises:
            ValueError: If the string does not describe a valid date.
            OverflowError: May be raised by the dateutil fallback.
        """
        value = raw.strip()
        had_prefix = value.startswith('D:')
        if had_prefix:
            value = value[2:]

        # Leading run of ASCII digits.
        digits = value[:len(value) - len(value.lstrip('0123456789'))]

        if len(digits) >= 8:
            year, month, day = digits[:4], digits[4:6], digits[6:8]
        elif had_prefix and len(digits) in (4, 6) and len(digits) == len(value):
            year, month, day = digits[:4], digits[4:6] or '01', '01'
        else:
            # Not recognizably in PDF format: fall back to the generic parser.
            return date_parser.parse(value).strftime('%Y-%m-%d')

        return datetime(int(year), int(month), int(day)).strftime('%Y-%m-%d')

    def extract_metadata(self, reader: "PdfReader", pdf_path: Path) -> Dict[str, Any]:
        """
        Extract metadata from PDF.

        Reads ``/Title``, ``/Author`` and ``/CreationDate`` from
        ``reader.metadata``. Failures are logged and leave the affected
        fields as None; this method does not raise for unreadable metadata.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file

        Returns:
            Dictionary with the keys ``title``, ``author``, ``created``
            (``YYYY-MM-DD`` or None) and ``source`` (the PDF file name).
            ``title`` falls back to the file stem when the PDF has none.
        """
        metadata: Dict[str, Any] = {
            'title': None,
            'author': None,
            'created': None,
            'source': pdf_path.name,
        }

        try:
            info = reader.metadata
            if info:
                # .get() plus cleaning: a present-but-null or blank entry is
                # treated as missing instead of becoming the text "None".
                metadata['title'] = self._clean_field(info.get('/Title'))
                metadata['author'] = self._clean_field(info.get('/Author'))

                raw_date = info.get('/CreationDate')
                if isinstance(raw_date, str):
                    try:
                        metadata['created'] = self._parse_pdf_date(raw_date)
                    except (ValueError, OverflowError) as e:
                        logger.debug(f"Could not parse creation date: {e}")
        except Exception as e:
            # Best effort: metadata problems must not abort the conversion.
            logger.warning(f"Error extracting metadata from {pdf_path.name}: {e}")

        # Use filename as title if not found in metadata
        if not metadata['title']:
            metadata['title'] = pdf_path.stem

        return metadata

    def extract_text(self, reader: "PdfReader", pdf_path: Path) -> str:
        """
        Extract text from PDF.

        Pages that fail to extract or contain only whitespace are logged and
        left out; the remaining pages keep their original page numbers.

        Args:
            reader: PdfReader object
            pdf_path: Path to PDF file (used for log messages only)

        Returns:
            Extracted text, each page preceded by a ``## Page N`` header, or
            an empty string when no page produced any text.
        """
        if len(reader.pages) == 0:
            logger.warning(f"{pdf_path.name}: No pages found")
            return ""

        text_parts = []
        for page_num, page in enumerate(reader.pages, start=1):
            try:
                text = page.extract_text()
            except Exception as e:
                # Best effort per page: one broken page should not lose the rest.
                logger.warning(f"{pdf_path.name}: Error extracting text from page {page_num}: {e}")
                continue

            if text and text.strip():
                text_parts.append(f"\n## Page {page_num}\n")
                text_parts.append(text)
            else:
                logger.debug(f"{pdf_path.name}: Page {page_num} has no extractable text")

        if not text_parts:
            logger.warning(f"{pdf_path.name}: No text could be extracted from any pages")
            return ""

        return "".join(text_parts)

    def create_markdown(self, metadata: Dict[str, Any], text: str) -> str:
        """
        Create Markdown content with metadata front matter.

        Free-text values (title, author, source) are written as double-quoted
        scalars so characters such as ``:``, ``#`` or quotes cannot break the
        YAML block.

        Args:
            metadata: Dictionary containing document metadata
            text: Extracted text content

        Returns:
            Markdown formatted content: front matter, then a ``# title``
            heading and the text, or a placeholder line when text is empty.
        """
        def quoted(value: Any) -> str:
            # A JSON string literal is used as the YAML double-quoted scalar.
            return json.dumps(str(value), ensure_ascii=False)

        # Build YAML front matter
        front_matter = ["---"]

        if metadata.get('title'):
            front_matter.append(f"title: {quoted(metadata['title'])}")

        if metadata.get('author'):
            front_matter.append(f"author: {quoted(metadata['author'])}")

        if metadata.get('created'):
            front_matter.append(f"created: {metadata['created']}")

        # Add conversion timestamp (local time)
        converted_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        front_matter.append(f"converted: {converted_time}")

        if metadata.get('source'):
            front_matter.append(f"source: {quoted(metadata['source'])}")

        front_matter.append("---\n")

        # Combine front matter with content
        content = "\n".join(front_matter)

        if text:
            # Add main heading if we have a title
            if metadata.get('title'):
                content += f"# {metadata['title']}\n\n"
            content += text
        else:
            content += "\n*No text content could be extracted from this PDF.*\n"

        return content

    def convert_pdf(self, pdf_path: Path) -> bool:
        """
        Convert a single PDF file to Markdown.

        Writes ``<stem>.md`` into the output directory, creating that
        directory first if needed, and updates the ``successful`` /
        ``failed`` / ``errors`` statistics. Errors are logged and recorded,
        never raised.

        Args:
            pdf_path: Path to PDF file

        Returns:
            True if successful, False otherwise
        """
        try:
            if not self.quiet:
                logger.info(f"Processing: {pdf_path.name}")

            # Read PDF
            reader = PdfReader(pdf_path)

            # Extract metadata and text
            metadata = self.extract_metadata(reader, pdf_path)
            text = self.extract_text(reader, pdf_path)

            # Create Markdown content
            markdown_content = self.create_markdown(metadata, text)

            # Generate output path; the directory is created lazily here so
            # constructing the converter has no effect on the file system.
            output_path = self.output_dir / pdf_path.with_suffix('.md').name
            self.output_dir.mkdir(parents=True, exist_ok=True)

            # Write Markdown file
            output_path.write_text(markdown_content, encoding='utf-8')

            if not self.quiet:
                logger.info(f"✓ Successfully converted: {pdf_path.name} → {output_path.name}")

            self.stats['successful'] += 1
            return True

        except Exception as e:
            # Per-file boundary: record the failure and let the batch go on.
            error_msg = f"✗ Error converting {pdf_path.name}: {str(e)}"
            logger.error(error_msg)
            self.stats['failed'] += 1
            self.stats['errors'].append({'file': pdf_path.name, 'error': str(e)})
            return False

    def convert_folder(self, dry_run: bool = False) -> None:
        """
        Convert all PDF files in input folder.

        Only files directly inside the input directory are considered (no
        recursion); the ``.pdf`` suffix is matched case-insensitively. Files
        are processed in sorted order and a summary is printed at the end.

        Args:
            dry_run: If True, don't write files, just report what would be
                done. Each listed file is counted as successful.

        Raises:
            SystemExit: With code 1 when the input directory does not exist
                or is not a directory.
        """
        if not self.input_dir.is_dir():
            logger.error(f"Input directory not found: {self.input_dir}")
            sys.exit(1)

        # Find all PDF files
        pdf_files = sorted(
            entry for entry in self.input_dir.iterdir()
            if entry.is_file() and entry.suffix.lower() == '.pdf'
        )

        if not pdf_files:
            logger.warning(f"No PDF files found in {self.input_dir}")
            return

        self.stats['total'] = len(pdf_files)

        if not self.quiet:
            logger.info(f"Found {len(pdf_files)} PDF file(s) in {self.input_dir}")
            if dry_run:
                logger.info("DRY RUN: No files will be written")

        # Convert each PDF
        for pdf_path in pdf_files:
            if dry_run:
                logger.info(f"[DRY RUN] Would convert: {pdf_path.name}")
                self.stats['successful'] += 1
            else:
                self.convert_pdf(pdf_path)

        # Print summary
        self.print_summary()

    def print_summary(self) -> None:
        """Print the conversion summary and log every recorded error.

        The summary table goes to stdout and is skipped in quiet mode; the
        error list is logged at ERROR level regardless.
        """
        if not self.quiet:
            summary = f"""
{'='*60}
CONVERSION SUMMARY
{'='*60}
Total PDFs: {self.stats['total']}
Successful: {self.stats['successful']}
Failed: {self.stats['failed']}
Output directory: {self.output_dir}
{'='*60}
"""
            print(summary)

        if self.stats['errors']:
            logger.error("Errors encountered:")
            for error in self.stats['errors']:
                logger.error(f"  - {error['file']}: {error['error']}")
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, converts every PDF in the input folder and exits
    with a status that reflects the outcome.

    Positional arguments are the input folder (default ``./artikel``) and the
    output folder (default ``<input_dir>/converted``). The options are
    ``-v/--verbose``, ``-q/--quiet`` and ``--dry-run``.

    Exit status: 0 when no file failed; 1 when at least one PDF could not be
    converted, the run was interrupted with Ctrl-C, or an unexpected error
    occurred.
    """
    parser = argparse.ArgumentParser(
        description='Convert PDF files to Markdown format.',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python pdf_to_markdown.py # Uses default folders
  python pdf_to_markdown.py ./artikel # Custom input folder
  python pdf_to_markdown.py ./artikel ./output # Custom input and output
  python pdf_to_markdown.py -v ./artikel # Verbose mode
  python pdf_to_markdown.py --dry-run ./input # Preview without writing
"""
    )

    parser.add_argument(
        'input_dir',
        nargs='?',
        default='./artikel',
        help='Input folder containing PDFs (default: ./artikel)'
    )

    parser.add_argument(
        'output_dir',
        nargs='?',
        default=None,
        help='Output folder for Markdown files (default: input_dir/converted)'
    )

    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose logging'
    )

    parser.add_argument(
        '-q', '--quiet',
        action='store_true',
        help='Suppress all output except errors'
    )

    parser.add_argument(
        '--dry-run',
        action='store_true',
        help='Test run without writing files'
    )

    args = parser.parse_args()

    # Set default output directory if not provided
    if args.output_dir is None:
        args.output_dir = str(Path(args.input_dir) / 'converted')

    # Create converter and run
    converter = PDFToMarkdownConverter(
        input_dir=args.input_dir,
        output_dir=args.output_dir,
        verbose=args.verbose,
        quiet=args.quiet
    )

    try:
        converter.convert_folder(dry_run=args.dry_run)
    except KeyboardInterrupt:
        logger.info("\nConversion interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Top-level boundary: report and turn any unexpected error into exit 1.
        logger.error(f"Fatal error: {e}")
        sys.exit(1)

    # Per-file failures are recorded in the stats rather than raised, so the
    # exit status has to be derived from them for scripts and task runners.
    if converter.stats['failed']:
        sys.exit(1)


if __name__ == '__main__':
    main()
|