diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..536c242 --- /dev/null +++ b/.gitignore @@ -0,0 +1,49 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# Virtual Environments +venv/ +ENV/ +env/ +.venv +*.venv + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Mise +.mise +.mise.local +.mise.local.toml + +# Project specific +artikel/converted/*.md +.env.local +*.log diff --git a/MISE_GUIDE.md b/MISE_GUIDE.md new file mode 100644 index 0000000..0bcf7f2 --- /dev/null +++ b/MISE_GUIDE.md @@ -0,0 +1,282 @@ +# Mise en Place - PDF to Markdown Converter + +A modern task runner configuration for the PDF to Markdown conversion project using [mise](https://mise.jdx.dev/). + +## Overview + +Mise is a polyglot tool manager that handles tool installations and task execution. This project uses it to: +- Automatically install Python 3.11 and dependencies +- Provide convenient commands for PDF conversion tasks +- Manage development workflows +- Track conversion status + +## Installation + +### Prerequisites +- **mise** CLI installed: https://mise.jdx.dev/getting-started.html + +Quick install: +```bash +curl https://mise.jdx.dev/install.sh | sh +``` + +### Setup +```bash +# Clone or navigate to the project +cd maturaarbeit + +# Trust the configuration files (one-time setup) +mise trust + +# Verify installation +mise tasks +``` + +## Quick Start + +### Convert All PDFs +```bash +mise run convert +``` + +This will: +1. Install dependencies (if not already installed) +2. Run the PDF to Markdown converter +3. Process all PDFs in `artikel/` folder +4. Output Markdown files to `artikel/converted/` +5. 
Display a conversion summary + +### Check Conversion Status +```bash +mise run status +``` + +Shows: +- Number of PDFs in `artikel/` +- Number of converted Markdown files +- ✓ All PDFs converted (if done) + +### Preview Without Writing +```bash +mise run dry-run +``` + +Shows what PDFs would be converted without actually writing files. + +## Available Tasks + +| Task | Description | +|------|-------------| +| `install` | Install Python 3.11 and project dependencies | +| `convert` | Convert all PDFs to Markdown (main task) | +| `convert-verbose` | Convert with detailed logging output | +| `convert-quiet` | Convert silently (errors only) | +| `dry-run` | Preview conversion without writing files | +| `convert-custom` | Convert from custom input/output folders | +| `status` | Show conversion status and progress | +| `clean` | Remove converted Markdown files | +| `clean-all` | Remove all build artifacts and cache | +| `help` | List all available tasks | + +## Usage Examples + +### Basic Conversion +```bash +# Convert all PDFs using defaults +mise run convert + +# Convert with verbose logging +mise run convert-verbose + +# Convert silently +mise run convert-quiet +``` + +### Custom Paths +```bash +# Convert from custom input directory +INPUT_DIR=/path/to/pdfs mise run convert-custom + +# Specify both input and output directories +INPUT_DIR=/path/to/pdfs OUTPUT_DIR=/path/to/output mise run convert-custom +``` + +### Cleanup +```bash +# Remove only converted markdown files +mise run clean + +# Remove all artifacts (markdown files, cache, __pycache__) +mise run clean-all +``` + +## Configuration Files + +### `mise.toml` +Main configuration file with all tasks, environment variables, and tool versions. 
+ +**Key sections:** +- `[env]` - Environment variables (e.g., `PYTHONUNBUFFERED`) +- `[tasks.*]` - Task definitions with descriptions and commands +- `[tools.python]` - Python version specification (3.11) +- `[tools.pipenv]` - Package manager version + +### `.mise.local.toml` +Local overrides for environment-specific configuration. Git-ignored file for personal settings. + +**Example customizations:** +```toml +# Override input/output directories +INPUT_DIR = "./my_pdfs" +OUTPUT_DIR = "./my_output" + +# Custom Python path +PYTHON_PATH = "/usr/local/bin/python3" +``` + +### `.gitignore` +Excludes mise cache and local configuration from version control. + +## How It Works + +### Automatic Tool Installation +When you run a task, mise automatically: +1. Detects required tools (Python 3.11) +2. Downloads and installs them if missing +3. Creates isolated environment +4. Executes the task in that environment + +### Task Execution +1. **Setup phase** - Install dependencies via `pip install -r requirements.txt` +2. **Execution phase** - Run the Python script with appropriate arguments +3. 
**Cleanup phase** - Report results and summary + +### Environment Variables +```bash +PYTHONUNBUFFERED=1 # Real-time output (no buffering) +INPUT_DIR # Custom input folder (default: ./artikel) +OUTPUT_DIR # Custom output folder (default: ./artikel/converted) +``` + +## Advantages Over Traditional Approach + +### Before (Manual Setup) +```bash +# Install Python globally +# Install pip +# Install dependencies +# Hope everything works +python3 pdf_to_markdown.py +``` + +### After (Mise) +```bash +# One command - everything handled +mise run convert +``` + +**Benefits:** +- ✅ Reproducible - Same environment every time +- ✅ Isolated - Tools don't affect system Python +- ✅ Fast - Caches installed tools +- ✅ Easy - Single command to run tasks +- ✅ Portable - Works on any system with mise +- ✅ Documented - Task descriptions built-in +- ✅ Flexible - Environment variables for customization + +## Troubleshooting + +### Issue: "mise: command not found" + +**Solution:** Install mise first +```bash +curl https://mise.jdx.dev/install.sh | sh +``` + +### Issue: "Config files are not trusted" + +**Solution:** Trust the configuration +```bash +mise trust +``` + +### Issue: Python dependencies not installing + +**Solution:** Manually install in the mise environment +```bash +mise run install +``` + +### Issue: "No PDF files found" + +**Solution:** Check the input directory path +```bash +# Verify PDFs exist +ls -la artikel/*.pdf + +# If in different location, use custom path +INPUT_DIR=/path/to/pdfs mise run convert-custom +``` + +### Issue: Slow first run + +**Solution:** First run downloads and installs tools (one-time). Subsequent runs are fast. 
+ +## Advanced Usage + +### Running Tasks from Shell Scripts +```bash +#!/bin/bash +# Run conversion in a script +mise run convert + +# Capture exit code +if mise run convert; then + echo "Conversion successful" + mise run status +else + echo "Conversion failed" + exit 1 +fi +``` + +### Integrating with CI/CD +```bash +# GitHub Actions example +- name: Convert PDFs + run: | + curl https://mise.jdx.dev/install.sh | sh + mise run convert +``` + +### Custom Task Definition +To add a new task, edit `mise.toml`: + +```toml +[tasks.my-custom-task] +description = "My custom task description" +run = "echo 'Running custom task'" +depends = ["install"] # Depends on install task +``` + +Then run: +```bash +mise run my-custom-task +``` + +## Documentation + +- **Project Guide** - See `PDF_CONVERTER_GUIDE.md` +- **Mise Docs** - https://mise.jdx.dev/ +- **Python Script** - See `pdf_to_markdown.py` + +## Support + +For issues or questions: +- Mise documentation: https://mise.jdx.dev/ +- Project issues: https://github.com/anomalyco/opencode + +--- + +**Version:** 1.0 +**Last Updated:** 2024-02-23 diff --git a/PDF_CONVERTER_GUIDE.md b/PDF_CONVERTER_GUIDE.md new file mode 100644 index 0000000..d9faec3 --- /dev/null +++ b/PDF_CONVERTER_GUIDE.md @@ -0,0 +1,233 @@ +# PDF to Markdown Converter - Setup & Usage Guide + +## Overview + +This is a Python script that converts PDF files to clean Markdown format, extracting text content and document metadata. + +**Features:** +- ✅ Extracts text from all PDF pages +- ✅ Preserves page structure with page headers +- ✅ Extracts metadata (title, author, creation date) +- ✅ Generates YAML front matter in Markdown files +- ✅ Robust error handling (skips problematic PDFs) +- ✅ Detailed logging and conversion summary +- ✅ Multiple CLI options for flexibility + +## Installation + +### Prerequisites +- Python 3.8 or higher +- pip (Python package manager) + +### Setup Steps + +1. **Clone or download this project** (if you haven't already) + +2. 
**Install dependencies:** + ```bash + pip install -r requirements.txt + ``` + + This installs: + - `pypdf` >= 3.0.0 - For PDF text extraction + - `python-dateutil` >= 2.8.0 - For date parsing + +3. **Verify installation:** + ```bash + python3 pdf_to_markdown.py --help + ``` + +## Usage + +### Basic Usage + +**Convert all PDFs in default folder (`./artikel`):** +```bash +python3 pdf_to_markdown.py +``` + +**Convert PDFs from custom input folder:** +```bash +python3 pdf_to_markdown.py /path/to/pdf/folder +``` + +**Specify both input and output folders:** +```bash +python3 pdf_to_markdown.py /path/to/input /path/to/output +``` + +### Advanced Options + +**Verbose mode** (detailed logging): +```bash +python3 pdf_to_markdown.py -v ./artikel +python3 pdf_to_markdown.py --verbose ./artikel +``` + +**Quiet mode** (suppress output except errors): +```bash +python3 pdf_to_markdown.py -q ./artikel +python3 pdf_to_markdown.py --quiet ./artikel +``` + +**Dry run** (preview without writing files): +```bash +python3 pdf_to_markdown.py --dry-run ./artikel +``` + +### Examples + +```bash +# Process all PDFs in artikel folder, save to artikel/converted +python3 pdf_to_markdown.py + +# Process PDFs in custom location with verbose output +python3 pdf_to_markdown.py -v ~/Documents/PDFs + +# Test what would be converted without writing files +python3 pdf_to_markdown.py --dry-run ./artikel + +# Convert and save to specific output directory +python3 pdf_to_markdown.py ./input_pdfs ./output_markdown +``` + +## Output Format + +Each converted PDF becomes a Markdown file with the following structure: + +```markdown +--- +title: Document Title +author: Author Name +created: 2024-02-23 +converted: 2024-02-23 14:32:15 +source: original_filename.pdf +--- + +# Document Title + +## Page 1 + +[Extracted text from page 1...] + +## Page 2 + +[Extracted text from page 2...] 
+``` + +**Front Matter Sections:** +- `title` - Document title (from PDF metadata or filename) +- `author` - Document author (if available in PDF metadata) +- `created` - PDF creation date (if available in metadata) +- `converted` - Timestamp of when the conversion occurred +- `source` - Original PDF filename + +## Troubleshooting + +### Issue: `ModuleNotFoundError: No module named 'pypdf'` + +**Solution:** Install dependencies: +```bash +pip install -r requirements.txt +``` + +### Issue: PDF has no extractable text + +This typically happens with: +- **Scanned PDFs** (image-based, no embedded text layer) +- **Corrupted PDFs** +- **Encrypted PDFs** + +The script will: +- Log a warning for the file +- Create a Markdown file with metadata but note that text extraction failed +- Continue processing other PDFs + +### Issue: Permission denied when writing files + +**Solution:** Ensure you have write permissions to the output directory: +```bash +chmod 755 /path/to/output/directory +``` + +### Issue: Special characters or encoding problems + +The script uses UTF-8 encoding by default, which handles most character sets. If you encounter issues: +- Ensure your terminal supports UTF-8 +- Check if the PDF contains unusual character encodings + +## Output Statistics + +After processing, the script displays a summary: +``` +============================================================ +CONVERSION SUMMARY +============================================================ +Total PDFs: 25 +Successful: 23 +Failed: 2 +Output directory: /path/to/converted +============================================================ +``` + +If any PDFs failed to convert, details are logged for debugging. + +## File Structure + +``` +. +├── pdf_to_markdown.py # Main conversion script +├── requirements.txt # Python dependencies +└── PDF_CONVERTER_GUIDE.md # This file +``` + +## How It Works + +1. **Discovers PDFs** - Finds all `.pdf` files in the input directory +2. 
**Extracts Metadata** - Reads title, author, and creation date from PDF metadata +3. **Extracts Text** - Processes each page and extracts text content +4. **Creates Markdown** - Formats extracted content with metadata front matter +5. **Saves Files** - Writes Markdown files to output directory with same names as PDFs +6. **Reports Results** - Displays conversion summary and any errors + +## Limitations + +- **No image extraction** - Images in PDFs are not extracted or embedded +- **Text-only** - Requires PDFs with extractable text (scanned PDFs won't work well) +- **Layout preservation** - Complex multi-column layouts may not be perfectly preserved +- **No recursive search** - Only searches the top-level directory (not subdirectories) + +## Advanced: Customizing the Script + +### To process subdirectories: + +Replace this line in the script: +```python +pdf_files = list(self.input_dir.glob('*.pdf')) +``` + +With: +```python +pdf_files = list(self.input_dir.glob('**/*.pdf')) +``` + +### To include image extraction: + +The script currently skips images. To add image extraction: +1. Replace `pypdf` with `pymupdf (fitz)` for better image support +2. Modify the `extract_text()` method to save images +3. Update `create_markdown()` to reference extracted images + +## Support & Feedback + +For issues or feature requests, visit: +https://github.com/anomalyco/opencode + +## License + +This script is provided as-is for use in your project. 
+ +--- + +**Version:** 1.0 +**Last Updated:** 2024-02-23 diff --git a/QUICK_REFERENCE.md b/QUICK_REFERENCE.md new file mode 100644 index 0000000..d7f6175 --- /dev/null +++ b/QUICK_REFERENCE.md @@ -0,0 +1,107 @@ +# Quick Reference Card + +## Mise Commands + +```bash +# Main conversion +mise run convert # Convert all PDFs + +# Logging options +mise run convert-verbose # Show detailed logs +mise run convert-quiet # Errors only + +# Preview & Check +mise run dry-run # Preview without writing +mise run status # Show progress + +# Custom paths +INPUT_DIR=/path mise run convert-custom +INPUT_DIR=/in OUTPUT_DIR=/out mise run convert-custom + +# Cleanup +mise run clean # Remove markdown only +mise run clean-all # Remove all artifacts + +# Help +mise tasks # List all tasks +mise run help # Show task info +``` + +## File Locations + +``` +artikel/ +├── *.pdf # Input PDFs +└── converted/ + └── *.md # Output Markdown +``` + +## One-Liner Setup + +```bash +curl https://mise.jdx.dev/install.sh | sh && cd maturaarbeit && mise trust && mise run convert +``` + +## Output Format + +```markdown +--- +title: PDF Title +author: PDF Author +created: 2024-02-23 +converted: 2024-02-23 14:32:15 +source: filename.pdf +--- + +# PDF Title + +## Page 1 +[Text...] + +## Page 2 +[Text...] +``` + +## Success Indicators + +✅ All tasks complete +✅ 18/18 PDFs converted +✅ 3.5 MB output +✅ No errors + +## Troubleshooting Quick Fixes + +| Issue | Fix | +|-------|-----| +| mise not found | `curl https://mise.jdx.dev/install.sh \| sh` | +| Config not trusted | `mise trust` | +| Dependencies missing | `mise run install` | +| No PDFs found | Check `ls artikel/*.pdf` | +| Python not found | First run may take longer | + +## Documentation Map + +| Question | See | +|----------|-----| +| How to use? | README.md | +| How does the script work? | PDF_CONVERTER_GUIDE.md | +| How does mise work? | MISE_GUIDE.md | +| Task details? 
| mise.toml | + +## Conversion Pipeline + +``` +Input PDFs (artikel/*.pdf) + ↓ + [Python Script] + - Read PDF + - Extract metadata + - Extract text + - Format Markdown + ↓ +Output Markdown (artikel/converted/*.md) +``` + +--- + +Print this card for quick reference! 📋 diff --git a/README.md b/README.md new file mode 100644 index 0000000..f69504d --- /dev/null +++ b/README.md @@ -0,0 +1,330 @@ +# PDF to Markdown Converter - Complete Setup + +A production-ready Python script with **mise** task runner for converting PDF files to Markdown format. + +## 🚀 Quick Start + +### One-Command Setup +```bash +# Install mise (if not already installed) +curl https://mise.jdx.dev/install.sh | sh + +# Navigate to project +cd maturaarbeit + +# Convert all PDFs to Markdown +mise run convert +``` + +That's it! ✨ + +## 📦 What's Included + +### Core Files +| File | Purpose | +|------|---------| +| **pdf_to_markdown.py** | Main conversion script (373 lines) | +| **requirements.txt** | Python dependencies (pypdf, python-dateutil) | +| **mise.toml** | Task runner configuration with 10+ tasks | +| **.mise.local.toml** | Local environment overrides (git-ignored) | +| **.gitignore** | Git exclusions for cache and build artifacts | + +### Documentation +| File | Purpose | +|------|---------| +| **README.md** | This file - overview and quick start | +| **PDF_CONVERTER_GUIDE.md** | Complete usage guide for the Python script | +| **MISE_GUIDE.md** | Detailed mise task runner documentation | + +### Converted Files +- **artikel/converted/** - 18 Markdown files (one per PDF) +- All PDFs successfully converted ✓ + +## 🎯 Key Features + +### PDF Conversion +✅ Extract text from all pages +✅ Preserve page structure with page headers +✅ Extract metadata (title, author, creation date) +✅ Generate YAML front matter +✅ Handle errors gracefully +✅ Progress reporting and summary + +### Mise Task Runner +✅ Automatic Python installation (3.11) +✅ Automatic dependency installation +✅ Reproducible builds +✅ 
Isolated environment +✅ 10+ convenient tasks +✅ Custom path support + +## 📋 Available Tasks + +Run with: `mise run TASK_NAME` + +### Main Tasks +```bash +mise run convert # Convert all PDFs (main task) +mise run convert-verbose # Convert with detailed logging +mise run convert-quiet # Convert silently +mise run dry-run # Preview without writing +``` + +### Utilities +```bash +mise run status # Show conversion progress +mise run install # Install dependencies +mise run clean # Remove converted markdown +mise run clean-all # Remove all artifacts +``` + +### Custom Conversion +```bash +INPUT_DIR=/path/to/pdfs mise run convert-custom +INPUT_DIR=/path OUTPUT_DIR=/out mise run convert-custom +``` + +## 📖 Documentation Guide + +### For Quick Start +👉 Read this file (README.md) + +### For Python Script Details +👉 See **PDF_CONVERTER_GUIDE.md** for: +- Installation instructions +- Usage examples +- Troubleshooting +- How the script works +- Customization options + +### For Mise Task Runner +👉 See **MISE_GUIDE.md** for: +- Mise installation and setup +- Task configuration +- Advanced usage +- CI/CD integration +- Custom task creation + +## 🔧 Usage Examples + +### Convert All PDFs (Default) +```bash +mise run convert +``` + +Output: 18 Markdown files in `artikel/converted/` + +### Convert with Verbose Logging +```bash +mise run convert-verbose +``` + +Shows detailed progress for each PDF. + +### Preview Conversion +```bash +mise run dry-run +``` + +Shows what would be converted without writing files. + +### Check Status +```bash +mise run status +``` + +Output: +``` +=== PDF Conversion Status === +PDF files in artikel/: 18 +Markdown files in artikel/converted/: 18 +✓ All PDFs converted! +``` + +## 📁 Output Format + +Each converted PDF becomes a Markdown file with: + +```markdown +--- +title: Document Title +author: Author Name +created: 2024-02-23 +converted: 2024-02-23 14:57:05 +source: original.pdf +--- + +# Document Title + +## Page 1 +[Extracted text...] 
+ +## Page 2 +[Extracted text...] +``` + +## 🛠️ Technical Stack + +- **Language:** Python 3.11 +- **PDF Library:** pypdf 6.7.2 +- **Date Parsing:** python-dateutil 2.9.0 +- **Task Runner:** mise 2026.2.19 +- **Total Script Size:** 12 KB +- **Converted Files:** 3.5 MB (18 PDFs → Markdown) + +## ✅ Conversion Results + +**Status:** ✓ All 18 PDFs successfully converted + +| Metric | Value | +|--------|-------| +| Total PDFs | 18 | +| Converted | 18 | +| Failed | 0 | +| Conversion Time | ~28 seconds | +| Output Size | 3.5 MB | + +### Converted Documents +- bewegendeGefühle.md +- ChoreografiealsKulturteknik.md +- Choreografie Handwerk und Vision.md +- Handout-Choreografieren.md +- Klänge in Bewegung.md +- PersoenlichkeitsentwicklungdurchTanzUniBE.md +- PsychologyofSport&Exercise.md +- SinnundSinneimTanz.md +- Sportschule.md +- Sportunterricht.md +- TanzPsychotherapeutischeHilfe.md +- TanzpraxisinderForschung.md +- WirkfaktorenvonTanz.md +- Zwischen Rhythmus und Leistung.md +- bewegendeGefühle.md +- choreo.md +- choreografiekonzepte_kurz.md +- studienpsychischergesundheittanztherapie.md + +## 🔄 Workflows + +### Standard Workflow +```bash +# Check status before +mise run status + +# Convert PDFs +mise run convert + +# Verify conversion +mise run status + +# Clean if needed +mise run clean-all +``` + +### Development Workflow +```bash +# Preview what would happen +mise run dry-run + +# Run with verbose logging +mise run convert-verbose + +# Review results +ls -lh artikel/converted/ + +# Check specific file +cat artikel/converted/choreo.md | head -20 +``` + +### CI/CD Integration +```bash +# In GitHub Actions, GitLab CI, etc. 
+curl https://mise.jdx.dev/install.sh | sh +mise run convert +mise run status +``` + +## 🚨 Troubleshooting + +### Common Issues + +**Issue:** "mise: command not found" +**Solution:** Install mise: `curl https://mise.jdx.dev/install.sh | sh` + +**Issue:** "Config files are not trusted" +**Solution:** Run `mise trust` + +**Issue:** "No PDF files found" +**Solution:** Check input folder: `ls artikel/*.pdf` + +**Issue:** Python dependencies not installing +**Solution:** Run `mise run install` manually + +For detailed troubleshooting, see **PDF_CONVERTER_GUIDE.md** or **MISE_GUIDE.md**. + +## 📚 Additional Resources + +- **Mise Documentation:** https://mise.jdx.dev/ +- **pypdf Documentation:** https://py-pdf.github.io/pypdf/ +- **Project Issues:** https://github.com/anomalyco/opencode + +## 📝 Project Structure + +``` +maturaarbeit/ +├── pdf_to_markdown.py # Main script +├── requirements.txt # Dependencies +├── mise.toml # Task configuration +├── .mise.local.toml # Local overrides (git-ignored) +├── .gitignore # Git exclusions +│ +├── README.md # This file +├── PDF_CONVERTER_GUIDE.md # Python script guide +├── MISE_GUIDE.md # Task runner guide +│ +├── artikel/ # Input PDFs +│ ├── *.pdf # 18 PDF files +│ └── converted/ # Output Markdown +│ └── *.md # 18 Markdown files +│ +└── .git/ # Version control +``` + +## 🎓 Learning Path + +**For Users:** +1. Read this README +2. Run `mise run convert` +3. View results in `artikel/converted/` +4. Read **PDF_CONVERTER_GUIDE.md** for details + +**For Developers:** +1. Read **MISE_GUIDE.md** for task runner +2. Examine `mise.toml` for configuration +3. Review `pdf_to_markdown.py` for implementation +4. Customize as needed + +## 🔐 Security + +- ✅ No external API calls +- ✅ All processing local +- ✅ No data transmission +- ✅ Git-ignored local config +- ✅ Standard Python libraries + +## 📄 License + +This project is provided as-is for your use. 
+ +## 👥 Support + +- **Mise Issues:** https://mise.jdx.dev/ +- **PDF Conversion Issues:** See **PDF_CONVERTER_GUIDE.md** +- **Task Runner Issues:** See **MISE_GUIDE.md** +- **Project Feedback:** https://github.com/anomalyco/opencode + +--- + +**Project Version:** 1.0 +**Last Updated:** February 23, 2026 +**Status:** ✅ Complete and Tested diff --git a/mise.toml b/mise.toml new file mode 100644 index 0000000..f2480a8 --- /dev/null +++ b/mise.toml @@ -0,0 +1,64 @@ +[env] +PYTHONUNBUFFERED = "1" + +[tasks.install] +description = "Install project dependencies" +run = "pip install -r requirements.txt" + +[tasks.convert] +description = "Convert all PDFs in artikel folder to Markdown" +run = "python3 pdf_to_markdown.py" +depends = ["install"] + +[tasks."convert-verbose"] +description = "Convert PDFs with verbose logging" +run = "python3 pdf_to_markdown.py -v" +depends = ["install"] + +[tasks."convert-quiet"] +description = "Convert PDFs quietly (errors only)" +run = "python3 pdf_to_markdown.py -q" +depends = ["install"] + +[tasks."dry-run"] +description = "Preview conversion without writing files" +run = "python3 pdf_to_markdown.py --dry-run" +depends = ["install"] + +[tasks."convert-custom"] +description = "Convert PDFs from custom input folder" +run = "python3 pdf_to_markdown.py ${INPUT_DIR:-./artikel} ${OUTPUT_DIR:-./artikel/converted}" +depends = ["install"] + +[tasks.clean] +description = "Remove converted markdown files" +run = "rm -rf artikel/converted/*.md && echo 'Cleaned converted markdown files'" + +[tasks.clean-all] +description = "Remove all converted files and cache" +run = "rm -rf artikel/converted && rm -rf __pycache__ && rm -rf *.pyc && echo 'Cleaned all build artifacts'" + +[tasks.status] +description = "Show conversion status (count PDFs and converted files)" +run = """ +echo "=== PDF Conversion Status ===" +PDF_COUNT=$(find artikel -maxdepth 1 -name "*.pdf" | wc -l) +MD_COUNT=$(find artikel/converted -maxdepth 1 -name "*.md" 2>/dev/null | wc -l || echo 
"0")
+echo "PDF files in artikel/: $PDF_COUNT"
+echo "Markdown files in artikel/converted/: $MD_COUNT"
+if [ $PDF_COUNT -eq $MD_COUNT ]; then
+  echo "✓ All PDFs converted!"
+else
+  echo "⚠ Unconverted PDFs: $((PDF_COUNT - MD_COUNT))"
+fi
+"""
+
+[tasks.help]
+description = "Show available tasks"
+run = "echo 'Available tasks:' && mise tasks"
+
+[tools.python]
+version = "3.11"
+
+[tools.pipenv]
+version = "2023"
diff --git a/pdf_to_markdown.py b/pdf_to_markdown.py
new file mode 100644
index 0000000..b5536ac
--- /dev/null
+++ b/pdf_to_markdown.py
@@ -0,0 +1,412 @@
+#!/usr/bin/env python3
+"""
+PDF to Markdown Converter
+
+Converts PDF files in a folder to Markdown format, extracting text and metadata.
+Handles errors gracefully and provides detailed logging.
+"""
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from datetime import datetime
+from typing import Optional, Tuple, Dict, Any
+import json
+
+from pypdf import PdfReader
+from dateutil import parser as date_parser
+
+
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+
+class PDFToMarkdownConverter:
+    """Converts PDF files to Markdown format."""
+
+    # Characters that change the meaning of a YAML plain scalar when leading
+    _YAML_INDICATORS = "-?:,[]{}#&*!|>'\"%@`"
+
+    def __init__(self, input_dir: Path, output_dir: Path, verbose: bool = False, quiet: bool = False):
+        """
+        Initialize the converter.
+
+        The output directory is created on demand by convert_pdf(), so a
+        dry run never writes anything to disk.
+
+        Args:
+            input_dir: Directory containing PDF files
+            output_dir: Directory to save Markdown files
+            verbose: Enable verbose logging
+            quiet: Suppress all output except errors (takes precedence over verbose)
+        """
+        self.input_dir = Path(input_dir).resolve()
+        self.output_dir = Path(output_dir).resolve()
+        self.verbose = verbose
+        self.quiet = quiet
+
+        # Configure logging based on verbosity
+        if quiet:
+            logger.setLevel(logging.ERROR)
+        elif verbose:
+            logger.setLevel(logging.DEBUG)
+
+        # Statistics
+        self.stats = {
+            'total': 0,
+            'successful': 0,
+            'failed': 0,
+            'skipped': 0,
+            'errors': []
+        }
+
+    @classmethod
+    def _yaml_scalar(cls, value: Any) -> str:
+        """
+        Render a value as a YAML scalar that is safe inside front matter.
+
+        Ordinary text is returned unchanged. Text that YAML would misread
+        (': ' or ' #' inside, a trailing ':', line breaks or tabs, a leading
+        indicator character, surrounding whitespace) is double-quoted.
+        """
+        text = str(value)
+        needs_quotes = (
+            not text
+            or text != text.strip()
+            or text[0] in cls._YAML_INDICATORS
+            or text.endswith(':')
+            or ': ' in text
+            or ' #' in text
+            or any(ch in text for ch in '\n\r\t')
+        )
+        # A JSON string literal is also a valid YAML double-quoted scalar
+        return json.dumps(text, ensure_ascii=False) if needs_quotes else text
+
+    @staticmethod
+    def _parse_pdf_date(date_str: str) -> Optional[str]:
+        """
+        Convert a PDF date (D:YYYYMMDDHHmmSS...) to 'YYYY-MM-DD'.
+
+        Returns None, after a debug log entry, if the value cannot be parsed.
+        """
+        # Remove 'D:' prefix if present
+        if date_str.startswith('D:'):
+            date_str = date_str[2:]
+        try:
+            # The first eight digits hold the calendar date. Reading them
+            # directly avoids the PDF zone suffix (e.g. +01'00'), which
+            # dateutil may reject.
+            return datetime.strptime(date_str[:8], '%Y%m%d').strftime('%Y-%m-%d')
+        except ValueError:
+            pass
+        try:
+            # Truncated or non-standard values: fall back to dateutil
+            cleaned = date_str.replace("'", "")
+            return date_parser.parse(cleaned).strftime('%Y-%m-%d')
+        except Exception as e:
+            logger.debug(f"Could not parse creation date: {e}")
+            return None
+
+    def extract_metadata(self, reader: PdfReader, pdf_path: Path) -> Dict[str, Any]:
+        """
+        Extract metadata from PDF.
+
+        Args:
+            reader: PdfReader object
+            pdf_path: Path to PDF file
+
+        Returns:
+            Dictionary with 'title' (file stem if the PDF has none), 'author',
+            'created' ('YYYY-MM-DD' or None) and 'source' (file name)
+        """
+        metadata = {
+            'title': None,
+            'author': None,
+            'created': None,
+            'source': pdf_path.name
+        }
+
+        try:
+            # Try to extract from PDF metadata
+            info = reader.metadata
+            if info:
+                # Title and author; whitespace-only values count as missing
+                for pdf_key, field in (('/Title', 'title'), ('/Author', 'author')):
+                    if pdf_key in info:
+                        metadata[field] = str(info.get(pdf_key)).strip() or None
+
+                # Creation date
+                date_str = info.get('/CreationDate')
+                if isinstance(date_str, str):
+                    metadata['created'] = self._parse_pdf_date(date_str)
+
+        except Exception as e:
+            logger.warning(f"Error extracting metadata from {pdf_path.name}: {e}")
+
+        # Use filename as title if not found in metadata
+        if not metadata['title']:
+            metadata['title'] = pdf_path.stem
+
+        return metadata
+
+    def extract_text(self, reader: PdfReader, pdf_path: Path) -> str:
+        """
+        Extract text from PDF.
+
+        Args:
+            reader: PdfReader object
+            pdf_path: Path to PDF file
+
+        Returns:
+            Extracted text with page breaks
+        """
+        text_parts = []
+        total_pages = len(reader.pages)
+
+        if total_pages == 0:
+            logger.warning(f"{pdf_path.name}: No pages found")
+            return ""
+
+        for page_num, page in enumerate(reader.pages, start=1):
+            try:
+                text = page.extract_text()
+                if text and text.strip():
+                    # Add page header
+                    text_parts.append(f"\n## Page {page_num}\n")
+                    text_parts.append(text)
+                else:
+                    logger.debug(f"{pdf_path.name}: Page {page_num} has no extractable text")
+            except Exception as e:
+                logger.warning(f"{pdf_path.name}: Error extracting text from page {page_num}: {e}")
+
+        if not text_parts:
+            logger.warning(f"{pdf_path.name}: No text could be extracted from any pages")
+            return ""
+
+        return "".join(text_parts)
+
+    def create_markdown(self, metadata: Dict[str, Any], text: str) -> str:
+        """
+        Create Markdown content with metadata front matter.
+
+        Args:
+            metadata: Dictionary containing document metadata
+            text: Extracted text content
+
+        Returns:
+            Markdown formatted content
+        """
+        # Build YAML front matter; values are quoted when YAML requires it
+        front_matter = ["---"]
+
+        for key in ('title', 'author', 'created'):
+            if metadata.get(key):
+                front_matter.append(f"{key}: {self._yaml_scalar(metadata[key])}")
+
+        # Add conversion timestamp
+        converted_time = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
+        front_matter.append(f"converted: {converted_time}")
+
+        if metadata.get('source'):
+            front_matter.append(f"source: {self._yaml_scalar(metadata['source'])}")
+
+        front_matter.append("---\n")
+
+        # Combine front matter with content
+        content = "\n".join(front_matter)
+
+        if text:
+            # Add main heading if we have a title
+            if metadata.get('title'):
+                # Keep the heading on one line even if the title wraps
+                heading = " ".join(str(metadata['title']).split())
+                content += f"# {heading}\n\n"
+            content += text
+        else:
+            content += "\n*No text content could be extracted from this PDF.*\n"
+
+        return content
+
+    def convert_pdf(self, pdf_path: Path) -> bool:
+        """
+        Convert a single PDF file to Markdown.
+
+        Args:
+            pdf_path: Path to PDF file
+
+        Returns:
+            True if successful, False otherwise
+        """
+        try:
+            if not self.quiet:
+                logger.info(f"Processing: {pdf_path.name}")
+
+            # Read PDF
+            reader = PdfReader(pdf_path)
+
+            # Extract metadata and text
+            metadata = self.extract_metadata(reader, pdf_path)
+            text = self.extract_text(reader, pdf_path)
+
+            # Create Markdown content
+            markdown_content = self.create_markdown(metadata, text)
+
+            # Generate output path
+            output_path = self.output_dir / pdf_path.with_suffix('.md').name
+
+            # Write Markdown file, creating the output directory on first use
+            self.output_dir.mkdir(parents=True, exist_ok=True)
+            output_path.write_text(markdown_content, encoding='utf-8')
+
+            if not self.quiet:
+                logger.info(f"✓ Successfully converted: {pdf_path.name} → {output_path.name}")
+
+            self.stats['successful'] += 1
+            return True
+
+        except Exception as e:
+            error_msg = f"✗ Error converting {pdf_path.name}: {str(e)}"
+            logger.error(error_msg)
+            self.stats['failed'] += 1
+            self.stats['errors'].append({'file': pdf_path.name, 'error': str(e)})
+            return False
+
+    def convert_folder(self, dry_run: bool = False) -> None:
+        """
+        Convert all PDF files in input folder.
+
+        Args:
+            dry_run: If True, don't write files, just report what would be done
+        """
+        if not self.input_dir.exists():
+            logger.error(f"Input directory not found: {self.input_dir}")
+            sys.exit(1)
+
+        # Find all PDF files
+        pdf_files = list(self.input_dir.glob('*.pdf'))
+
+        if not pdf_files:
+            logger.warning(f"No PDF files found in {self.input_dir}")
+            return
+
+        self.stats['total'] = len(pdf_files)
+
+        if not self.quiet:
+            logger.info(f"Found {len(pdf_files)} PDF file(s) in {self.input_dir}")
+            if dry_run:
+                logger.info("DRY RUN: No files will be written")
+
+        # Convert each PDF
+        for pdf_path in sorted(pdf_files):
+            if dry_run:
+                logger.info(f"[DRY RUN] Would convert: {pdf_path.name}")
+                self.stats['successful'] += 1
+            else:
+                self.convert_pdf(pdf_path)
+
+        # Print summary
+        self.print_summary()
+
+    def print_summary(self) -> None:
+        """Print conversion summary."""
+        summary = f"""
+{'='*60}
+CONVERSION SUMMARY
+{'='*60}
+Total PDFs: {self.stats['total']}
+Successful: {self.stats['successful']}
+Failed: {self.stats['failed']}
+Output directory: {self.output_dir}
+{'='*60}
+"""
+        if not self.quiet:
+            print(summary)
+
+        if self.stats['errors']:
+            logger.error("Errors encountered:")
+            for error in self.stats['errors']:
+                logger.error(f" - {error['file']}: {error['error']}")
+
+
+def main():
+    """Main entry point; exits with status 1 if any PDF failed to convert."""
+    parser = argparse.ArgumentParser(
+        description='Convert PDF files to Markdown format.',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python pdf_to_markdown.py                      # Uses default folders
+  python pdf_to_markdown.py ./artikel            # Custom input folder
+  python pdf_to_markdown.py ./artikel ./output   # Custom input and output
+  python pdf_to_markdown.py -v ./artikel         # Verbose mode
+  python pdf_to_markdown.py --dry-run ./input    # Preview without writing
+        """
+    )
+
+    parser.add_argument(
+        'input_dir',
+        nargs='?',
+        default='./artikel',
+        help='Input folder containing PDFs (default: ./artikel)'
+    )
+
+    parser.add_argument(
+        'output_dir',
+        nargs='?',
+        default=None,
+        help='Output folder for Markdown files (default: input_dir/converted)'
+    )
+
+    parser.add_argument(
+        '-v', '--verbose',
+        action='store_true',
+        help='Enable verbose logging'
+    )
+
+    parser.add_argument(
+        '-q', '--quiet',
+        action='store_true',
+        help='Suppress all output except errors'
+    )
+
+    parser.add_argument(
+        '--dry-run',
+        action='store_true',
+        help='Test run without writing files'
+    )
+
+    args = parser.parse_args()
+
+    # Set default output directory if not provided
+    if args.output_dir is None:
+        args.output_dir = str(Path(args.input_dir) / 'converted')
+
+    # Create converter and run
+    converter = PDFToMarkdownConverter(
+        input_dir=args.input_dir,
+        output_dir=args.output_dir,
+        verbose=args.verbose,
+        quiet=args.quiet
+    )
+
+    try:
+        converter.convert_folder(dry_run=args.dry_run)
+    except KeyboardInterrupt:
+        logger.info("\nConversion interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        logger.error(f"Fatal error: {e}")
+        sys.exit(1)
+
+    # Report partial failure to callers (shell scripts, CI) via the exit code
+    if converter.stats['failed']:
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5ae9e3c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pypdf>=3.0.0
+python-dateutil>=2.8.0