diff --git a/scripts/ex/.gitignore b/scripts/ex/.gitignore new file mode 100644 index 0000000..c9c7429 --- /dev/null +++ b/scripts/ex/.gitignore @@ -0,0 +1,56 @@ +# Python Virtual Environment +venv/ +env/ +.env + +# Python cache files +__pycache__/ +*.pyc +*.pyo +*.pyd +.Python + +# Generated/processed images +demo_* +cleaned_* +comparison_* +*_cleaned_* +*_comparison_* + +# Processing outputs +cleaned/ +output/ +results/ + +# Configuration files (may contain sensitive settings) +config.json +*.config.json +custom_*.json + +# Temporary files +*.tmp +*.temp +.DS_Store +Thumbs.db + +# IDE files +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# Logs +*.log +logs/ + +# Test outputs +test_* +sample_output/ + +# Large source images (uncomment if you don't want to track originals) +# *.jpg +# *.jpeg +# *.png +# *.tif +# *.tiff \ No newline at end of file diff --git a/scripts/ex/1771-09b-02.jpg b/scripts/ex/1771-09b-02.jpg new file mode 100644 index 0000000..fca1984 Binary files /dev/null and b/scripts/ex/1771-09b-02.jpg differ diff --git a/scripts/ex/1772-07b-02.jpg b/scripts/ex/1772-07b-02.jpg new file mode 100644 index 0000000..e921f7a Binary files /dev/null and b/scripts/ex/1772-07b-02.jpg differ diff --git a/scripts/ex/1772-34-136.jpg b/scripts/ex/1772-34-136.jpg new file mode 100644 index 0000000..16a47a7 Binary files /dev/null and b/scripts/ex/1772-34-136.jpg differ diff --git a/scripts/ex/README.md b/scripts/ex/README.md new file mode 100644 index 0000000..0cdabbe --- /dev/null +++ b/scripts/ex/README.md @@ -0,0 +1,211 @@ +# Historical Newspaper Image Cleaning Pipeline + +This pipeline automatically cleans and enhances scanned historical newspaper images by reducing noise, improving contrast, and sharpening text for better readability. 
+ +## Features + +- **Noise Reduction**: Bilateral filtering and non-local means denoising +- **Contrast Enhancement**: CLAHE and gamma correction +- **Background Cleaning**: Morphological operations to remove artifacts +- **Text Sharpening**: Unsharp masking for improved readability +- **Batch Processing**: Process entire directories efficiently +- **Interactive Tuning**: Find optimal parameters for your specific images +- **Before/After Comparisons**: Visual validation of improvements + +## Quick Start + +### 1. Install Dependencies + +```bash +pip install -r requirements.txt +``` + +### 2. Process Single Image + +```bash +python image_cleaner.py input_image.jpg -o cleaned_image.jpg --comparison +``` + +### 3. Batch Process Directory + +```bash +python batch_process.py -i newspaper_scans -o cleaned_images +``` + +### 4. Interactive Parameter Tuning + +```bash +python config_tuner.py sample_image.jpg +``` + +## Usage Examples + +### Basic Image Cleaning +```bash +# Clean single image with default settings +python image_cleaner.py 1771-09b-02.jpg + +# Clean with specific processing steps +python image_cleaner.py 1771-09b-02.jpg --steps denoise contrast sharpen + +# Create before/after comparison +python image_cleaner.py 1771-09b-02.jpg -c +``` + +### Batch Processing +```bash +# Process all JPG files in current directory +python batch_process.py + +# Process specific directory with custom output +python batch_process.py -i scans/ -o cleaned/ + +# Use custom configuration +python batch_process.py --config custom_config.json + +# Skip comparison images for faster processing +python batch_process.py --no-comparisons +``` + +### Parameter Tuning +```bash +# Start interactive tuning session +python config_tuner.py sample_image.jpg + +# Load existing config for fine-tuning +python config_tuner.py sample_image.jpg -c existing_config.json +``` + +## Configuration + +### Default Parameters + +The pipeline uses these default parameters optimized for newspaper scans: + 
+```json +{ + "bilateral_d": 9, + "bilateral_sigma_color": 75, + "bilateral_sigma_space": 75, + "clahe_clip_limit": 2.0, + "clahe_grid_size": [8, 8], + "gamma": 1.2, + "denoise_h": 10, + "morph_kernel_size": 2, + "unsharp_amount": 1.5, + "unsharp_radius": 1.0, + "unsharp_threshold": 0 +} +``` + +### Parameter Descriptions + +- **bilateral_d**: Neighborhood diameter for bilateral filtering (5-15) +- **bilateral_sigma_color**: Color space filter strength (50-150) +- **bilateral_sigma_space**: Coordinate space filter strength (50-150) +- **clahe_clip_limit**: Contrast limiting for CLAHE (1.0-4.0) +- **clahe_grid_size**: CLAHE tile grid size [width, height] (4-16) +- **gamma**: Gamma correction value (0.8-2.0) +- **denoise_h**: Denoising filter strength (5-20) +- **morph_kernel_size**: Morphological operation kernel size (1-5) +- **unsharp_amount**: Unsharp masking strength (0.5-3.0) +- **unsharp_radius**: Unsharp masking radius (0.5-2.0) +- **unsharp_threshold**: Unsharp masking threshold (0-10) + +### Creating Custom Configurations + +1. Generate default config template: +```bash +python batch_process.py --create-config +``` + +2. Edit `config.json` with your preferred values + +3. Use custom config: +```bash +python batch_process.py --config config.json +``` + +## Processing Pipeline + +The image cleaning pipeline applies these steps in sequence: + +1. **Noise Reduction** + - Bilateral filtering preserves edges while reducing noise + - Non-local means denoising removes repetitive patterns + +2. **Contrast Enhancement** + - CLAHE improves local contrast adaptively + - Gamma correction adjusts overall brightness + +3. **Background Cleaning** + - Morphological operations remove small artifacts + - Background normalization reduces paper texture + +4. 
**Sharpening** + - Unsharp masking enhances text edges + - Preserves fine details while reducing blur + +## Interactive Tuning Commands + +When using `config_tuner.py`, these commands are available: + +- `set <param> <value>` - Adjust parameter value +- `show` - Display current parameters +- `test [steps]` - Process with current settings +- `compare [filename]` - Save before/after comparison +- `save <filename>` - Save configuration to file +- `load <filename>` - Load configuration from file +- `presets` - Show preset configurations +- `help` - Show detailed help +- `quit` - Exit tuning session + +## Tips for Best Results + +### For Light Damage/Noise: +- Reduce `bilateral_d` to 5-7 +- Lower `denoise_h` to 5-8 +- Use `clahe_clip_limit` around 1.5 + +### For Heavy Damage/Artifacts: +- Increase `bilateral_d` to 12-15 +- Raise `denoise_h` to 15-20 +- Use higher `clahe_clip_limit` (3.0-4.0) + +### For Faded/Low Contrast Images: +- Increase `gamma` to 1.3-1.5 +- Raise `clahe_clip_limit` to 3.0+ +- Boost `unsharp_amount` to 2.0+ + +### For Sharp/High Quality Scans: +- Focus mainly on `denoise` and `sharpen` steps +- Skip `background` cleaning if unnecessary +- Use lighter settings to preserve quality + +## File Structure + +``` +newspaper_image_cleaner/ +├── image_cleaner.py # Core processing module +├── batch_process.py # Batch processing script +├── config_tuner.py # Interactive parameter tuning +├── requirements.txt # Python dependencies +└── README.md # This documentation +``` + +## Troubleshooting + +### ImportError: No module named 'cv2' +Install OpenCV: `pip install opencv-python` + +### Memory Issues with Large Images +The tuner automatically resizes large images. For batch processing of very large images, consider resizing first. + +### Poor Results +Use the interactive tuner to find optimal parameters for your specific image characteristics. 
+ +## Performance + +- Single 3000x2000 image: ~3-5 seconds +- Batch processing depends on image size and quantity +- Interactive tuning uses smaller images for faster feedback \ No newline at end of file diff --git a/scripts/ex/batch_process.py b/scripts/ex/batch_process.py new file mode 100755 index 0000000..c16f780 --- /dev/null +++ b/scripts/ex/batch_process.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python3 +""" +Batch Processing Script for Historical Newspaper Images + +Simple script to process multiple images with the newspaper cleaning pipeline. +Includes progress tracking and error handling. +""" + +import os +import sys +import time +import json +from pathlib import Path +from image_cleaner import NewspaperImageCleaner, create_comparison_image + + +def process_batch(input_dir=".", output_dir="cleaned", config_file=None, + create_comparisons=True, file_pattern="*.jpg"): + """ + Process all newspaper images in a directory. + + Args: + input_dir: Directory containing input images + output_dir: Directory for cleaned images + config_file: JSON file with custom parameters + create_comparisons: Whether to create before/after comparisons + file_pattern: Glob pattern for files to process + """ + + # Load custom config if provided + config = None + if config_file and os.path.exists(config_file): + with open(config_file, 'r') as f: + config = json.load(f) + print(f"Loaded custom config from {config_file}") + + # Initialize cleaner + cleaner = NewspaperImageCleaner(config) + + # Setup paths (parents=True so a nested output directory is created too) + input_path = Path(input_dir) + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + if create_comparisons: + comparison_path = output_path / "comparisons" + comparison_path.mkdir(exist_ok=True) + + # Find all image files + image_files = list(input_path.glob(file_pattern)) + image_files.extend(input_path.glob("*.jpeg")) + image_files.extend(input_path.glob("*.JPG")) + image_files.extend(input_path.glob("*.JPEG")) + + if not image_files: + print(f"No image files found in {input_dir}") + return + + print(f"Found {len(image_files)} images to process") + print(f"Output directory: {output_path.absolute()}") + + # Process each image + success_count = 0 + error_count = 0 + start_time = time.time() + + for i, img_file in enumerate(image_files, 1): + print(f"\n[{i}/{len(image_files)}] Processing: {img_file.name}") + + try: + # Process image + output_file = output_path / f"cleaned_{img_file.name}" + processed, original = cleaner.process_image(img_file, output_file) + + # Create comparison if requested + if create_comparisons: + comp_file = comparison_path / f"comparison_{img_file.name}" + create_comparison_image(original, processed, comp_file) + + success_count += 1 + print(f"✓ Completed: {img_file.name}") + + except Exception as e: + error_count += 1 + print(f"✗ Error processing {img_file.name}: {str(e)}") + + # Summary + elapsed_time = time.time() - start_time + print(f"\n" + "="*50) + print(f"Batch Processing Complete") + print("="*50) # plain string: reusing the outer quote inside an f-string is a SyntaxError before Python 3.12 + print(f"Successfully processed: {success_count}") + print(f"Errors: {error_count}") + print(f"Total time: {elapsed_time:.1f} seconds") + print(f"Average time per image: {elapsed_time/len(image_files):.1f} seconds") + print(f"Output directory: {output_path.absolute()}") + + +def create_sample_config(): + """Create a sample configuration file for customization.""" + config = { + "bilateral_d": 9, + "bilateral_sigma_color": 75, + "bilateral_sigma_space": 75, + "clahe_clip_limit": 2.0, + "clahe_grid_size": [8, 8], + "gamma": 1.2, + "denoise_h": 10, + "morph_kernel_size": 2, + "unsharp_amount": 1.5, + "unsharp_radius": 1.0, + "unsharp_threshold": 0 + } + + with open("config.json", "w") as f: + json.dump(config, f, indent=4) + + print("Created config.json with default parameters.") + print("Edit this file to customize processing settings.") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + description="Batch process historical newspaper images", + 
formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + python batch_process.py # Process current directory + python batch_process.py -i scans -o clean # Process 'scans' folder + python batch_process.py --no-comparisons # Skip comparison images + python batch_process.py --config custom.json # Use custom settings + """ + ) + + parser.add_argument("-i", "--input", default=".", + help="Input directory (default: current directory)") + parser.add_argument("-o", "--output", default="cleaned", + help="Output directory (default: cleaned)") + parser.add_argument("-c", "--config", + help="JSON config file with custom parameters") + parser.add_argument("--no-comparisons", action="store_true", + help="Skip creating before/after comparison images") + parser.add_argument("--pattern", default="*.jpg", + help="File pattern to match (default: *.jpg)") + parser.add_argument("--create-config", action="store_true", + help="Create sample config file and exit") + + args = parser.parse_args() + + if args.create_config: + create_sample_config() + sys.exit(0) + + process_batch( + input_dir=args.input, + output_dir=args.output, + config_file=args.config, + create_comparisons=not args.no_comparisons, + file_pattern=args.pattern + ) \ No newline at end of file diff --git a/scripts/ex/config_tuner.py b/scripts/ex/config_tuner.py new file mode 100755 index 0000000..3539f05 --- /dev/null +++ b/scripts/ex/config_tuner.py @@ -0,0 +1,291 @@ +#!/usr/bin/env python3 +""" +Interactive Parameter Tuning Tool for Newspaper Image Cleaning + +This tool helps you find optimal parameters for your specific images +by providing an interactive tuning interface. 
+""" + +import cv2 +import json +import numpy as np +from pathlib import Path +from image_cleaner import NewspaperImageCleaner + + +class ParameterTuner: + """Interactive parameter tuning for image cleaning pipeline.""" + + def __init__(self, sample_image_path): + """Initialize with a sample image for tuning.""" + self.original = cv2.imread(str(sample_image_path)) + if self.original is None: + raise ValueError(f"Could not load image: {sample_image_path}") + + # Resize large images for faster processing during tuning + height, width = self.original.shape[:2] + if height > 1500 or width > 1500: + scale = min(1500/height, 1500/width) + new_width = int(width * scale) + new_height = int(height * scale) + self.original = cv2.resize(self.original, (new_width, new_height)) + print(f"Resized image to {new_width}x{new_height} for faster tuning") + + self.current_params = self._get_default_params() + self.cleaner = NewspaperImageCleaner(self.current_params) + + def _get_default_params(self): + """Get default parameters as starting point.""" + return { + 'bilateral_d': 9, + 'bilateral_sigma_color': 75, + 'bilateral_sigma_space': 75, + 'clahe_clip_limit': 2.0, + 'clahe_grid_size': (8, 8), + 'gamma': 1.2, + 'denoise_h': 10, + 'morph_kernel_size': 2, + 'unsharp_amount': 1.5, + 'unsharp_radius': 1.0, + 'unsharp_threshold': 0, + } + + def update_parameter(self, param_name, value): + """Update a single parameter and refresh the cleaner.""" + if param_name in self.current_params: + # Handle special cases + if param_name == 'clahe_grid_size': + self.current_params[param_name] = (int(value), int(value)) + else: + self.current_params[param_name] = value + + # Update cleaner with new parameters + self.cleaner = NewspaperImageCleaner(self.current_params) + print(f"Updated {param_name} = {value}") + + def process_with_current_params(self, steps=None): + """Process the sample image with current parameters.""" + if steps is None: + steps = ['denoise', 'contrast', 'background', 'sharpen'] + + 
image = self.original.copy() + + # Apply processing steps + if 'denoise' in steps: + image = self.cleaner.reduce_noise(image) + + if 'contrast' in steps: + image = self.cleaner.enhance_contrast(image) + + if 'background' in steps: + image = self.cleaner.clean_background(image) + + if 'sharpen' in steps: + image = self.cleaner.sharpen_image(image) + + return image + + def create_comparison(self, steps=None): + """Create side-by-side comparison with current parameters.""" + processed = self.process_with_current_params(steps) + + # Create side-by-side comparison + height = max(self.original.shape[0], processed.shape[0]) + comparison = np.hstack([ + cv2.resize(self.original, (self.original.shape[1], height)), + cv2.resize(processed, (processed.shape[1], height)) + ]) + + return comparison + + def save_comparison(self, output_path, steps=None): + """Save comparison image to file.""" + comparison = self.create_comparison(steps) + cv2.imwrite(str(output_path), comparison) + print(f"Comparison saved to: {output_path}") + + def save_config(self, config_path): + """Save current parameters to JSON config file.""" + # Convert tuple to list for JSON serialization + config_to_save = self.current_params.copy() + if 'clahe_grid_size' in config_to_save: + config_to_save['clahe_grid_size'] = list(config_to_save['clahe_grid_size']) + + with open(config_path, 'w') as f: + json.dump(config_to_save, f, indent=4) + print(f"Configuration saved to: {config_path}") + + def load_config(self, config_path): + """Load parameters from JSON config file.""" + with open(config_path, 'r') as f: + loaded_params = json.load(f) + + # Convert list back to tuple if needed + if 'clahe_grid_size' in loaded_params: + loaded_params['clahe_grid_size'] = tuple(loaded_params['clahe_grid_size']) + + self.current_params.update(loaded_params) + self.cleaner = NewspaperImageCleaner(self.current_params) + print(f"Configuration loaded from: {config_path}") + + def interactive_tune(self): + """Start interactive tuning 
session.""" + print("\n" + "="*60) + print("INTERACTIVE PARAMETER TUNING") + print("="*60) + print("Commands:") + print(" set <param> <value> - Set parameter value") + print(" show - Show current parameters") + print(" test [steps] - Test current parameters") + print(" save <file> - Save configuration to file") + print(" load <file> - Load configuration from file") + print(" compare [file] - Save comparison image") + print(" presets - Show parameter presets") + print(" help - Show this help") + print(" quit - Exit tuning") + print("\nParameters you can adjust:") + for param in self.current_params: + print(f" {param}") + + while True: + try: + command = input("\ntuner> ").strip().split() + if not command: + continue + + cmd = command[0].lower() + + if cmd == 'quit' or cmd == 'exit': + break + + elif cmd == 'show': + self._show_parameters() + + elif cmd == 'set' and len(command) >= 3: + param = command[1] + try: + value = float(command[2]) if '.' in command[2] else int(command[2]) + except ValueError: + value = command[2] + self.update_parameter(param, value) + + elif cmd == 'test': + steps = command[1:] if len(command) > 1 else None + print("Processing with current parameters...") + processed = self.process_with_current_params(steps) + print(f"Processed image shape: {processed.shape}") + + elif cmd == 'save' and len(command) > 1: + self.save_config(command[1]) + + elif cmd == 'load' and len(command) > 1: + self.load_config(command[1]) + + elif cmd == 'compare': + output = command[1] if len(command) > 1 else "tuning_comparison.jpg" + self.save_comparison(output) + + elif cmd == 'presets': + self._show_presets() + + elif cmd == 'help': + self._show_help() + + else: + print("Unknown command. 
Type 'help' for available commands.") + + except KeyboardInterrupt: + print("\nExiting tuner...") + break + except Exception as e: + print(f"Error: {str(e)}") + + def _show_parameters(self): + """Display current parameter values.""" + print("\nCurrent Parameters:") + print("-" * 30) + for param, value in self.current_params.items(): + print(f" {param:<20} = {value}") + + def _show_presets(self): + """Show preset configurations for different image types.""" + presets = { + "light_cleaning": { + "bilateral_d": 5, + "denoise_h": 5, + "clahe_clip_limit": 1.5, + "gamma": 1.1, + "unsharp_amount": 1.2 + }, + "heavy_cleaning": { + "bilateral_d": 15, + "denoise_h": 15, + "clahe_clip_limit": 3.0, + "gamma": 1.3, + "unsharp_amount": 2.0 + }, + "high_contrast": { + "clahe_clip_limit": 4.0, + "gamma": 1.4, + "unsharp_amount": 2.5 + } + } + + print("\nAvailable Presets:") + print("-" * 30) + for name, params in presets.items(): + print(f"{name}:") + for param, value in params.items(): + print(f" {param} = {value}") + print() + + def _show_help(self): + """Show detailed help information.""" + help_text = """ +Parameter Descriptions: +----------------------- +bilateral_d : Neighborhood diameter for bilateral filtering (5-15) +bilateral_sigma_color: Filter sigma in color space (50-150) +bilateral_sigma_space: Filter sigma in coordinate space (50-150) +clahe_clip_limit : Contrast limit for CLAHE (1.0-4.0) +clahe_grid_size : CLAHE tile grid size (4-16) +gamma : Gamma correction value (0.8-2.0) +denoise_h : Denoising filter strength (5-20) +morph_kernel_size : Morphological operation kernel size (1-5) +unsharp_amount : Unsharp masking amount (0.5-3.0) +unsharp_radius : Unsharp masking radius (0.5-2.0) +unsharp_threshold : Unsharp masking threshold (0-10) + +Tips: +- Start with small adjustments (±20% of current value) +- Test frequently with 'compare' command +- Save working configurations before major changes +- Use 'test denoise' to test individual steps + """ + print(help_text) + + 
+def main(): + """Main function for command line usage.""" + import argparse + + parser = argparse.ArgumentParser(description="Interactive parameter tuning for newspaper image cleaning") + parser.add_argument("image", help="Sample image path for tuning") + parser.add_argument("-c", "--config", help="Load initial config from file") + + args = parser.parse_args() + + try: + tuner = ParameterTuner(args.image) + + if args.config: + tuner.load_config(args.config) + + tuner.interactive_tune() + + except Exception as e: + print(f"Error: {str(e)}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/ex/demo.py b/scripts/ex/demo.py new file mode 100755 index 0000000..9e01a2a --- /dev/null +++ b/scripts/ex/demo.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 +""" +Demo Script for Newspaper Image Cleaning Pipeline + +This script demonstrates the cleaning pipeline on the sample images +and shows the available functionality. +""" + +import sys +import os +from pathlib import Path + +# Add current directory to Python path +sys.path.append(str(Path(__file__).parent)) + +try: + from image_cleaner import NewspaperImageCleaner, create_comparison_image + import cv2 + import numpy as np + print("✓ All required libraries imported successfully") +except ImportError as e: + print(f"✗ Import error: {e}") + print("Please install required packages: pip install -r requirements.txt") + sys.exit(1) + + +def demo_single_image(image_path): + """Demonstrate processing a single image.""" + print(f"\n=== Processing Single Image: {image_path} ===") + + if not os.path.exists(image_path): + print(f"Image not found: {image_path}") + return False + + try: + # Initialize cleaner + cleaner = NewspaperImageCleaner() + + # Process image + output_path = f"demo_cleaned_{Path(image_path).name}" + processed, original = cleaner.process_image(image_path, output_path) + + # Create comparison + comparison_path = f"demo_comparison_{Path(image_path).name}" + 
create_comparison_image(original, processed, comparison_path) + + print(f"✓ Processed image saved: {output_path}") + print(f"✓ Comparison saved: {comparison_path}") + return True + + except Exception as e: + print(f"✗ Error processing {image_path}: {str(e)}") + return False + + +def demo_step_by_step(image_path): + """Demonstrate individual processing steps.""" + print(f"\n=== Step-by-Step Processing: {image_path} ===") + + if not os.path.exists(image_path): + print(f"Image not found: {image_path}") + return + + try: + # Load image + original = cv2.imread(image_path) + if original is None: + print(f"Could not load image: {image_path}") + return + + # Resize if too large for demo + height, width = original.shape[:2] + if height > 1000 or width > 1000: + scale = min(1000/height, 1000/width) + new_width = int(width * scale) + new_height = int(height * scale) + original = cv2.resize(original, (new_width, new_height)) + print(f"Resized to {new_width}x{new_height} for demo") + + cleaner = NewspaperImageCleaner() + + # Process step by step + steps = [ + ('original', original), + ('denoised', cleaner.reduce_noise(original.copy())), + ('contrast_enhanced', cleaner.enhance_contrast(original.copy())), + ('background_cleaned', cleaner.clean_background(original.copy())), + ('sharpened', cleaner.sharpen_image(original.copy())) + ] + + # Save each step + for step_name, image in steps: + output_path = f"demo_step_{step_name}_{Path(image_path).name}" + cv2.imwrite(output_path, image) + print(f"✓ Saved {step_name}: {output_path}") + + print("✓ Individual processing steps completed") + + except Exception as e: + print(f"✗ Error in step-by-step processing: {str(e)}") + + +def show_image_info(): + """Show information about available images.""" + print("\n=== Available Sample Images ===") + + image_files = [] + for ext in ['*.jpg', '*.jpeg', '*.JPG', '*.JPEG']: + image_files.extend(Path('.').glob(ext)) + + if not image_files: + print("No image files found in current directory") + return 
[] + + for img_file in image_files: + try: + # Load image to get dimensions + img = cv2.imread(str(img_file)) + if img is not None: + height, width = img.shape[:2] + file_size = img_file.stat().st_size / (1024*1024) # MB + print(f" {img_file.name}: {width}x{height} pixels, {file_size:.1f}MB") + else: + print(f" {img_file.name}: Could not load") + except Exception as e: + print(f" {img_file.name}: Error - {str(e)}") + + return image_files + + +def main(): + """Main demo function.""" + print("Historical Newspaper Image Cleaning Pipeline - Demo") + print("=" * 55) + + # Show available images + image_files = show_image_info() + + if not image_files: + print("\nNo images found. Please add some image files to test.") + return + + # Select first image for demo + sample_image = image_files[0] + print(f"\nUsing sample image: {sample_image.name}") + + # Demo single image processing + success = demo_single_image(str(sample_image)) + + if success: + # Demo step-by-step processing + demo_step_by_step(str(sample_image)) + + print(f"\n=== Demo Complete ===") + print("Generated files:") + print(" - demo_cleaned_*.jpg (cleaned image)") + print(" - demo_comparison_*.jpg (before/after comparison)") + print(" - demo_step_*.jpg (individual processing steps)") + + print(f"\nNext steps:") + print(f" - Try: python config_tuner.py {sample_image.name}") + print(f" - Try: python batch_process.py") + print(f" - Adjust parameters in config.json for better results") + + else: + print("\nDemo failed. 
Please check your Python environment and dependencies.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/scripts/ex/image_cleaner.py b/scripts/ex/image_cleaner.py new file mode 100644 index 0000000..33a6c4d --- /dev/null +++ b/scripts/ex/image_cleaner.py @@ -0,0 +1,310 @@ +""" +Historical Newspaper Image Cleaning Pipeline + +This module provides functions to clean and enhance scanned historical newspaper images +by reducing noise, improving contrast, and sharpening text for better readability. +""" + +import cv2 +import numpy as np +from PIL import Image, ImageEnhance +import os +import argparse +from pathlib import Path + + +class NewspaperImageCleaner: + """ + Image processing pipeline specifically designed for historical newspaper scans. + """ + + def __init__(self, config=None): + """Initialize with the default parameters, overridden by any keys supplied in config.""" + self.config = {**self._default_config(), **(config or {})} # merge so a partial config keeps the remaining defaults + + def _default_config(self): + """Default processing parameters optimized for newspaper scans.""" + return { + 'bilateral_d': 9, # Neighborhood diameter for bilateral filter + 'bilateral_sigma_color': 75, # Filter sigma in color space + 'bilateral_sigma_space': 75, # Filter sigma in coordinate space + 'clahe_clip_limit': 2.0, # Contrast limiting for CLAHE + 'clahe_grid_size': (8, 8), # CLAHE grid size + 'gamma': 1.2, # Gamma correction value + 'denoise_h': 10, # Denoising filter strength + 'morph_kernel_size': 2, # Morphological operation kernel size + 'unsharp_amount': 1.5, # Unsharp masking amount + 'unsharp_radius': 1.0, # Unsharp masking radius + 'unsharp_threshold': 0, # Unsharp masking threshold + } + + def reduce_noise(self, image): + """ + Apply noise reduction techniques to remove speckles and JPEG artifacts. 
+ + Args: + image: Input BGR image + + Returns: + Denoised image + """ + # Bilateral filter - preserves edges while reducing noise + bilateral = cv2.bilateralFilter( + image, + self.config['bilateral_d'], + self.config['bilateral_sigma_color'], + self.config['bilateral_sigma_space'] + ) + + # Non-local means denoising for better noise reduction + if len(image.shape) == 3: + # Color image + denoised = cv2.fastNlMeansDenoisingColored( + bilateral, None, + self.config['denoise_h'], + self.config['denoise_h'], + 7, 21 + ) + else: + # Grayscale image + denoised = cv2.fastNlMeansDenoising( + bilateral, None, + self.config['denoise_h'], + 7, 21 + ) + + return denoised + + def enhance_contrast(self, image): + """ + Improve image contrast using CLAHE and gamma correction. + + Args: + image: Input BGR image + + Returns: + Contrast-enhanced image + """ + # Convert to LAB color space for better contrast processing + if len(image.shape) == 3: + lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) + l_channel, a_channel, b_channel = cv2.split(lab) + else: + l_channel = image + + # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization) + clahe = cv2.createCLAHE( + clipLimit=self.config['clahe_clip_limit'], + tileGridSize=self.config['clahe_grid_size'] + ) + l_channel = clahe.apply(l_channel) + + # Reconstruct image + if len(image.shape) == 3: + enhanced = cv2.merge([l_channel, a_channel, b_channel]) + enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR) + else: + enhanced = l_channel + + # Apply gamma correction + gamma = self.config['gamma'] + inv_gamma = 1.0 / gamma + table = np.array([((i / 255.0) ** inv_gamma) * 255 + for i in np.arange(0, 256)]).astype("uint8") + enhanced = cv2.LUT(enhanced, table) + + return enhanced + + def clean_background(self, image): + """ + Remove small artifacts and clean background noise. 
+ + Args: + image: Input image + + Returns: + Background-cleaned image + """ + # Convert to grayscale for morphological operations + if len(image.shape) == 3: + gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) + else: + gray = image + + # Morphological opening to remove small noise + kernel = np.ones((self.config['morph_kernel_size'], + self.config['morph_kernel_size']), np.uint8) + + # Opening (erosion followed by dilation) + opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel) + + # If original was color, apply the mask + if len(image.shape) == 3: + # Create a mask and apply it to the original color image + mask = opened > 0 + result = image.copy() + result[~mask] = [255, 255, 255] # Set background to white + return result + else: + return opened + + def sharpen_image(self, image): + """ + Apply unsharp masking to enhance text clarity. + + Args: + image: Input image + + Returns: + Sharpened image + """ + # Convert to float for processing + float_img = image.astype(np.float32) / 255.0 + + # Create Gaussian blur + radius = self.config['unsharp_radius'] + sigma = radius / 3.0 + blurred = cv2.GaussianBlur(float_img, (0, 0), sigma) + + # Unsharp masking + amount = self.config['unsharp_amount'] + sharpened = float_img + amount * (float_img - blurred) + + # Threshold and clamp + threshold = self.config['unsharp_threshold'] / 255.0 + sharpened = np.where(np.abs(float_img - blurred) < threshold, + float_img, sharpened) + sharpened = np.clip(sharpened, 0.0, 1.0) + + return (sharpened * 255).astype(np.uint8) + + def process_image(self, image_path, output_path=None, steps=None): + """ + Process a single image through the complete pipeline. 
+ + Args: + image_path: Path to input image + output_path: Path for output image (optional) + steps: List of processing steps to apply (optional) + + Returns: + Tuple (processed image array, original image array); raises ValueError if the image cannot be loaded + """ + if steps is None: + steps = ['denoise', 'contrast', 'background', 'sharpen'] + + # Load image + image = cv2.imread(str(image_path)) + if image is None: + raise ValueError(f"Could not load image: {image_path}") + + original = image.copy() + + # Apply processing steps + if 'denoise' in steps: + print(f"Applying noise reduction...") + image = self.reduce_noise(image) + + if 'contrast' in steps: + print(f"Enhancing contrast...") + image = self.enhance_contrast(image) + + if 'background' in steps: + print(f"Cleaning background...") + image = self.clean_background(image) + + if 'sharpen' in steps: + print(f"Sharpening image...") + image = self.sharpen_image(image) + + # Save output if path provided + if output_path: + cv2.imwrite(str(output_path), image) + print(f"Processed image saved to: {output_path}") + + return image, original + + def process_directory(self, input_dir, output_dir, extensions=None): + """ + Process all images in a directory. + + Args: + input_dir: Input directory path + output_dir: Output directory path + extensions: List of file extensions to process + """ + if extensions is None: + extensions = ['.jpg', '.jpeg', '.png', '.tif', '.tiff'] + + input_path = Path(input_dir) + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + for file_path in input_path.iterdir(): + if file_path.suffix.lower() in extensions: + print(f"\nProcessing: {file_path.name}") + output_file = output_path / f"cleaned_{file_path.name}" + + try: + self.process_image(file_path, output_file) + except Exception as e: + print(f"Error processing {file_path.name}: {str(e)}") + + print(f"\nBatch processing completed. Results in: {output_dir}") + + +def create_comparison_image(original, processed, output_path): + """ + Create a side-by-side comparison image. 
if __name__ == "__main__":
    # Command-line entry point: clean a single image, or a whole directory
    # when -d/--directory is given.
    STEP_NAMES = ['denoise', 'contrast', 'background', 'sharpen']

    cli = argparse.ArgumentParser(description="Clean historical newspaper images")
    cli.add_argument("input", help="Input image or directory path")
    cli.add_argument("-o", "--output", help="Output path")
    cli.add_argument(
        "-d", "--directory",
        action="store_true",
        help="Process entire directory",
    )
    cli.add_argument(
        "-c", "--comparison",
        action="store_true",
        help="Create before/after comparison",
    )
    cli.add_argument(
        "--steps",
        nargs="+",
        choices=STEP_NAMES,
        default=list(STEP_NAMES),
        help="Processing steps to apply",
    )
    options = cli.parse_args()

    cleaner = NewspaperImageCleaner()

    if not options.directory:
        source = Path(options.input)

        # Without -o the result is written next to the input as cleaned_<name>.
        if options.output:
            target = options.output
        else:
            target = source.parent / f"cleaned_{source.name}"

        processed, original = cleaner.process_image(options.input, target, options.steps)

        if options.comparison:
            # The comparison image is placed alongside the cleaned output.
            comparison_target = Path(target).parent / f"comparison_{source.name}"
            create_comparison_image(original, processed, comparison_target)
    else:
        # Directory mode forwards only the input and output locations;
        # --steps and --comparison are not used on this path.
        cleaner.process_directory(options.input, options.output or "cleaned_images")
#!/bin/bash
# Convenience script to run the image cleaning pipeline with virtual environment

# Resolve this script's own directory so the virtual environment and the
# pipeline scripts are found even when the script is invoked from another
# working directory. The caller's working directory is deliberately left
# unchanged so relative image paths passed as arguments still resolve there.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Activate virtual environment
if [ -f "$SCRIPT_DIR/venv/bin/activate" ]; then
    # shellcheck disable=SC1091
    source "$SCRIPT_DIR/venv/bin/activate"
else
    # Keep going with whatever python is on PATH, but say so explicitly.
    echo "Warning: $SCRIPT_DIR/venv/bin/activate not found; using python from PATH" >&2
fi

# Check if any arguments provided
if [ $# -eq 0 ]; then
    echo "Historical Newspaper Image Cleaning Pipeline"
    echo "Usage examples:"
    echo "  $0 demo                    # Run demo"
    echo "  $0 clean image.jpg         # Clean single image"
    echo "  $0 batch                   # Process all images in directory"
    echo "  $0 tune image.jpg          # Interactive parameter tuning"
    echo "  $0 python script.py [args] # Run custom Python script"
    exit 1
fi

# Dispatch on the first argument; remaining arguments are forwarded verbatim.
case "$1" in
    "demo")
        python "$SCRIPT_DIR/demo.py"
        ;;
    "clean")
        shift
        python "$SCRIPT_DIR/image_cleaner.py" "$@"
        ;;
    "batch")
        shift
        python "$SCRIPT_DIR/batch_process.py" "$@"
        ;;
    "tune")
        shift
        python "$SCRIPT_DIR/config_tuner.py" "$@"
        ;;
    "python")
        shift
        python "$@"
        ;;
    *)
        # Anything else is treated as a script path plus its arguments.
        python "$@"
        ;;
esac