Files
kgpz_web/scripts/ex/image_cleaner.py
Simon Martens 9960dc5e38 image cleaner
2025-09-15 18:32:13 +02:00

310 lines
10 KiB
Python

"""
Historical Newspaper Image Cleaning Pipeline
This module provides functions to clean and enhance scanned historical newspaper images
by reducing noise, improving contrast, and sharpening text for better readability.
"""
import cv2
import numpy as np
from PIL import Image, ImageEnhance
import os
import argparse
from pathlib import Path
class NewspaperImageCleaner:
"""
Image processing pipeline specifically designed for historical newspaper scans.
"""
def __init__(self, config=None):
"""Initialize with default or custom configuration."""
self.config = config or self._default_config()
def _default_config(self):
"""Default processing parameters optimized for newspaper scans."""
return {
'bilateral_d': 9, # Neighborhood diameter for bilateral filter
'bilateral_sigma_color': 75, # Filter sigma in color space
'bilateral_sigma_space': 75, # Filter sigma in coordinate space
'clahe_clip_limit': 2.0, # Contrast limiting for CLAHE
'clahe_grid_size': (8, 8), # CLAHE grid size
'gamma': 1.2, # Gamma correction value
'denoise_h': 10, # Denoising filter strength
'morph_kernel_size': 2, # Morphological operation kernel size
'unsharp_amount': 1.5, # Unsharp masking amount
'unsharp_radius': 1.0, # Unsharp masking radius
'unsharp_threshold': 0, # Unsharp masking threshold
}
def reduce_noise(self, image):
"""
Apply noise reduction techniques to remove speckles and JPEG artifacts.
Args:
image: Input BGR image
Returns:
Denoised image
"""
# Bilateral filter - preserves edges while reducing noise
bilateral = cv2.bilateralFilter(
image,
self.config['bilateral_d'],
self.config['bilateral_sigma_color'],
self.config['bilateral_sigma_space']
)
# Non-local means denoising for better noise reduction
if len(image.shape) == 3:
# Color image
denoised = cv2.fastNlMeansDenoisingColored(
bilateral, None,
self.config['denoise_h'],
self.config['denoise_h'],
7, 21
)
else:
# Grayscale image
denoised = cv2.fastNlMeansDenoising(
bilateral, None,
self.config['denoise_h'],
7, 21
)
return denoised
def enhance_contrast(self, image):
"""
Improve image contrast using CLAHE and gamma correction.
Args:
image: Input BGR image
Returns:
Contrast-enhanced image
"""
# Convert to LAB color space for better contrast processing
if len(image.shape) == 3:
lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
l_channel, a_channel, b_channel = cv2.split(lab)
else:
l_channel = image
# Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
clahe = cv2.createCLAHE(
clipLimit=self.config['clahe_clip_limit'],
tileGridSize=self.config['clahe_grid_size']
)
l_channel = clahe.apply(l_channel)
# Reconstruct image
if len(image.shape) == 3:
enhanced = cv2.merge([l_channel, a_channel, b_channel])
enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
else:
enhanced = l_channel
# Apply gamma correction
gamma = self.config['gamma']
inv_gamma = 1.0 / gamma
table = np.array([((i / 255.0) ** inv_gamma) * 255
for i in np.arange(0, 256)]).astype("uint8")
enhanced = cv2.LUT(enhanced, table)
return enhanced
def clean_background(self, image):
"""
Remove small artifacts and clean background noise.
Args:
image: Input image
Returns:
Background-cleaned image
"""
# Convert to grayscale for morphological operations
if len(image.shape) == 3:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
else:
gray = image
# Morphological opening to remove small noise
kernel = np.ones((self.config['morph_kernel_size'],
self.config['morph_kernel_size']), np.uint8)
# Opening (erosion followed by dilation)
opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)
# If original was color, apply the mask
if len(image.shape) == 3:
# Create a mask and apply it to the original color image
mask = opened > 0
result = image.copy()
result[~mask] = [255, 255, 255] # Set background to white
return result
else:
return opened
def sharpen_image(self, image):
"""
Apply unsharp masking to enhance text clarity.
Args:
image: Input image
Returns:
Sharpened image
"""
# Convert to float for processing
float_img = image.astype(np.float32) / 255.0
# Create Gaussian blur
radius = self.config['unsharp_radius']
sigma = radius / 3.0
blurred = cv2.GaussianBlur(float_img, (0, 0), sigma)
# Unsharp masking
amount = self.config['unsharp_amount']
sharpened = float_img + amount * (float_img - blurred)
# Threshold and clamp
threshold = self.config['unsharp_threshold'] / 255.0
sharpened = np.where(np.abs(float_img - blurred) < threshold,
float_img, sharpened)
sharpened = np.clip(sharpened, 0.0, 1.0)
return (sharpened * 255).astype(np.uint8)
def process_image(self, image_path, output_path=None, steps=None):
"""
Process a single image through the complete pipeline.
Args:
image_path: Path to input image
output_path: Path for output image (optional)
steps: List of processing steps to apply (optional)
Returns:
Processed image array
"""
if steps is None:
steps = ['denoise', 'contrast', 'background', 'sharpen']
# Load image
image = cv2.imread(str(image_path))
if image is None:
raise ValueError(f"Could not load image: {image_path}")
original = image.copy()
# Apply processing steps
if 'denoise' in steps:
print(f"Applying noise reduction...")
image = self.reduce_noise(image)
if 'contrast' in steps:
print(f"Enhancing contrast...")
image = self.enhance_contrast(image)
if 'background' in steps:
print(f"Cleaning background...")
image = self.clean_background(image)
if 'sharpen' in steps:
print(f"Sharpening image...")
image = self.sharpen_image(image)
# Save output if path provided
if output_path:
cv2.imwrite(str(output_path), image)
print(f"Processed image saved to: {output_path}")
return image, original
def process_directory(self, input_dir, output_dir, extensions=None):
"""
Process all images in a directory.
Args:
input_dir: Input directory path
output_dir: Output directory path
extensions: List of file extensions to process
"""
if extensions is None:
extensions = ['.jpg', '.jpeg', '.png', '.tif', '.tiff']
input_path = Path(input_dir)
output_path = Path(output_dir)
output_path.mkdir(parents=True, exist_ok=True)
for file_path in input_path.iterdir():
if file_path.suffix.lower() in extensions:
print(f"\nProcessing: {file_path.name}")
output_file = output_path / f"cleaned_{file_path.name}"
try:
self.process_image(file_path, output_file)
except Exception as e:
print(f"Error processing {file_path.name}: {str(e)}")
print(f"\nBatch processing completed. Results in: {output_dir}")
def create_comparison_image(original, processed, output_path):
"""
Create a side-by-side comparison image.
Args:
original: Original image array
processed: Processed image array
output_path: Path to save comparison
"""
# Resize images to same height if needed
h1, w1 = original.shape[:2]
h2, w2 = processed.shape[:2]
if h1 != h2:
height = min(h1, h2)
original = cv2.resize(original, (int(w1 * height / h1), height))
processed = cv2.resize(processed, (int(w2 * height / h2), height))
# Create side-by-side comparison
comparison = np.hstack([original, processed])
cv2.imwrite(str(output_path), comparison)
print(f"Comparison saved to: {output_path}")
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Clean historical newspaper images")
parser.add_argument("input", help="Input image or directory path")
parser.add_argument("-o", "--output", help="Output path")
parser.add_argument("-d", "--directory", action="store_true",
help="Process entire directory")
parser.add_argument("-c", "--comparison", action="store_true",
help="Create before/after comparison")
parser.add_argument("--steps", nargs="+",
choices=['denoise', 'contrast', 'background', 'sharpen'],
default=['denoise', 'contrast', 'background', 'sharpen'],
help="Processing steps to apply")
args = parser.parse_args()
cleaner = NewspaperImageCleaner()
if args.directory:
output_dir = args.output or "cleaned_images"
cleaner.process_directory(args.input, output_dir)
else:
output_path = args.output
if not output_path:
input_path = Path(args.input)
output_path = input_path.parent / f"cleaned_{input_path.name}"
processed, original = cleaner.process_image(args.input, output_path, args.steps)
if args.comparison:
comparison_path = Path(output_path).parent / f"comparison_{Path(args.input).name}"
create_comparison_image(original, processed, comparison_path)