mirror of
https://github.com/Theodor-Springmann-Stiftung/kgpz_web.git
synced 2025-10-30 01:25:30 +00:00
image cleaner
This commit is contained in:
310
scripts/ex/image_cleaner.py
Normal file
310
scripts/ex/image_cleaner.py
Normal file
@@ -0,0 +1,310 @@
|
||||
"""
|
||||
Historical Newspaper Image Cleaning Pipeline
|
||||
|
||||
This module provides functions to clean and enhance scanned historical newspaper images
|
||||
by reducing noise, improving contrast, and sharpening text for better readability.
|
||||
"""
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image, ImageEnhance
|
||||
import os
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class NewspaperImageCleaner:
    """
    Image processing pipeline specifically designed for historical newspaper scans.

    The pipeline has four optional stages which always run in the same order:
    noise reduction, contrast enhancement, background cleaning and sharpening.
    Every tunable parameter is read from ``self.config``.
    """

    # Stage names understood by process_image(), in execution order.
    DEFAULT_STEPS = ('denoise', 'contrast', 'background', 'sharpen')

    # File suffixes that process_directory() picks up when none are given.
    DEFAULT_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.tif', '.tiff')

    def __init__(self, config=None):
        """Initialize with default or custom configuration.

        Args:
            config: Optional dict of parameter overrides. Keys that are not
                supplied fall back to the values from ``_default_config()``,
                so a partial dict such as ``{'gamma': 1.5}`` is valid. The
                passed dict is copied, never mutated.
        """
        self.config = self._default_config()
        if config:
            self.config.update(config)

    def _default_config(self):
        """Default processing parameters optimized for newspaper scans."""
        return {
            'bilateral_d': 9,              # Neighborhood diameter for bilateral filter
            'bilateral_sigma_color': 75,   # Filter sigma in color space
            'bilateral_sigma_space': 75,   # Filter sigma in coordinate space
            'clahe_clip_limit': 2.0,       # Contrast limiting for CLAHE
            'clahe_grid_size': (8, 8),     # CLAHE grid size
            'gamma': 1.2,                  # Gamma correction value
            'denoise_h': 10,               # Denoising filter strength
            'morph_kernel_size': 2,        # Morphological operation kernel size
            'unsharp_amount': 1.5,         # Unsharp masking amount
            'unsharp_radius': 1.0,         # Unsharp masking radius
            'unsharp_threshold': 0,        # Unsharp masking threshold
        }

    def reduce_noise(self, image):
        """
        Apply noise reduction techniques to remove speckles and JPEG artifacts.

        Args:
            image: Input BGR (3-D array) or grayscale (2-D array) image

        Returns:
            Denoised image
        """
        # Bilateral filter - preserves edges while reducing noise
        bilateral = cv2.bilateralFilter(
            image,
            self.config['bilateral_d'],
            self.config['bilateral_sigma_color'],
            self.config['bilateral_sigma_space'],
        )

        # Non-local means denoising for better noise reduction
        strength = self.config['denoise_h']
        template_window = 7
        search_window = 21

        if image.ndim == 3:
            # Color image: the same strength is used for luminance and color
            return cv2.fastNlMeansDenoisingColored(
                bilateral, None,
                strength, strength,
                template_window, search_window,
            )

        # Grayscale image
        return cv2.fastNlMeansDenoising(
            bilateral, None,
            strength,
            template_window, search_window,
        )

    def enhance_contrast(self, image):
        """
        Improve image contrast using CLAHE and gamma correction.

        Args:
            image: Input BGR (3-D array) or grayscale (2-D array) image

        Returns:
            Contrast-enhanced image

        Raises:
            ValueError: If the configured ``gamma`` is not positive.
        """
        gamma = self.config['gamma']
        if gamma <= 0:
            # 1 / gamma is used as an exponent below; reject it before any work.
            raise ValueError(f"gamma must be positive, got {gamma}")

        is_color = image.ndim == 3

        # Convert to LAB color space so only the lightness channel is equalized
        if is_color:
            lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
            l_channel, a_channel, b_channel = cv2.split(lab)
        else:
            l_channel = image

        # Apply CLAHE (Contrast Limited Adaptive Histogram Equalization)
        clahe = cv2.createCLAHE(
            clipLimit=self.config['clahe_clip_limit'],
            tileGridSize=self.config['clahe_grid_size'],
        )
        l_channel = clahe.apply(l_channel)

        # Reconstruct image
        if is_color:
            enhanced = cv2.merge([l_channel, a_channel, b_channel])
            enhanced = cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)
        else:
            enhanced = l_channel

        # Apply gamma correction through a 256-entry lookup table:
        # out = (in / 255) ** (1 / gamma) * 255
        levels = np.arange(256) / 255.0
        table = (levels ** (1.0 / gamma) * 255).astype("uint8")
        return cv2.LUT(enhanced, table)

    def clean_background(self, image):
        """
        Remove small artifacts and clean background noise.

        Args:
            image: Input BGR (3-D array) or grayscale (2-D array) image

        Returns:
            Background-cleaned image. For grayscale input this is the
            morphologically opened image; for color input it is a copy of the
            input in which pixels whose opened gray value is 0 are set to white.
        """
        # Convert to grayscale for morphological operations
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if image.ndim == 3 else image

        # Square structuring element for the morphological opening
        size = self.config['morph_kernel_size']
        kernel = np.ones((size, size), np.uint8)

        # Opening (erosion followed by dilation)
        opened = cv2.morphologyEx(gray, cv2.MORPH_OPEN, kernel)

        if image.ndim != 3:
            return opened

        # NOTE(review): the mask below selects pixels whose opened value is
        # exactly 0 (pure black) and paints them white. On dark-text scans
        # that presumably includes solid ink - confirm this is intended.
        mask = opened > 0
        result = image.copy()
        result[~mask] = [255, 255, 255]  # Set background to white
        return result

    def sharpen_image(self, image):
        """
        Apply unsharp masking to enhance text clarity.

        Args:
            image: Input image with values in the 0-255 range

        Returns:
            Sharpened uint8 image. When ``unsharp_radius`` is not positive or
            ``unsharp_amount`` is 0 there is nothing to sharpen and an
            unmodified uint8 copy of the input is returned.
        """
        radius = self.config['unsharp_radius']
        amount = self.config['unsharp_amount']
        if radius <= 0 or amount == 0:
            # A non-positive sigma is rejected by GaussianBlur when the
            # kernel size is (0, 0); treat both cases as "no sharpening".
            return image.astype(np.uint8)

        # Convert to float for processing
        float_img = image.astype(np.float32) / 255.0

        # Create Gaussian blur; kernel size (0, 0) lets OpenCV derive it from sigma
        sigma = radius / 3.0
        blurred = cv2.GaussianBlur(float_img, (0, 0), sigma)

        # Unsharp masking: add back the scaled high-frequency detail
        detail = float_img - blurred
        sharpened = float_img + amount * detail

        # Leave pixels alone whose detail is below the threshold
        threshold = self.config['unsharp_threshold'] / 255.0
        if threshold > 0:
            sharpened = np.where(np.abs(detail) < threshold, float_img, sharpened)

        sharpened = np.clip(sharpened, 0.0, 1.0)

        # Round instead of truncating: x / 255 * 255 in float32 can land just
        # below an integer, and a plain cast would darken such pixels by one.
        return np.rint(sharpened * 255).astype(np.uint8)

    def process_image(self, image_path, output_path=None, steps=None):
        """
        Process a single image through the complete pipeline.

        Args:
            image_path: Path to input image
            output_path: Path for output image (optional)
            steps: Collection of processing steps to apply (optional). Known
                names are 'denoise', 'contrast', 'background' and 'sharpen';
                the stages run in that fixed order regardless of the order
                given here. Defaults to all four.

        Returns:
            Tuple ``(processed, original)`` of image arrays.

        Raises:
            ValueError: If the input image could not be loaded.
            OSError: If ``output_path`` is given and the image could not be
                written.
        """
        if steps is None:
            steps = self.DEFAULT_STEPS

        # Load image
        image = cv2.imread(str(image_path))
        if image is None:
            raise ValueError(f"Could not load image: {image_path}")

        original = image.copy()

        # (step name, progress message, stage function) in execution order
        pipeline = (
            ('denoise', "Applying noise reduction...", self.reduce_noise),
            ('contrast', "Enhancing contrast...", self.enhance_contrast),
            ('background', "Cleaning background...", self.clean_background),
            ('sharpen', "Sharpening image...", self.sharpen_image),
        )
        for name, message, stage in pipeline:
            if name in steps:
                print(message)
                image = stage(image)

        # Save output if path provided
        if output_path:
            # imwrite reports failure through its return value, so check it
            # instead of announcing a file that was never written.
            if not cv2.imwrite(str(output_path), image):
                raise OSError(f"Could not write image: {output_path}")
            print(f"Processed image saved to: {output_path}")

        return image, original

    def process_directory(self, input_dir, output_dir, extensions=None, steps=None):
        """
        Process all images in a directory.

        Each matching file is written to ``output_dir`` as
        ``cleaned_<original name>``. A failure on one file is reported and
        does not stop the batch.

        Args:
            input_dir: Input directory path
            output_dir: Output directory path (created if missing)
            extensions: List of file extensions to process (case-insensitive)
            steps: Processing steps forwarded to ``process_image`` (optional)
        """
        if extensions is None:
            extensions = self.DEFAULT_EXTENSIONS
        wanted = {ext.lower() for ext in extensions}

        input_path = Path(input_dir)
        output_path = Path(output_dir)
        output_path.mkdir(parents=True, exist_ok=True)

        # Snapshot the listing before processing so files written during the
        # run (when output_dir is input_dir) are not picked up again, and sort
        # it for a reproducible order. Directories are skipped.
        candidates = sorted(
            entry for entry in input_path.iterdir()
            if entry.is_file() and entry.suffix.lower() in wanted
        )

        for file_path in candidates:
            print(f"\nProcessing: {file_path.name}")
            output_file = output_path / f"cleaned_{file_path.name}"

            try:
                self.process_image(file_path, output_file, steps)
            except Exception as e:
                # Deliberate best-effort boundary: report and continue.
                print(f"Error processing {file_path.name}: {str(e)}")

        print(f"\nBatch processing completed. Results in: {output_dir}")
|
||||
|
||||
|
||||
def create_comparison_image(original, processed, output_path):
    """
    Create a side-by-side comparison image.

    The original is placed on the left and the processed image on the right.
    If the heights differ, both are scaled to the smaller height while
    keeping their aspect ratios.

    Args:
        original: Original image array
        processed: Processed image array
        output_path: Path to save comparison

    Raises:
        OSError: If the comparison image could not be written.
    """
    # hstack needs the same number of dimensions on both sides; promote a
    # grayscale image when the other one is color.
    # assumes the color image is 3-channel BGR - TODO confirm for BGRA input
    if original.ndim != processed.ndim:
        if original.ndim == 2:
            original = cv2.cvtColor(original, cv2.COLOR_GRAY2BGR)
        if processed.ndim == 2:
            processed = cv2.cvtColor(processed, cv2.COLOR_GRAY2BGR)

    # Resize images to same height if needed
    h1, w1 = original.shape[:2]
    h2, w2 = processed.shape[:2]

    if h1 != h2:
        height = min(h1, h2)
        # max(1, ...) keeps the target width valid for very narrow images
        original = cv2.resize(original, (max(1, int(w1 * height / h1)), height))
        processed = cv2.resize(processed, (max(1, int(w2 * height / h2)), height))

    # Create side-by-side comparison
    comparison = np.hstack([original, processed])

    # imwrite signals failure through its return value
    if not cv2.imwrite(str(output_path), comparison):
        raise OSError(f"Could not write image: {output_path}")
    print(f"Comparison saved to: {output_path}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: clean a single image or a whole directory.
    parser = argparse.ArgumentParser(description="Clean historical newspaper images")
    parser.add_argument("input", help="Input image or directory path")
    parser.add_argument("-o", "--output", help="Output path")
    parser.add_argument("-d", "--directory", action="store_true",
                        help="Process entire directory")
    parser.add_argument("-c", "--comparison", action="store_true",
                        help="Create before/after comparison")
    # default=None (instead of the full list) lets us tell "not given" apart
    # from "given"; process_image() treats None as "all steps".
    parser.add_argument("--steps", nargs="+",
                        choices=['denoise', 'contrast', 'background', 'sharpen'],
                        default=None,
                        help="Processing steps to apply (default: all)")

    args = parser.parse_args()

    cleaner = NewspaperImageCleaner()

    if args.directory:
        # Directory mode runs the default pipeline and writes no comparison
        # images; say so rather than silently dropping the options.
        if args.steps is not None or args.comparison:
            print("Warning: --steps and --comparison only apply to "
                  "single-image mode and are ignored with --directory")

        output_dir = args.output or "cleaned_images"
        cleaner.process_directory(args.input, output_dir)
    else:
        output_path = args.output
        if not output_path:
            # Default: write next to the input as cleaned_<name>
            input_path = Path(args.input)
            output_path = input_path.parent / f"cleaned_{input_path.name}"

        try:
            processed, original = cleaner.process_image(args.input, output_path, args.steps)
        except (ValueError, OSError) as exc:
            # Unreadable input or unwritable output: exit with a short
            # message and a non-zero status instead of a traceback.
            raise SystemExit(f"Error: {exc}") from exc

        if args.comparison:
            comparison_path = Path(output_path).parent / f"comparison_{Path(args.input).name}"
            create_comparison_image(original, processed, comparison_path)
|
||||
Reference in New Issue
Block a user