Add scripts to extract images from PDFs and to rename the extracted image files accordingly

This commit is contained in:
Simon Martens
2024-07-17 14:57:48 +02:00
parent 744c58bd75
commit 513332b837
3 changed files with 174 additions and 0 deletions

41
Scripts/extract_images.sh Normal file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
#
# Extract the embedded images from every dated PDF in the current directory.
#
# For each *.pdf whose filename contains a date of the form YYYY-MM-DD, the
# images are extracted with pdfimages and stored as
#   images/<date>_<n>.<ext>
# where <n> counts from 1 within that PDF and <ext> is whatever extension
# pdfimages gave the extracted file. PDFs without a date in their filename
# are skipped.
#
# NOTE(review): the target name depends only on the date and the per-PDF
# counter, so two PDFs carrying the same date write to the same names and the
# later one overwrites the earlier one's images. Confirm dates are unique.

#######################################
# Extract all images of a single PDF into the images/ directory.
# Arguments:
#   $1 - PDF filename (relative to the current directory)
#   $2 - date string used as the prefix of the output filenames
# Outputs:
#   One "Processed ..." summary line on stdout; warnings on stderr.
# Returns:
#   0 after processing, 1 if no temporary directory could be created.
#######################################
extract_pdf_images() {
  local pdf=$1
  local date=$2
  local temp_dir img ext
  local counter=1

  # Extract into a private scratch directory so that only files produced for
  # this PDF are picked up below.
  temp_dir=$(mktemp -d) || {
    echo "Skipping $pdf: could not create a temporary directory" >&2
    return 1
  }

  # "./" keeps a filename that starts with "-" from being read as an option.
  # A failure is reported, but whatever was extracted is still collected.
  if ! pdfimages -all "./$pdf" "$temp_dir/img"; then
    echo "Warning: pdfimages reported an error for $pdf" >&2
  fi

  for img in "$temp_dir"/*; do
    # If nothing was extracted the glob stays literal; do not count it.
    [[ -e "$img" ]] || continue
    ext="${img##*.}"
    # Only count images that really arrived in images/.
    if mv -- "$img" "images/${date}_${counter}.${ext}"; then
      counter=$((counter + 1))
    else
      echo "Warning: could not move $img into images/" >&2
    fi
  done

  rm -rf -- "$temp_dir"
  echo "Processed $pdf: Extracted $((counter - 1)) images"
}

#######################################
# Process every PDF in the current directory.
# Outputs:
#   Progress and skip messages on stdout; warnings on stderr.
# Returns:
#   1 if the images directory cannot be created, 0 otherwise.
#######################################
main() {
  local pdf date

  # Create the images directory if it doesn't exist
  if ! mkdir -p images; then
    echo "Error: could not create the 'images' directory" >&2
    return 1
  fi

  for pdf in *.pdf; do
    # Without any PDF the glob stays the literal "*.pdf"; skip it.
    [[ -e "$pdf" ]] || continue

    # Use the first YYYY-MM-DD occurrence in the filename as the date.
    if [[ "$pdf" =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then
      date=${BASH_REMATCH[0]}
    else
      echo "Skipping $pdf: No date found in filename"
      continue
    fi

    extract_pdf_images "$pdf" "$date"
  done

  echo "Image extraction complete. All images are in the 'images' directory."
}

main "$@"