Add scripts to extract images from PDFs and scripts to rename the image files accordingly

This commit is contained in:
Simon Martens
2024-07-17 14:57:48 +02:00
parent 744c58bd75
commit 513332b837
3 changed files with 174 additions and 0 deletions

41
Scripts/extract_images.sh Normal file
View File

@@ -0,0 +1,41 @@
#!/bin/bash
# Extract the embedded images from every dated PDF in the current directory.
#
# Usage: run from the directory that holds the PDFs. Requires `pdfimages`.
# A PDF is processed only if its filename contains a YYYY-MM-DD date; its
# images are written to images/<date>_<n>.<ext>, where <n> counts from 1 for
# each PDF. PDFs without a date in their name are skipped.

# Extract the images of one PDF into the images/ directory.
# Arguments: $1 - PDF file
#            $2 - date (YYYY-MM-DD) used as prefix of the output filenames
# Outputs:   a "Processed ..." summary on stdout, error messages on stderr
# Returns:   0 on success, 1 if the extraction or any move failed
extract_pdf_images() {
  local pdf=$1 date=$2
  local temp_dir img ext
  local counter=1 moved=0 status=0

  temp_dir=$(mktemp -d) || {
    echo "Error: could not create a temporary directory for $pdf" >&2
    return 1
  }

  if pdfimages -all "$pdf" "$temp_dir/img"; then
    for img in "$temp_dir"/*; do
      # A PDF without images leaves the glob unexpanded; without this guard
      # the literal pattern would be "moved" and counted as one image.
      [[ -e "$img" ]] || continue
      ext="${img##*.}"
      if mv "$img" "images/${date}_${counter}.${ext}"; then
        moved=$((moved + 1))
      else
        status=1
      fi
      # The counter advances even after a failed move so that the remaining
      # images keep the number matching their position in the PDF.
      counter=$((counter + 1))
    done
    echo "Processed $pdf: Extracted $moved images"
  else
    echo "Error: pdfimages failed on $pdf" >&2
    status=1
  fi

  # Remove the temporary directory on every path, including failures
  rm -rf -- "$temp_dir"
  return "$status"
}

# Process every *.pdf in the current directory.
# Returns: 0 if all PDFs were processed, 1 if anything failed
main() {
  local pdf date status=0

  # Create the images directory if it doesn't exist
  mkdir -p images || {
    echo "Error: could not create the 'images' directory" >&2
    return 1
  }

  for pdf in *.pdf; do
    # With no PDFs present the glob stays literal; skip it
    [[ -e "$pdf" ]] || continue

    # Take the first YYYY-MM-DD found in the filename
    if [[ "$pdf" =~ [0-9]{4}-[0-9]{2}-[0-9]{2} ]]; then
      date=${BASH_REMATCH[0]}
    else
      echo "Skipping $pdf: No date found in filename"
      continue
    fi

    extract_pdf_images "$pdf" "$date" || status=1
  done

  echo "Image extraction complete. All images are in the 'images' directory."
  return "$status"
}

main "$@"

75
Scripts/rename_1764.sh Normal file
View File

@@ -0,0 +1,75 @@
#!/bin/bash
# Prepare a working copy of the scans: all *.jpg files of the current
# directory are copied into ./renamed_files and the script continues there,
# so the files in the current directory are never modified.

# Create a new directory for the renamed files
mkdir -p renamed_files || {
  echo "Error: could not create the 'renamed_files' directory" >&2
  exit 1
}
# Copy all jpg files to the new directory; a missing or partial copy would
# produce an incomplete set of renamed files, so stop here instead
cp -- *.jpg renamed_files/ || {
  echo "Error: could not copy the .jpg files to 'renamed_files'" >&2
  exit 1
}
# Change to the new directory. This must not fail silently: every later
# command in this script would otherwise operate on the original files.
cd renamed_files || {
  echo "Error: could not enter the 'renamed_files' directory" >&2
  exit 1
}
# Function to calculate days between two dates
# Arguments: $1 - start date, $2 - end date (anything GNU `date -d` accepts)
# Outputs:   number of whole days from $1 to $2 on stdout
#            (negative when $2 is earlier than $1)
# Returns:   1 if either date cannot be parsed
days_between() {
  local from_epoch to_epoch
  # Evaluate both dates in UTC. In a local time zone with daylight saving a
  # midnight-to-midnight span can be 23 or 25 hours, and the integer division
  # below would then lose a day.
  from_epoch=$(date -u -d "$1" +%s) || return 1
  to_epoch=$(date -u -d "$2" +%s) || return 1
  echo $(( (to_epoch - from_epoch) / 86400 ))
}
# Succeeds (exit status 0) when the date given as $1 is a Monday or a Friday,
# fails (exit status 1) otherwise. The weekday number printed by
# `date +%u` (1 = Monday ... 7 = Sunday) is stored in the global `day`.
is_monday_or_friday() {
  day=$(date -d "$1" +%u)
  case "$day" in
    1 | 5)
      return 0
      ;;
    *)
      return 1
      ;;
  esac
}
# Function to pad numbers with leading zeros
# Arguments: $1 - integer to format (an empty or missing value counts as 0)
# Outputs:   the number, zero-padded to at least three digits, on stdout
# Returns:   1 if $1 is not an integer
pad() {
  local value=${1:-0}
  local sign=""

  if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
    printf 'pad: not an integer: %s\n' "$value" >&2
    return 1
  fi
  if [[ "$value" == -* ]]; then
    sign="-"
    value=${value#-}
  fi
  # Force base 10: printf would otherwise read a value with leading zeros
  # (e.g. "08") as octal and reject it.
  printf '%03d' "$(( ${sign}10#${value} ))"
}
# Rename the 1764 scans in the current directory from <date>_<n>.jpg to
# 1764-<issue>-<page>.jpg.
#
# The issue number of a date is 1 plus the number of Mondays and Fridays
# between 1764-02-03 and that date (the date itself not counted). Every
# issue owns four consecutive main page numbers. When a date has more than
# four scans, the first two and the last two are the main pages and the
# scans in between are named 1764-<issue>b-<k>.jpg, with <k> counting from 1.
#
# Uses the helpers `is_monday_or_friday` and `pad` defined in this script.
# Outputs: a completion message on stdout, skipped files on stderr
# Returns: 0 on success, 1 if a file could not be renamed
rename_1764_pages() {
  local -r start_date="1764-02-03"
  local -r name_re='^([0-9]{4}-[0-9]{2}-[0-9]{2}).*_([0-9]+)\.jpg$'
  local -A pages_per_date=() issue_of_date=()
  local -a scans=()
  local file scan_date page_in_issue pages_in_issue
  local issue_number starting_page actual_page new_name
  local walk_date=$start_date issue_count=1 status=0

  # Pass 1: collect the scans and count the pages of every date BEFORE any
  # file is renamed. Counting during the rename pass would see fewer files
  # for each page already renamed and pick the wrong naming branch.
  for file in 1764-*.jpg; do
    [[ -e "$file" ]] || continue
    if [[ "$file" =~ $name_re ]]; then
      scan_date=${BASH_REMATCH[1]}
      scans+=("$file")
      pages_per_date[$scan_date]=$(( ${pages_per_date[$scan_date]:-0} + 1 ))
    else
      echo "Skipping $file: name is not <date>_<page>.jpg" >&2
    fi
  done

  # Pass 2: rename
  for file in "${scans[@]}"; do
    [[ "$file" =~ $name_re ]] || continue
    scan_date=${BASH_REMATCH[1]}
    # 10# keeps page numbers with leading zeros from being read as octal
    page_in_issue=$(( 10#${BASH_REMATCH[2]} ))
    pages_in_issue=${pages_per_date[$scan_date]}

    # Determine the issue number once per date
    if [[ -z "${issue_of_date[$scan_date]:-}" ]]; then
      # The calendar walk below only terminates for a real date that is not
      # before the start date (ISO dates compare correctly as strings).
      if [[ "$scan_date" < "$start_date" ]] \
        || [[ "$(date -u -d "$scan_date" +%Y-%m-%d 2>/dev/null)" != "$scan_date" ]]; then
        issue_of_date[$scan_date]="invalid"
      else
        # Continue the walk from the previous date; restart only if the
        # dates do not arrive in ascending order.
        if [[ "$scan_date" < "$walk_date" ]]; then
          walk_date=$start_date
          issue_count=1
        fi
        while [[ "$walk_date" != "$scan_date" ]]; do
          if is_monday_or_friday "$walk_date"; then
            issue_count=$((issue_count + 1))
          fi
          # UTC avoids days that do not exist at local midnight
          walk_date=$(date -u -d "$walk_date + 1 day" +%Y-%m-%d) || return 1
        done
        issue_of_date[$scan_date]=$issue_count
      fi
    fi
    issue_number=${issue_of_date[$scan_date]}
    if [[ "$issue_number" == "invalid" ]]; then
      echo "Skipping $file: $scan_date is not a valid date on or after $start_date" >&2
      continue
    fi

    # First main page number of this issue (four main pages per issue)
    starting_page=$(( 4 * (issue_number - 1) + 1 ))

    if (( pages_in_issue <= 4 || page_in_issue <= 2 )); then
      actual_page=$(( starting_page + page_in_issue - 1 ))
      new_name="1764-$(pad "$issue_number")-$(pad "$actual_page").jpg"
    elif (( page_in_issue > pages_in_issue - 2 )); then
      # The last two scans are main pages 3 and 4: do not count the
      # supplement scans in between, or the numbers would run into the
      # range of the next issue.
      actual_page=$(( starting_page + page_in_issue - 1 - (pages_in_issue - 4) ))
      new_name="1764-$(pad "$issue_number")-$(pad "$actual_page").jpg"
    else
      new_name="1764-$(pad "$issue_number")b-$(pad "$((page_in_issue - 2))").jpg"
    fi

    # Rename file
    mv -- "$file" "$new_name" || status=1
  done

  echo "Files have been renamed in the 'renamed_files' directory."
  return "$status"
}

rename_1764_pages

58
Scripts/rename_1765.sh Normal file
View File

@@ -0,0 +1,58 @@
#!/bin/bash
# Prepare a working copy of the scans: all *.jpg files of the current
# directory are copied into ./renamed_files and the script continues there,
# so the files in the current directory are never modified.

# Create a new directory for the renamed files
mkdir -p renamed_files || {
  echo "Error: could not create the 'renamed_files' directory" >&2
  exit 1
}
# Copy all jpg files to the new directory; a missing or partial copy would
# produce an incomplete set of renamed files, so stop here instead
cp -- *.jpg renamed_files/ || {
  echo "Error: could not copy the .jpg files to 'renamed_files'" >&2
  exit 1
}
# Change to the new directory. This must not fail silently: the deletion
# below and every later command would otherwise hit the original files.
cd renamed_files || {
  echo "Error: could not enter the 'renamed_files' directory" >&2
  exit 1
}
# Remove the first two files (blank pages)
rm -- 1765-01-04_1.jpg 1765-01-04_2.jpg
# Initialize variables
issue_number=0
page_number=1
global_page_number=1
# Function to pad numbers with leading zeros
# Arguments: $1 - integer to format (an empty or missing value counts as 0)
# Outputs:   the number, zero-padded to at least three digits, on stdout
# Returns:   1 if $1 is not an integer
pad() {
  local value=${1:-0}
  local sign=""

  if [[ ! "$value" =~ ^-?[0-9]+$ ]]; then
    printf 'pad: not an integer: %s\n' "$value" >&2
    return 1
  fi
  if [[ "$value" == -* ]]; then
    sign="-"
    value=${value#-}
  fi
  # Force base 10: printf would otherwise read a value with leading zeros
  # (e.g. "08") as octal and reject it.
  printf '%03d' "$(( ${sign}10#${value} ))"
}
# Rename the 1765 scans in the current directory from <date>_<n>.jpg to
# 1765-<issue>-<page>.jpg.
#
# Issues are numbered in date order starting at 1; main pages are numbered
# consecutively across all issues. When a date has more than four scans, the
# first two and the last two are main pages and the scans in between are
# named 1765-<issue>b-<k>.jpg, with <k> counting from 1 (they do not consume
# main page numbers).
#
# Uses the helper `pad` defined in this script.
# Outputs: a completion message on stdout, skipped files on stderr
# Returns: 0 on success, 1 if a file could not be renamed
rename_1765_pages() {
  local -r name_re='^([0-9]{4}-[0-9]{2}-[0-9]{2})_[0-9]+\.jpg$'
  local -A pages_per_date=()
  local -a scans=() ordered=()
  local file scan_date new_name
  local current_date="" pages_in_issue=0
  local issue_number=0 page_number=1 global_page_number=1
  local status=0

  # Collect the scans and count the pages of every date up front. Files that
  # are not <date>_<n>.jpg (e.g. already renamed ones) are left alone.
  for file in 1765-*.jpg; do
    [[ -e "$file" ]] || continue
    if [[ "$file" =~ $name_re ]]; then
      scan_date=${BASH_REMATCH[1]}
      scans+=("$file")
      pages_per_date[$scan_date]=$(( ${pages_per_date[$scan_date]:-0} + 1 ))
    else
      echo "Skipping $file: name is not <date>_<page>.jpg" >&2
    fi
  done

  # Order by date, then NUMERICALLY by page: the page position within an
  # issue is derived from this order, and a plain lexical sort would put
  # _10 before _2.
  if (( ${#scans[@]} > 0 )); then
    mapfile -t ordered < <(
      printf '%s\n' "${scans[@]}" | LC_ALL=C sort -t_ -k1,1 -k2,2n
    )
  fi

  for file in "${ordered[@]}"; do
    scan_date=${file%%_*}

    # If it's a new date, increment issue number and reset page counter
    if [[ "$scan_date" != "$current_date" ]]; then
      current_date=$scan_date
      issue_number=$((issue_number + 1))
      page_number=1
      pages_in_issue=${pages_per_date[$scan_date]}
    fi

    # NOTE(review): issue 27 is always numbered as main pages, even with more
    # than four scans - presumably it has no supplement; confirm.
    if (( issue_number == 27 || pages_in_issue <= 4 \
      || page_number <= 2 || page_number > pages_in_issue - 2 )); then
      new_name="1765-$(pad "$issue_number")-$(pad "$global_page_number").jpg"
      global_page_number=$((global_page_number + 1))
    else
      new_name="1765-$(pad "$issue_number")b-$(pad "$((page_number - 2))").jpg"
    fi

    # Rename file
    mv -- "$file" "$new_name" || status=1
    # Increment page number
    page_number=$((page_number + 1))
  done

  echo "Files have been renamed in the 'renamed_files' directory."
  return "$status"
}

rename_1765_pages