From 513332b8372cf9079459de1002491defdd09c439 Mon Sep 17 00:00:00 2001
From: Simon Martens
Date: Wed, 17 Jul 2024 14:57:48 +0200
Subject: [PATCH] + scripts to extract images from pdfs, + scripts to rename the image files accordingly

---
Reviewer note: the three scripts below were revised after the original
commit, so the blob ids in the "index" lines no longer match the content.

 Scripts/extract_images.sh |  66 ++++++++++++++++++++++
 Scripts/rename_1764.sh    | 117 +++++++++++++++++++++++++++++++++++++++
 Scripts/rename_1765.sh    |  81 +++++++++++++++++++++++++++
 3 files changed, 264 insertions(+)
 create mode 100644 Scripts/extract_images.sh
 create mode 100644 Scripts/rename_1764.sh
 create mode 100644 Scripts/rename_1765.sh

diff --git a/Scripts/extract_images.sh b/Scripts/extract_images.sh
new file mode 100644
index 0000000..ef62a4e
--- /dev/null
+++ b/Scripts/extract_images.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+#
+# Extract the images embedded in every PDF of the current directory.
+#
+# Each PDF needs a YYYY-MM-DD date in its file name. Its images end up in
+# ./images as <date>_<n>.<ext>, with n counting from 1 in glob order of the
+# files pdfimages wrote. The target name depends only on date and counter,
+# so two PDFs sharing a date write to the same names.
+#
+# Requires: pdfimages (poppler-utils), GNU grep (-P), mktemp.
+
+set -u
+
+# Let unmatched globs expand to nothing. Without this an empty directory
+# yields the literal "*.pdf", and a PDF without images yields a literal "*"
+# that fails to move yet is still counted as one extracted image.
+shopt -s nullglob
+
+# Create the images directory if it doesn't exist
+mkdir -p images || exit 1
+
+# Scratch directory of the PDF in progress; removed when the script exits.
+temp_dir=""
+cleanup() {
+    if [[ -n "$temp_dir" ]]; then
+        rm -rf -- "$temp_dir"
+    fi
+}
+trap cleanup EXIT
+
+# Loop through all PDF files in the current directory
+for pdf in *.pdf; do
+    # Extract the (first) date from the filename
+    pdf_date=$(printf '%s\n' "$pdf" | grep -oP '\d{4}-\d{2}-\d{2}' | head -n 1)
+
+    if [[ -z "$pdf_date" ]]; then
+        echo "Skipping $pdf: No date found in filename"
+        continue
+    fi
+
+    # Stop if no scratch directory could be created: with an empty
+    # $temp_dir the glob below would be "/*".
+    temp_dir=$(mktemp -d) || exit 1
+
+    # Use pdfimages to extract images into the scratch directory. A failure
+    # is reported, but whatever was extracted is still collected.
+    pdfimages -all "$pdf" "$temp_dir/img" \
+        || echo "Warning: pdfimages reported an error for $pdf" >&2
+
+    # Move and rename the extracted images
+    counter=0
+    for img in "$temp_dir"/*; do
+        counter=$((counter + 1))
+        # Keep the extension pdfimages chose for this image
+        ext="${img##*.}"
+        mv -- "$img" "images/${pdf_date}_${counter}.${ext}"
+    done
+
+    # Remove the temporary directory
+    rm -rf -- "$temp_dir"
+    temp_dir=""
+
+    echo "Processed $pdf: Extracted $counter images"
+done
+
+echo "Image extraction complete. All images are in the 'images' directory."
diff --git a/Scripts/rename_1764.sh b/Scripts/rename_1764.sh
new file mode 100644
index 0000000..704cf49
--- /dev/null
+++ b/Scripts/rename_1764.sh
@@ -0,0 +1,117 @@
+#!/bin/bash
+#
+# Rename the 1764 scans from <date>_<n>.jpg to 1764-<issue>-<page>.jpg.
+#
+# Works on a copy of all *.jpg files in ./renamed_files. Issue 1 is dated
+# $start_date; every Monday and Friday from then on counts as one further
+# issue. An issue is taken to have four regular pages. When a date has more
+# than four scans, the first two and the last two are the regular pages and
+# the scans in between form a supplement named 1764-<issue>b-<n>.jpg.
+#
+# Requires: GNU date (-d).
+
+set -u
+shopt -s nullglob
+
+# Create a new directory for the renamed files
+mkdir -p renamed_files || exit 1
+
+# Copy all jpg files to the new directory
+jpgs=(*.jpg)
+if (( ${#jpgs[@]} == 0 )); then
+    echo "No .jpg files found in the current directory" >&2
+    exit 1
+fi
+cp -- "${jpgs[@]}" renamed_files/ || exit 1
+
+# Change to the new directory; stop rather than rename the originals
+cd renamed_files || exit 1
+
+# Function to calculate days between two dates (currently unused)
+days_between() {
+    local first second
+    first=$(date -u -d "$1" +%s) || return
+    second=$(date -u -d "$2" +%s) || return
+    echo $(( (second - first) / 86400 ))
+}
+
+# Function to check if a date is Monday or Friday
+is_monday_or_friday() {
+    local day
+    day=$(date -u -d "$1" +%u) || return
+    [[ $day == 1 || $day == 5 ]]
+}
+
+# Function to pad numbers with leading zeros
+pad() {
+    printf '%03d' "$1"
+}
+
+# Date of issue 1
+start_date="1764-02-03"
+
+# A scan is <date>_<n>.jpg; group 1 is the date, group 2 the scan number
+scan_re='^(1764-[0-9]{2}-[0-9]{2})_([0-9]+)\.jpg$'
+
+# State carried from file to file. The glob below is sorted, so dates only
+# move forward and the calendar is walked once instead of once per file.
+walk_date="$start_date"
+issue_number=1
+counted_date=""
+pages_in_issue=0
+
+for file in 1764-*.jpg; do
+    # Anything else (for example output of an earlier run) carries no date
+    # and would keep the calendar walk below from ever terminating.
+    if [[ ! $file =~ $scan_re ]]; then
+        echo "Skipping $file: not a <date>_<n>.jpg scan" >&2
+        continue
+    fi
+    current_date=${BASH_REMATCH[1]}
+    # 10# keeps zero-padded numbers such as 08 from being read as octal
+    page_in_issue=$((10#${BASH_REMATCH[2]}))
+
+    if [[ "$current_date" < "$start_date" ]]; then
+        echo "Skipping $file: dated before issue 1 ($start_date)" >&2
+        continue
+    fi
+
+    # Calculate the issue number: one more for every Monday or Friday
+    # between the date of issue 1 and this date
+    while [[ "$walk_date" < "$current_date" ]]; do
+        if is_monday_or_friday "$walk_date"; then
+            issue_number=$((issue_number + 1))
+        fi
+        walk_date=$(date -u -d "$walk_date + 1 day" +%Y-%m-%d) || exit 1
+    done
+
+    # Count the scans of a date once, on its first file, while none of them
+    # has been renamed yet; counting again after a rename comes up short.
+    if [[ "$current_date" != "$counted_date" ]]; then
+        same_date=("$current_date"*.jpg)
+        pages_in_issue=${#same_date[@]}
+        counted_date="$current_date"
+    fi
+
+    # Calculate the starting page number for this issue
+    starting_page=$((4 * (issue_number - 1) + 1))
+
+    # Create new filename based on number of pages in the issue
+    if (( pages_in_issue <= 4 || page_in_issue <= 2 )); then
+        actual_page=$((starting_page + page_in_issue - 1))
+        new_name="1764-$(pad "$issue_number")-$(pad "$actual_page").jpg"
+    elif (( page_in_issue > pages_in_issue - 2 )); then
+        # The last two scans are regular pages 3 and 4. Numbering them by
+        # scan position would reach into the page range of the next issue.
+        actual_page=$((starting_page + 3 - (pages_in_issue - page_in_issue)))
+        new_name="1764-$(pad "$issue_number")-$(pad "$actual_page").jpg"
+    else
+        # Supplement scans, numbered from 1
+        new_name="1764-$(pad "$issue_number")b-$(pad "$((page_in_issue - 2))").jpg"
+    fi
+
+    # Rename file
+    mv -- "$file" "$new_name"
+done
+
+echo "Files have been renamed in the 'renamed_files' directory."
diff --git a/Scripts/rename_1765.sh b/Scripts/rename_1765.sh
new file mode 100644
index 0000000..ac96f86
--- /dev/null
+++ b/Scripts/rename_1765.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+#
+# Rename the 1765 scans from <date>_<n>.jpg to 1765-<issue>-<page>.jpg.
+#
+# Works on a copy of all *.jpg files in ./renamed_files. Every date is one
+# issue, numbered in chronological order; regular pages are numbered
+# consecutively across all issues. When a date has more than four scans
+# (issue 27 excepted), the first two and the last two are regular pages and
+# the scans in between form a supplement named 1765-<issue>b-<n>.jpg.
+
+set -u
+shopt -s nullglob
+
+# Create a new directory for the renamed files
+mkdir -p renamed_files || exit 1
+
+# Copy all jpg files to the new directory
+jpgs=(*.jpg)
+if (( ${#jpgs[@]} == 0 )); then
+    echo "No .jpg files found in the current directory" >&2
+    exit 1
+fi
+cp -- "${jpgs[@]}" renamed_files/ || exit 1
+
+# Change to the new directory; stop rather than rename the originals
+cd renamed_files || exit 1
+
+# Remove the first two files (blank pages); -f because they may be absent
+rm -f -- 1765-01-04_1.jpg 1765-01-04_2.jpg
+
+# Initialize variables
+issue_number=0
+page_number=1
+global_page_number=1
+current_date=""
+pages_in_issue=0
+
+# Function to pad numbers with leading zeros
+pad() {
+    printf '%03d' "$1"
+}
+
+# A scan is <date>_<n>.jpg; group 1 is the date
+scan_re='^(1765-[0-9]{2}-[0-9]{2})_[0-9]+\.jpg$'
+
+# Visit the scans by date, then by scan number compared as a number. A plain
+# sort by name puts _10 before _2 and scrambles issues of ten or more scans.
+while IFS= read -r file; do
+    if [[ ! $file =~ $scan_re ]]; then
+        # The empty line printf emits when nothing matched is dropped quietly
+        [[ -z "$file" ]] || echo "Skipping $file: not a <date>_<n>.jpg scan" >&2
+        continue
+    fi
+    scan_date=${BASH_REMATCH[1]}
+
+    # If it's a new date, increment issue number and reset page counter
+    if [[ "$scan_date" != "$current_date" ]]; then
+        current_date="$scan_date"
+        issue_number=$((issue_number + 1))
+        page_number=1
+        same_date=("$scan_date"*.jpg)
+        pages_in_issue=${#same_date[@]}
+    fi
+
+    # Create new filename based on number of pages in the issue and the issue number
+    if (( issue_number == 27 || pages_in_issue <= 4 )) \
+        || (( page_number <= 2 || page_number > pages_in_issue - 2 )); then
+        new_name="1765-$(pad "$issue_number")-$(pad "$global_page_number").jpg"
+        global_page_number=$((global_page_number + 1))
+    else
+        new_name="1765-$(pad "$issue_number")b-$(pad "$((page_number - 2))").jpg"
+    fi
+
+    # Rename file
+    mv -- "$file" "$new_name"
+
+    # Increment page number
+    page_number=$((page_number + 1))
+done < <(printf '%s\n' 1765-*.jpg | LC_ALL=C sort -t_ -k1,1 -k2,2n)
+
+echo "Files have been renamed in the 'renamed_files' directory."