mirror of
https://github.com/Theodor-Springmann-Stiftung/KGPZ.git
synced 2025-10-28 16:45:31 +00:00
+ scripts to extract images from pdfs, + scripts to rename the image files accordingly
This commit is contained in:
41
Scripts/extract_images.sh
Normal file
41
Scripts/extract_images.sh
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/bin/bash
#
# Extracts every image embedded in the PDF files of the current directory into
# ./images. Each image is named <date>_<n>.<ext>, where <date> is the first
# YYYY-MM-DD date found in the PDF's file name, <n> counts the images of that
# PDF starting at 1, and <ext> is the extension chosen by pdfimages.
# PDFs without a date in their name are skipped.

# Print the first YYYY-MM-DD date contained in $1.
# Returns non-zero (and prints nothing) when the name holds no date.
date_from_name() {
  local date_re='[0-9]{4}-[0-9]{2}-[0-9]{2}'
  [[ $1 =~ $date_re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[0]}"
}

# Remove the scratch directory of the PDF being processed, if any.
cleanup() {
  if [[ -n $temp_dir ]]; then
    rm -rf -- "$temp_dir"
  fi
}
temp_dir=""
trap cleanup EXIT

# Create the images directory if it doesn't exist
mkdir -p images || exit 1

# Loop through all PDF files in the current directory
for pdf in *.pdf; do
  # Without any PDF the glob stays literal; there is nothing to do then.
  [[ -e $pdf ]] || continue

  # Only the first date is used, so a name holding two dates cannot produce
  # a multi-line value that would corrupt the target file names.
  if ! issue_date=$(date_from_name "$pdf"); then
    echo "Skipping $pdf: No date found in filename"
    continue
  fi

  # Extract into a private scratch directory so that only the images of this
  # PDF are picked up below.
  temp_dir=$(mktemp -d) || exit 1
  if ! pdfimages -all "$pdf" "$temp_dir/img"; then
    echo "Skipping $pdf: pdfimages failed" >&2
    rm -rf -- "$temp_dir"
    temp_dir=""
    continue
  fi

  # Move and rename the extracted images, numbering them from 1.
  counter=0
  for img in "$temp_dir"/*; do
    # An empty scratch directory leaves the glob unexpanded; do not count it.
    [[ -e $img ]] || continue
    counter=$(( counter + 1 ))
    mv -- "$img" "images/${issue_date}_${counter}.${img##*.}"
  done

  rm -rf -- "$temp_dir"
  temp_dir=""

  echo "Processed $pdf: Extracted $counter images"
done

echo "Image extraction complete. All images are in the 'images' directory."
|
||||
75
Scripts/rename_1764.sh
Normal file
75
Scripts/rename_1764.sh
Normal file
@@ -0,0 +1,75 @@
|
||||
#!/bin/bash
#
# Renames the 1764 scans (1764-MM-DD_<n>.jpg) of the current directory.
# All work happens on copies inside ./renamed_files; the files in the current
# directory are left untouched.

# Create a new directory for the renamed files
mkdir -p renamed_files || exit 1

# Copy all jpg files to the new directory. A failed copy aborts the script,
# because renaming an incomplete set would produce wrong page numbers.
scans=(*.jpg)
if [[ -e ${scans[0]} ]]; then
  cp -- "${scans[@]}" renamed_files/ || exit 1
else
  echo "No .jpg files found in $PWD" >&2
fi

# Change to the new directory. Abort if that fails: everything that follows
# renames files relative to the working directory and would otherwise hit the
# originals.
cd renamed_files || exit 1
|
||||
|
||||
# Print the number of whole days from date $1 to date $2 (negative when $2 is
# earlier than $1).
# Arguments: $1 - start date, $2 - end date (anything GNU `date -d` accepts)
# Returns:   non-zero without output when either date cannot be parsed
days_between() {
  local from_epoch to_epoch
  # Parse in UTC: with local time a DST change between the two dates makes the
  # difference 23 or 25 hours short/long and the division loses a day.
  from_epoch=$(date -u -d "$1" +%s) || return
  to_epoch=$(date -u -d "$2" +%s) || return
  echo $(( (to_epoch - from_epoch) / 86400 ))
}
|
||||
|
||||
# Succeed when date $1 falls on a Monday or a Friday.
# Arguments: $1 - a date GNU `date -d` accepts, e.g. 1764-02-03
# Returns:   0 for Monday/Friday, 1 otherwise (also for an unparsable date)
is_monday_or_friday() {
  local weekday
  # %u is the ISO weekday number: 1 = Monday ... 7 = Sunday. Parsing in UTC
  # keeps the result independent of the local time zone's DST rules.
  weekday=$(date -u -d "$1" +%u) || return 1
  [[ $weekday == 1 || $weekday == 5 ]]
}
|
||||
|
||||
# Print integer $1 zero-padded to at least three digits (5 -> 005).
# The value is always read as decimal, so an input that already carries
# leading zeros (08, 010) is not mistaken for an octal number.
# A missing or empty argument is treated as 0.
pad() {
  local number=${1:-0} sign=""
  if [[ $number == -* ]]; then
    sign="-"
    number=${number#-}
  fi
  printf '%03d' "$(( ${sign}10#${number} ))"
}
|
||||
|
||||
# Date of the first issue; issue numbering starts here with issue 1.
start_date="1764-02-03"

# Expected scan name: <YYYY-MM-DD>_<image number>.jpg
scan_name_re='^([0-9]{4}-[0-9]{2}-[0-9]{2})_([0-9]+)\.jpg$'

# Print the new file name for one page of an issue.
# Arguments: $1 - issue number
#            $2 - page within the issue (1-based, taken from the scan name)
#            $3 - number of scans that exist for the issue
# Every issue owns four consecutive regular page numbers. In an issue with
# more than four scans, the first two and the last two scans are the regular
# pages; the scans in between are inner pages, named with a "b" after the
# issue number and counted from 1.
new_page_name() {
  local issue=$1 page=$2 total=$3
  local first_page=$(( 4 * (issue - 1) + 1 ))
  local inner=$(( total > 4 ? total - 4 : 0 ))

  if (( inner > 0 && page > 2 && page <= total - 2 )); then
    printf '1764-%sb-%s.jpg\n' "$(pad "$issue")" "$(pad "$(( page - 2 ))")"
    return
  fi

  # Regular pages after the inner pages continue at slot 3 of the issue.
  # Without this shift they would take page numbers of the next issue and
  # overwrite its files.
  local slot=$page
  if (( page > 2 )); then
    slot=$(( page - inner ))
  fi
  printf '1764-%s-%s.jpg\n' "$(pad "$issue")" "$(pad "$(( first_page + slot - 1 ))")"
}

# Pass 1: work out every new name while all scans still carry their original
# names, so that counting the scans of an issue gives the true page count.
sources=()
targets=()
last_date=""
date_ok=0
walk_date=$start_date
walk_issue=1

for file in 1764-*.jpg; do
  # Ignore the unmatched glob and anything that is not "<date>_<n>.jpg"
  # (for example files already renamed by an earlier run).
  if [[ ! $file =~ $scan_name_re ]]; then
    if [[ -e $file ]]; then
      echo "Skipping $file: unexpected file name" >&2
    fi
    continue
  fi
  current_date=${BASH_REMATCH[1]}
  page_in_issue=$(( 10#${BASH_REMATCH[2]} ))

  # Issue number and page count only change when the date changes.
  if [[ $current_date != "$last_date" ]]; then
    last_date=$current_date
    date_ok=0
    # The day-by-day walk below only terminates for a real calendar date that
    # is not before the first issue.
    if [[ ! $current_date < $start_date ]] \
      && [[ $(date -u -d "$current_date" +%Y-%m-%d 2>/dev/null) == "$current_date" ]]; then
      date_ok=1

      # The issue number is 1 plus the number of Mondays and Fridays from
      # start_date up to (excluding) the scan's date. The walk resumes where
      # the previous date left off instead of restarting for every file.
      if [[ $current_date < $walk_date ]]; then
        walk_date=$start_date
        walk_issue=1
      fi
      while [[ $walk_date != "$current_date" ]]; do
        if is_monday_or_friday "$walk_date"; then
          walk_issue=$(( walk_issue + 1 ))
        fi
        walk_date=$(date -u -d "$walk_date + 1 day" +%Y-%m-%d) || exit 1
      done
      issue_number=$walk_issue

      # Count the number of files for this date
      issue_files=("$current_date"*.jpg)
      pages_in_issue=${#issue_files[@]}
    fi
  fi

  if (( ! date_ok )); then
    echo "Skipping $file: date is invalid or before $start_date" >&2
    continue
  fi

  sources+=("$file")
  targets+=("$(new_page_name "$issue_number" "$page_in_issue" "$pages_in_issue")")
done

# Pass 2: rename.
for i in "${!sources[@]}"; do
  mv -- "${sources[i]}" "${targets[i]}"
done

echo "Files have been renamed in the 'renamed_files' directory."
|
||||
58
Scripts/rename_1765.sh
Normal file
58
Scripts/rename_1765.sh
Normal file
@@ -0,0 +1,58 @@
|
||||
#!/bin/bash
#
# Renames the 1765 scans (1765-MM-DD_<n>.jpg) of the current directory.
# All work happens on copies inside ./renamed_files; the files in the current
# directory are left untouched.

# Create a new directory for the renamed files
mkdir -p renamed_files || exit 1

# Copy all jpg files to the new directory. A failed copy aborts the script,
# because renaming an incomplete set would produce wrong page numbers.
scans=(*.jpg)
if [[ -e ${scans[0]} ]]; then
  cp -- "${scans[@]}" renamed_files/ || exit 1
else
  echo "No .jpg files found in $PWD" >&2
fi

# Change to the new directory. Abort if that fails: the rm below and the
# later renames act on the working directory and would otherwise destroy the
# original scans.
cd renamed_files || exit 1

# Remove the first two files (blank pages)
rm -f -- 1765-01-04_1.jpg 1765-01-04_2.jpg

# Initialize variables
issue_number=0        # incremented for every new date, so the first issue is 1
page_number=1         # position of the current scan within its issue
global_page_number=1  # running page number across all issues
|
||||
|
||||
# Print integer $1 zero-padded to at least three digits (5 -> 005).
# The value is always read as decimal, so an input that already carries
# leading zeros (08, 010) is not mistaken for an octal number.
# A missing or empty argument is treated as 0.
pad() {
  local number=${1:-0} sign=""
  if [[ $number == -* ]]; then
    sign="-"
    number=${number#-}
  fi
  printf '%03d' "$(( ${sign}10#${number} ))"
}
|
||||
|
||||
# Expected scan name: <YYYY-MM-DD>_<image number>.jpg
scan_name_re='^([0-9]{4}-[0-9]{2}-[0-9]{2})_([0-9]+)\.jpg$'

# Print the 1765 scans of the working directory, one per line, ordered by
# date and then by image number. The image number is compared numerically so
# that _10 comes after _9 instead of between _1 and _2. Files that do not
# look like a scan (for example already renamed ones) are left out.
list_scans_in_page_order() {
  local scan
  for scan in 1765-*.jpg; do
    [[ $scan =~ $scan_name_re ]] && printf '%s\n' "$scan"
  done | LC_ALL=C sort -t_ -k1,1 -k2,2n
}

# Succeed when a page is an inner page of its issue, i.e. the issue has more
# than four pages and the page is neither one of the first two nor one of the
# last two. Issue 27 is special-cased: none of its pages count as inner pages.
# Arguments: $1 - issue number, $2 - page position within the issue (1-based),
#            $3 - number of pages in the issue
is_inner_page() {
  local issue=$1 page=$2 total=$3
  (( issue != 27 && total > 4 && page > 2 && page <= total - 2 ))
}

# Counters are normally initialised at the top of the script; fall back to
# the same start values when they are not set.
: "${issue_number:=0}" "${global_page_number:=1}"
current_date=""

while IFS= read -r file; do
  # Extract date from filename
  scan_date=${file%%_*}

  # If it's a new date, increment issue number and reset page counter.
  # No scan of this date has been renamed yet, so the glob counts all of them.
  if [[ $scan_date != "$current_date" ]]; then
    current_date=$scan_date
    issue_number=$(( issue_number + 1 ))
    page_number=1
    issue_scans=("$scan_date"_*.jpg)
    pages_in_issue=${#issue_scans[@]}
  fi

  # Inner pages get a "b" after the issue number and are counted from 1
  # within the issue; all other pages take the next running page number.
  if is_inner_page "$issue_number" "$page_number" "$pages_in_issue"; then
    new_name="1765-$(pad "$issue_number")b-$(pad "$(( page_number - 2 ))").jpg"
  else
    new_name="1765-$(pad "$issue_number")-$(pad "$global_page_number").jpg"
    global_page_number=$(( global_page_number + 1 ))
  fi

  # Rename file
  mv -- "$file" "$new_name"

  # Increment page number
  page_number=$(( page_number + 1 ))
done < <(list_scans_in_page_order)

echo "Files have been renamed in the 'renamed_files' directory."
|
||||
Reference in New Issue
Block a user