project-finder/scripts/snapshot

223 lines
6.1 KiB
Bash
Executable File

#!/usr/bin/env bash
# Directory structure snapshot tool using fd (https://github.com/sharkdp/fd)
# Check that fd is available.
# The Debian/Ubuntu (and Fedora) "fd-find" packages install the binary as
# `fdfind`, so when plain `fd` is missing fall back to it through a wrapper
# function; the rest of the script can keep calling `fd`.
if ! command -v fd &>/dev/null; then
  if command -v fdfind &>/dev/null; then
    fd() { fdfind "$@"; }
  else
    # Diagnostics belong on stderr so they never end up in a redirected snapshot.
    {
      echo "Error: fd is not installed. Please install it first:"
      echo " - Debian/Ubuntu: sudo apt install fd-find"
      echo " - Fedora: sudo dnf install fd-find"
      echo " - Arch: sudo pacman -S fd"
      echo " - macOS: brew install fd"
      echo " - Cargo: cargo install fd-find"
    } >&2
    exit 1
  fi
fi
# Pick the search tool: start from plain grep and upgrade to ripgrep
# when an `rg` executable can be found.
GREP_CMD="grep"
if command -v rg &>/dev/null; then
  GREP_CMD="rg"
fi
# Usage information
#######################################
# Print the help text to stdout and terminate the script.
# Arguments: $1 - exit status to terminate with (optional, default 0), so
#                 error paths can show the help and still signal failure.
# Outputs:   help text on stdout
#######################################
usage() {
  local prog
  # Resolve the script name once instead of forking basename per line.
  prog=$(basename "$0")
  printf '%s\n' \
    "Usage: $prog [OPTIONS] DIRECTORY [DIRECTORY...]" \
    "Create a snapshot of directory structure(s) for benchmarking" \
    "" \
    "Options:" \
    " -o, --output FILE Output file (default: stdout)" \
    " Use {DATE} or {TIMESTAMP} for dynamic naming" \
    " -f, --format FORMAT Output format: csv or json (default: csv)" \
    " -h, --help Display this help message" \
    "" \
    "Examples:" \
    " $prog ~/projects" \
    " $prog -o snapshot.csv -f csv /path/to/dir" \
    " $prog -o snapshot-{DATE}.csv ~/dir1 ~/dir2" \
    " $prog -f json > snapshot.json"
  exit "${1:-0}"
}
# Default values
DIRECTORIES=()
OUTPUT="/dev/stdout"
FORMAT="csv"

#######################################
# Report a command-line error on stderr, show the help text, and exit 1.
# usage terminates whichever shell runs it, so its text is captured in a
# command-substitution subshell; that keeps the failing exit status and the
# stderr destination under this function's control.
# Arguments: $1 - error message (without the "Error: " prefix)
#######################################
fail_usage() {
  local help_text
  printf 'Error: %s\n' "$1" >&2
  help_text=$(usage)
  printf '%s\n' "$help_text" >&2
  exit 1
}

#######################################
# Parse the command line.
# Globals:   OUTPUT, FORMAT (written), DIRECTORIES (appended to)
# Arguments: the script's arguments
#######################################
parse_args() {
  local resolved
  while [[ $# -gt 0 ]]; do
    case "$1" in
    -o | --output)
      # Without this guard `shift 2` fails on a trailing option, shifts
      # nothing, and the loop would spin forever.
      [[ $# -ge 2 ]] || fail_usage "Option $1 requires an argument"
      OUTPUT="$2"
      shift 2
      ;;
    -f | --format)
      [[ $# -ge 2 ]] || fail_usage "Option $1 requires an argument"
      FORMAT="$2"
      shift 2
      ;;
    -h | --help)
      usage
      ;;
    *)
      if [[ -d "$1" ]]; then
        # Use cd + pwd instead of realpath for better performance.
        # CDPATH is cleared so a relative name cannot resolve elsewhere,
        # and `--` protects names that start with a dash.
        resolved=$(CDPATH='' cd -- "$1" 2>/dev/null && pwd) ||
          fail_usage "Cannot access directory: $1"
        DIRECTORIES+=("$resolved")
        shift
      else
        fail_usage "Unknown option or invalid directory: $1"
      fi
      ;;
    esac
  done
}

# Parse arguments
parse_args "$@"
# The parser consumes every argument; clear the script-level positional
# parameters too so later code sees the same (empty) list as before.
set --
# Check if at least one directory was provided; default to the current one
if [[ ${#DIRECTORIES[@]} -eq 0 ]]; then
  DIRECTORIES=("$(pwd)")
fi

# Take a single clock reading and derive every timestamp from it, so the
# file-name stamps, the human-readable stamp and the epoch always agree
# (separate `date` calls can straddle a second boundary).
SNAPSHOT_NOW=$(date +"%Y%m%d|%Y-%m-%d_%H-%M-%S|%Y-%m-%d %H:%M:%S|%s")
# DATE_SHORT / TIMESTAMP feed the output-name templates;
# TIMESTAMP_HUMAN / EPOCH are the snapshot's own timestamp.
IFS='|' read -r DATE_SHORT TIMESTAMP TIMESTAMP_HUMAN EPOCH <<<"$SNAPSHOT_NOW"

# Replace template variables in output filename
OUTPUT="${OUTPUT//\{DATE\}/$DATE_SHORT}"
OUTPUT="${OUTPUT//\{TIMESTAMP\}/$TIMESTAMP}"

# Ensure the output directory exists (nothing to create for /dev/stdout).
# Fail now rather than after the whole tree has been scanned.
OUTPUT_DIR=$(dirname -- "$OUTPUT")
if [[ "$OUTPUT_DIR" != "/dev" ]]; then
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    echo "Error: cannot create output directory: $OUTPUT_DIR" >&2
    exit 1
  fi
fi

# Create a temporary file for processing; it is removed on any exit path
if ! TEMP_DATA=$(mktemp); then
  echo "Error: cannot create temporary file" >&2
  exit 1
fi
trap 'rm -f -- "$TEMP_DATA"' EXIT
# Collect data from all directories more efficiently
for DIR in "${DIRECTORIES[@]}"; do
# Process entries directly without the separate xargs+stat calls
fd . "$DIR" -H -t f -t d -t l -0 | perl -0 -ne '
chomp;
$dir = "'$DIR'";
$path = $_;
if (-f $path) { $type = "file"; }
elsif (-d $path) { $type = "dir"; }
elsif (-l $path) { $type = "symlink"; }
else { $type = "other"; }
$rel_path = $path;
$rel_path =~ s/^\Q$dir\E\/?//;
$rel_path = "." if $rel_path eq "";
($size, $modified, $perms) = (stat($path))[7, 9, 2];
$perms = sprintf("%o", $perms & 07777);
print "$type|$dir|$path|$size|$modified|$perms\n";
' >>"$TEMP_DATA"
done
# Create output based on format

# awk helper shared by both writers: strip the snapshot directory from a
# path with plain string operations. (Building a regex from the directory
# name breaks on names containing ".", "+", "(", ... .)
SNAPSHOT_AWK_RELPATH='
function relative_path(path, root,    prefix) {
  if (path == root) return "."
  prefix = (root ~ /\/$/) ? root : root "/"
  if (index(path, prefix) == 1) return substr(path, length(prefix) + 1)
  return path
}'

#######################################
# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
#######################################
json_escape_string() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  printf '%s' "$s"
}

#######################################
# Write the snapshot as CSV.
# Arguments: $1 - file of "type|dir|path|size|mtime|perms" records
# Outputs:   header line plus one row per record on stdout; the directory
#            and the directory-relative path are quoted, with embedded
#            double quotes doubled.
#######################################
write_csv_snapshot() {
  echo "type,directory,path,size,modified,permissions"
  awk -F'|' "$SNAPSHOT_AWK_RELPATH"'
    function csv_quote(s) {
      gsub(/"/, "\"\"", s)
      return "\"" s "\""
    }
    {
      printf "%s,%s,%s,%s,%s,%s\n",
        $1, csv_quote($2), csv_quote(relative_path($3, $2)), $4, $5, $6
    }' "$1"
}

#######################################
# Write the snapshot as JSON.
# Globals:   TIMESTAMP_HUMAN, EPOCH, DIRECTORIES (read)
# Arguments: $1 - file of "type|dir|path|size|mtime|perms" records
# Outputs:   JSON document on stdout; all string values are escaped.
#######################################
write_json_snapshot() {
  local data_file=$1 dir separator=""
  echo "{"
  printf '%s\n' " \"timestamp\": \"$(json_escape_string "$TIMESTAMP_HUMAN")\","
  printf '%s\n' " \"epoch\": $EPOCH,"
  echo " \"directories\": ["
  # First output the list of directories
  for dir in "${DIRECTORIES[@]}"; do
    # Comma + newline between objects, nothing before the first one.
    printf '%s' "$separator"
    separator=$',\n'
    echo " {"
    printf '%s\n' " \"path\": \"$(json_escape_string "$dir")\","
    printf '%s\n' " \"name\": \"$(json_escape_string "$(basename -- "$dir")")\""
    printf '%s' " }"
  done
  echo ""
  echo " ],"
  echo " \"entries\": ["
  # Process entries for JSON output with proper null handling
  awk -F'|' "$SNAPSHOT_AWK_RELPATH"'
    # Escape character by character; this avoids the awk-dialect-specific
    # backslash rules of gsub replacement strings.
    function json_escape(s,    out, i, n, c) {
      out = ""
      n = length(s)
      for (i = 1; i <= n; i++) {
        c = substr(s, i, 1)
        if (c == "\\") out = out "\\\\"
        else if (c == "\"") out = out "\\\""
        else if (c in ctrl) out = out ctrl[c]
        else out = out c
      }
      return out
    }
    BEGIN {
      first = 1
      # Control characters are emitted as \u00XX escapes.
      for (i = 1; i < 32; i++) ctrl[sprintf("%c", i)] = sprintf("\\u%04x", i)
    }
    {
      if (!first) printf ",\n"
      size = $4
      modified = $5
      perms = $6
      printf " {\n \"type\": \"%s\",\n \"directory\": \"%s\",\n \"path\": \"%s\"",
        json_escape($1), json_escape($2), json_escape(relative_path($3, $2))
      # Handle potentially null values.
      # NOTE(review): a size of 0 is also written as null (kept as-is for
      # compatibility), so empty files are indistinguishable from entries
      # whose size is unknown - confirm this is intended.
      if (size == "" || size == 0)
        printf ",\n \"size\": null"
      else
        printf ",\n \"size\": %s", size
      if (modified == "")
        printf ",\n \"modified\": null"
      else
        printf ",\n \"modified\": %s", modified
      if (perms == "")
        printf ",\n \"permissions\": null"
      else
        printf ",\n \"permissions\": \"%s\"", json_escape(perms)
      printf "\n }"
      first = 0
    }' "$data_file"
  echo ""
  echo " ]"
  echo "}"
}

# Dispatch on the requested format. The ${VAR-default} fallbacks only apply
# when a variable is entirely unset and mirror the script's own defaults.
case "${FORMAT-csv}" in
csv)
  write_csv_snapshot "${TEMP_DATA-/dev/null}" >"${OUTPUT-/dev/stdout}"
  ;;
json)
  write_json_snapshot "${TEMP_DATA-/dev/null}" >"${OUTPUT-/dev/stdout}"
  ;;
*)
  # Diagnostics go to stderr so they never pollute a redirected snapshot.
  {
    echo "Error: Unknown format: $FORMAT"
    echo "Supported formats: csv, json"
  } >&2
  exit 1
  ;;
esac
# Announce where the snapshot went, except when it was streamed to stdout
# (the message would otherwise land in the snapshot data itself).
case "$OUTPUT" in
/dev/stdout) ;;
*) echo "Snapshot created: $OUTPUT" ;;
esac