#!/usr/bin/env bash
# Directory structure snapshot tool using fd (https://github.com/sharkdp/fd)
#
# Scans one or more directories and writes a CSV or JSON snapshot containing
# the type, directory, path, size, modification time and permissions of each
# entry. Diagnostics go to stderr so they never mix with snapshot data that
# is written to stdout.

# Check if fd is installed
if ! command -v fd &>/dev/null; then
  {
    echo "Error: fd is not installed. Please install it first:"
    echo " - Debian/Ubuntu: sudo apt install fd-find"
    echo " - Fedora: sudo dnf install fd-find"
    echo " - Arch: sudo pacman -S fd"
    echo " - macOS: brew install fd"
    echo " - Cargo: cargo install fd-find"
  } >&2
  exit 1
fi

# Check if ripgrep is installed, fall back to grep if not
# NOTE(review): GREP_CMD is not referenced anywhere else in the visible
# script; kept in case another part of the tooling relies on it.
if command -v rg &>/dev/null; then
  GREP_CMD="rg"
else
  GREP_CMD="grep"
fi

#######################################
# Print usage information and exit.
# Arguments: $1 - exit status (optional, default 0)
# Outputs:   usage text on stdout when the status is 0 (explicit --help),
#            on stderr otherwise (usage shown because of an error)
#######################################
usage() {
  local status="${1:-0}"
  local stream=1
  local prog
  prog=$(basename "$0")

  if [[ "$status" != "0" ]]; then
    stream=2
  fi

  {
    echo "Usage: $prog [OPTIONS] DIRECTORY [DIRECTORY...]"
    echo "Create a snapshot of directory structure(s) for benchmarking"
    echo
    echo "Options:"
    echo " -o, --output FILE Output file (default: stdout)"
    echo " Use {DATE} or {TIMESTAMP} for dynamic naming"
    echo " -f, --format FORMAT Output format: csv or json (default: csv)"
    echo " -h, --help Display this help message"
    echo
    echo "Examples:"
    echo " $prog ~/projects"
    echo " $prog -o snapshot.csv -f csv /path/to/dir"
    echo " $prog -o snapshot-{DATE}.csv ~/dir1 ~/dir2"
    echo " $prog -f json > snapshot.json"
  } >&"$stream"

  exit "$status"
}

# Default values
DIRECTORIES=()
OUTPUT="/dev/stdout"
FORMAT="csv"

# Parse arguments
while [[ $# -gt 0 ]]; do
  case "$1" in
    -o | --output)
      # Without this guard "shift 2" fails when the option is the last
      # argument, nothing is shifted and the loop never terminates.
      if [[ $# -lt 2 ]]; then
        echo "Error: Option $1 requires an argument" >&2
        usage 1
      fi
      OUTPUT="$2"
      shift 2
      ;;
    -f | --format)
      if [[ $# -lt 2 ]]; then
        echo "Error: Option $1 requires an argument" >&2
        usage 1
      fi
      FORMAT="$2"
      shift 2
      ;;
    -h | --help)
      usage
      ;;
    *)
      if [[ -d "$1" ]]; then
        # Use cd + pwd instead of realpath for better performance.
        # CDPATH is cleared so cd cannot print or pick another directory,
        # and "--" protects directory names that start with a dash.
        if ! resolved_dir=$(CDPATH='' cd -- "$1" && pwd); then
          echo "Error: Cannot access directory: $1" >&2
          exit 1
        fi
        DIRECTORIES+=("$resolved_dir")
        shift
      else
        echo "Error: Unknown option or invalid directory: $1" >&2
        usage 1
      fi
      ;;
  esac
done

# Fall back to the current directory when none was given
if [[ ${#DIRECTORIES[@]} -eq 0 ]]; then
  DIRECTORIES=("$(pwd)")
fi

# Replace template variables in output filename
DATE_SHORT=$(date +"%Y%m%d")
TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
OUTPUT="${OUTPUT//\{DATE\}/$DATE_SHORT}"
OUTPUT="${OUTPUT//\{TIMESTAMP\}/$TIMESTAMP}"

# Ensure the output directory exists
OUTPUT_DIR=$(dirname -- "$OUTPUT")
if [[ "$OUTPUT_DIR" != "/dev" ]]; then
  if ! mkdir -p -- "$OUTPUT_DIR"; then
    echo "Error: Cannot create output directory: $OUTPUT_DIR" >&2
    exit 1
  fi
fi

# Reject unsupported formats before spending time on the scan
case "$FORMAT" in
  csv | json) ;;
  *)
    echo "Error: Unknown format: $FORMAT" >&2
    echo "Supported formats: csv, json" >&2
    exit 1
    ;;
esac

# Timestamp for the snapshot
TIMESTAMP_HUMAN=$(date +"%Y-%m-%d %H:%M:%S")
EPOCH=$(date +%s)

# Create a temporary file for processing
if ! TEMP_DATA=$(mktemp); then
  echo "Error: Cannot create temporary file" >&2
  exit 1
fi
trap 'rm -f -- "$TEMP_DATA"' EXIT

#######################################
# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (without surrounding quotes)
#######################################
json_escape() {
  perl -e 'my $s = shift; $s =~ s/(["\\])/\\$1/g; $s =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge; print $s' -- "$1"
}

# Perl program run once per NUL-terminated path (perl -0 -n). It receives the
# scanned root in SNAPSHOT_ROOT and the output format in SNAPSHOT_FORMAT via
# the environment, so directory names are never spliced into Perl source.
# It prints each entry already formatted and escaped for the final output:
#   csv  - one complete CSV row per entry
#   json - one object per entry, each preceded by ",\n" (the very first
#          separator is stripped when the document is assembled)
# shellcheck disable=SC2016  # Perl source, must not be expanded by the shell
ENTRY_FORMATTER='
  use strict;
  use warnings;

  sub csv_quote {
    my ($text) = @_;
    $text =~ s/"/""/g;
    return $text;
  }

  sub json_quote {
    my ($text) = @_;
    $text =~ s/(["\\])/\\$1/g;
    $text =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;
    return $text;
  }

  chomp;
  my $path   = $_;
  my $root   = $ENV{SNAPSHOT_ROOT};
  my $format = $ENV{SNAPSHOT_FORMAT};

  # lstat so that a symlink is reported as a symlink instead of as its target
  my @st = lstat($path);
  my $type = "other";
  my ($size, $modified, $perms) = ("", "", "");
  if (@st) {
    if    (-l _) { $type = "symlink"; }
    elsif (-f _) { $type = "file"; }
    elsif (-d _) { $type = "dir"; }
    ($size, $modified) = @st[7, 9];
    $perms = sprintf("%o", $st[2] & 07777);
  }

  # Literal (non-regex) removal of the root prefix
  my $prefix = $root =~ m{/$} ? $root : "$root/";
  my $rel = $path;
  $rel = substr($rel, length($prefix)) if index($rel, $prefix) == 0;
  $rel = "." if $rel eq "" || $rel eq $root;

  if ($format eq "csv") {
    printf "%s,\"%s\",\"%s\",%s,%s,%s\n",
      $type, csv_quote($root), csv_quote($rel), $size, $modified, $perms;
  } else {
    printf ",\n {\n \"type\": \"%s\",\n \"directory\": \"%s\",\n \"path\": \"%s\"",
      $type, json_quote($root), json_quote($rel);
    # Only a failed lstat yields null; a size of 0 is a real value
    printf ",\n \"size\": %s", $size eq "" ? "null" : $size;
    printf ",\n \"modified\": %s", $modified eq "" ? "null" : $modified;
    printf ",\n \"permissions\": %s", $perms eq "" ? "null" : "\"$perms\"";
    print "\n }";
  }
'

# Collect data from all directories
for DIR in "${DIRECTORIES[@]}"; do
  fd . "$DIR" -H -t f -t d -t l -0 \
    | SNAPSHOT_ROOT="$DIR" SNAPSHOT_FORMAT="$FORMAT" \
      perl -0 -ne "$ENTRY_FORMATTER" >>"$TEMP_DATA"
done

# Create output based on format
case "$FORMAT" in
  csv)
    {
      echo "type,directory,path,size,modified,permissions"
      cat -- "$TEMP_DATA"
    } >"$OUTPUT" || exit 1
    ;;
  json)
    {
      echo "{"
      echo " \"timestamp\": \"$TIMESTAMP_HUMAN\","
      echo " \"epoch\": $EPOCH,"
      echo " \"directories\": ["

      # First output the list of directories
      first_dir=true
      for DIR in "${DIRECTORIES[@]}"; do
        if $first_dir; then
          first_dir=false
        else
          echo ","
        fi
        echo " {"
        echo " \"path\": \"$(json_escape "$DIR")\","
        echo " \"name\": \"$(json_escape "$(basename -- "$DIR")")\""
        printf ' }'
      done
      echo ""
      echo " ],"
      echo " \"entries\": ["

      # Every stored object starts with ",\n"; skip those first two bytes so
      # the separators end up only between objects.
      tail -c +3 "$TEMP_DATA"
      echo ""
      echo " ]"
      echo "}"
    } >"$OUTPUT" || exit 1
    ;;
esac

# If output is not stdout, print a confirmation message
if [[ "$OUTPUT" != "/dev/stdout" ]]; then
  echo "Snapshot created: $OUTPUT"
fi