diff --git a/.gitignore b/.gitignore
index ea8c4bf..22662ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+/benches/fixtures
diff --git a/justfile b/justfile
new file mode 100644
index 0000000..0171a63
--- /dev/null
+++ b/justfile
@@ -0,0 +1,2 @@
+snapshot +PATHS:
+    ./scripts/snapshot -o benches/fixtures/"snapshot-{TIMESTAMP}.csv" -f csv {{PATHS}}
diff --git a/scripts/snapshot b/scripts/snapshot
new file mode 100755
index 0000000..9eb0bfd
--- /dev/null
+++ b/scripts/snapshot
@@ -0,0 +1,286 @@
+#!/usr/bin/env bash
+# Directory structure snapshot tool using fd (https://github.com/sharkdp/fd)
+#
+# For every file, directory and symlink below the given directories, records
+# the entry type, the scanned root, the path relative to that root, the size
+# in bytes, the mtime (epoch seconds) and the octal permission bits, then
+# writes the result as CSV or JSON to stdout or to a file.
+#
+# Requires: fd (or fdfind) and perl.
+
+# A pipeline fails if any stage fails, so a broken fd run is noticed instead
+# of being hidden behind a successful perl.
+set -o pipefail
+
+# Print an error message to stderr and abort with a non-zero status.
+die() {
+  printf 'Error: %s\n' "$*" >&2
+  exit 1
+}
+
+# Locate fd. Debian/Ubuntu install the binary as "fdfind", so accept both.
+FD_CMD=""
+for candidate in fd fdfind; do
+  if command -v "$candidate" &>/dev/null; then
+    FD_CMD="$candidate"
+    break
+  fi
+done
+if [[ -z "$FD_CMD" ]]; then
+  {
+    echo "Error: fd is not installed. Please install it first:"
+    echo "  - Debian/Ubuntu: sudo apt install fd-find"
+    echo "  - Fedora: sudo dnf install fd-find"
+    echo "  - Arch: sudo pacman -S fd"
+    echo "  - macOS: brew install fd"
+    echo "  - Cargo: cargo install fd-find"
+  } >&2
+  exit 1
+fi
+
+# perl stats every entry and does the CSV/JSON escaping.
+command -v perl &>/dev/null || die "perl is not installed"
+
+# Print the help text and exit with the given status (default: 0).
+usage() {
+  local status="${1:-0}"
+  local prog
+  prog=$(basename -- "$0")
+  echo "Usage: $prog [OPTIONS] DIRECTORY [DIRECTORY...]"
+  echo "Create a snapshot of directory structure(s) for benchmarking"
+  echo
+  echo "Options:"
+  echo "  -o, --output FILE     Output file (default: stdout)"
+  echo "                        Use {DATE} or {TIMESTAMP} for dynamic naming"
+  echo "  -f, --format FORMAT   Output format: csv or json (default: csv)"
+  echo "  -h, --help            Display this help message"
+  echo
+  echo "Examples:"
+  echo "  $prog ~/projects"
+  echo "  $prog -o snapshot.csv -f csv /path/to/dir"
+  echo "  $prog -o snapshot-{DATE}.csv ~/dir1 ~/dir2"
+  echo "  $prog -f json > snapshot.json"
+  exit "$status"
+}
+
+# Print $1 as a double-quoted JSON string. Escapes backslash, double quote,
+# newline, carriage return and tab; other control characters are passed
+# through unchanged.
+json_string() {
+  local s="$1"
+  s=${s//\\/\\\\}
+  s=${s//\"/\\\"}
+  s=${s//$'\n'/\\n}
+  s=${s//$'\r'/\\r}
+  s=${s//$'\t'/\\t}
+  printf '"%s"' "$s"
+}
+
+# Default values
+DIRECTORIES=()
+OUTPUT="/dev/stdout"
+FORMAT="csv"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+  -o | --output)
+    # Without this check "shift 2" fails on a trailing -o and the loop never ends.
+    [[ $# -ge 2 ]] || die "option $1 requires an argument"
+    OUTPUT="$2"
+    shift 2
+    ;;
+  -f | --format)
+    [[ $# -ge 2 ]] || die "option $1 requires an argument"
+    FORMAT="$2"
+    shift 2
+    ;;
+  -h | --help)
+    usage
+    ;;
+  *)
+    if [[ -d "$1" ]]; then
+      # Use cd + pwd instead of realpath for better performance.
+      # CDPATH is cleared so cd resolves $1 against the current directory only.
+      resolved=$(CDPATH='' cd -- "$1" && pwd) || die "cannot enter directory: $1"
+      DIRECTORIES+=("$resolved")
+      shift
+    else
+      echo "Error: Unknown option or invalid directory: $1" >&2
+      usage 1 >&2
+    fi
+    ;;
+  esac
+done
+
+# Reject an unsupported format before doing any scanning.
+case "$FORMAT" in
+csv | json) ;;
+*)
+  echo "Error: Unknown format: $FORMAT" >&2
+  echo "Supported formats: csv, json" >&2
+  exit 1
+  ;;
+esac
+
+# Fall back to the current directory if none was provided
+if [[ ${#DIRECTORIES[@]} -eq 0 ]]; then
+  DIRECTORIES=("$(pwd)")
+fi
+
+# Replace template variables in output filename
+DATE_SHORT=$(date +"%Y%m%d")
+TIMESTAMP=$(date +"%Y-%m-%d_%H-%M-%S")
+OUTPUT="${OUTPUT//\{DATE\}/$DATE_SHORT}"
+OUTPUT="${OUTPUT//\{TIMESTAMP\}/$TIMESTAMP}"
+
+# Ensure the output directory exists
+OUTPUT_DIR=$(dirname -- "$OUTPUT")
+if [[ "$OUTPUT_DIR" != "/dev" ]]; then
+  mkdir -p -- "$OUTPUT_DIR" || die "cannot create output directory: $OUTPUT_DIR"
+fi
+
+# Timestamp for the snapshot
+TIMESTAMP_HUMAN=$(date +"%Y-%m-%d %H:%M:%S")
+EPOCH=$(date +%s)
+
+# Create a temporary file for the rendered entries
+TEMP_DATA=$(mktemp) || die "mktemp failed"
+trap 'rm -f -- "$TEMP_DATA"' EXIT
+
+# Perl program run once per scanned root. It reads NUL-separated paths on
+# stdin and prints one finished CSV row or JSON object per path.
+#   SNAPSHOT_ROOT      the scanned root (passed via the environment so that
+#                      quotes, "$" or "@" in the path never reach perl source)
+#   SNAPSHOT_FORMAT    csv or json
+#   SNAPSHOT_CONTINUE  non-empty if earlier roots already produced entries
+# The program must not contain single quotes: it lives in a shell '...' string.
+PERL_RENDER='
+  BEGIN {
+    $root = $ENV{SNAPSHOT_ROOT};
+    $fmt  = $ENV{SNAPSHOT_FORMAT};
+    $sep  = $ENV{SNAPSHOT_CONTINUE} ? ",\n" : "";
+
+    # CSV: wrap in double quotes, double any embedded quote.
+    sub csv_quote {
+      my ($s) = @_;
+      $s =~ s/"/""/g;
+      return "\"$s\"";
+    }
+
+    # JSON: escape quote, backslash and all control characters.
+    sub json_quote {
+      my ($s) = @_;
+      $s =~ s/(["\\])/\\$1/g;
+      $s =~ s/\n/\\n/g;
+      $s =~ s/\r/\\r/g;
+      $s =~ s/\t/\\t/g;
+      $s =~ s/([\x00-\x1f])/sprintf("\\u%04x", ord($1))/ge;
+      return "\"$s\"";
+    }
+  }
+
+  chomp;
+  $path = $_;
+
+  # lstat rather than stat: a symlink is reported as a symlink, with its own
+  # metadata, instead of being followed and classified as its target.
+  @st = lstat($path);
+  if    (!@st) { $type = "other"; }
+  elsif (-l _) { $type = "symlink"; }
+  elsif (-d _) { $type = "dir"; }
+  elsif (-f _) { $type = "file"; }
+  else         { $type = "other"; }
+
+  # Path relative to the scanned root; \Q...\E keeps the root literal.
+  $rel = $path;
+  $rel =~ s/^\Q$root\E\/?//;
+  $rel = "." if $rel eq "";
+
+  # Undefined when lstat failed (for example the entry vanished mid-scan).
+  ($size, $mtime, $perms) = @st ? @st[7, 9, 2] : ();
+  $perms = sprintf("%o", $perms & 07777) if defined $perms;
+
+  if ($fmt eq "json") {
+    # Unknown values become null; a size of 0 is a real value and stays 0.
+    print $sep,
+      "    {\n",
+      "      \"type\": \"$type\",\n",
+      "      \"directory\": ", json_quote($root), ",\n",
+      "      \"path\": ", json_quote($rel), ",\n",
+      "      \"size\": ", (defined $size ? $size : "null"), ",\n",
+      "      \"modified\": ", (defined $mtime ? $mtime : "null"), ",\n",
+      "      \"permissions\": ", (defined $perms ? "\"$perms\"" : "null"), "\n",
+      "    }";
+    $sep = ",\n";
+  } else {
+    for ($size, $mtime, $perms) { $_ = "" unless defined $_; }
+    print join(",", $type, csv_quote($root), csv_quote($rel),
+      $size, $mtime, $perms), "\n";
+  }
+'
+
+# Collect and render the entries of every requested directory
+for DIR in "${DIRECTORIES[@]}"; do
+  # Entries already in the temp file mean the next JSON object needs a comma.
+  HAVE_ENTRIES=""
+  if [[ -s "$TEMP_DATA" ]]; then
+    HAVE_ENTRIES=1
+  fi
+
+  # -H: include hidden entries; -t f/d/l: files, directories, symlinks;
+  # -0: NUL-separated output, so any path character survives the pipe.
+  if ! "$FD_CMD" . "$DIR" -H -t f -t d -t l -0 |
+    SNAPSHOT_ROOT="$DIR" SNAPSHOT_FORMAT="$FORMAT" \
+      SNAPSHOT_CONTINUE="$HAVE_ENTRIES" \
+      perl -0 -ne "$PERL_RENDER" >>"$TEMP_DATA"; then
+    echo "Warning: scanning $DIR did not finish cleanly" >&2
+  fi
+done
+
+# Create output based on format
+case "$FORMAT" in
+csv)
+  {
+    echo "type,directory,path,size,modified,permissions"
+    cat -- "$TEMP_DATA"
+  } >"$OUTPUT" || die "cannot write to $OUTPUT"
+  ;;
+json)
+  {
+    echo "{"
+    echo "  \"timestamp\": \"$TIMESTAMP_HUMAN\","
+    echo "  \"epoch\": $EPOCH,"
+    echo "  \"directories\": ["
+
+    # First output the list of directories
+    first_dir=true
+    for DIR in "${DIRECTORIES[@]}"; do
+      if $first_dir; then
+        first_dir=false
+      else
+        echo ","
+      fi
+      echo "    {"
+      echo "      \"path\": $(json_string "$DIR"),"
+      echo "      \"name\": $(json_string "$(basename -- "$DIR")")"
+      echo -n "    }"
+    done
+    echo ""
+    echo "  ],"
+    echo "  \"entries\": ["
+
+    # Entries were already rendered (and comma-separated) by perl
+    cat -- "$TEMP_DATA"
+
+    echo ""
+    echo "  ]"
+    echo "}"
+  } >"$OUTPUT" || die "cannot write to $OUTPUT"
+  ;;
+esac
+
+# If output is not stdout, print a confirmation message
+if [[ "$OUTPUT" != "/dev/stdout" ]]; then
+  echo "Snapshot created: $OUTPUT"
+fi