#!/bin/bash
# Sort big files by length of lines, using zstd for compression.
# (C) F-Hauri.ch 2023 - Licensed under terms of GPL V3+
# Usage: $0 "/path/to/FileToSortByLengthOfLines"
#
# The input file is read completely twice: first to prepare the compression
# steps, then to split and compress.

shopt -s extglob
tdir=$(mktemp -d)

# First pass: collect the set of distinct line lengths present in the file.
mapfile -t sizes < <(
    awk '{ val[length($0)]+=1 }; END{ for (var in val) print var };' "$1")

# Open one zstd compressor per line length, keeping its file descriptor in
# cmpr[length] and its PID in subpids.
subpids=()
for size in "${sizes[@]}"; do
    printf -v file '%s/part_%08d' "$tdir" "$size"
    exec {cmpr[$size]}> >(zstd > "$file")
    subpids+=($!)            # $! holds the PID of the process substitution
done

# Turn the bash array declaration (via the @A transformation) into awk
# assignments of the form cfd[length]=fd, separated by semicolons.
awkBegin=${cmpr[@]@A}
awkBegin=${awkBegin##*\(}
awkBegin=${awkBegin%)}
awkBegin=${awkBegin// /;}

# Second pass: route every line to the compressor matching its length.
awk "BEGIN{ ${awkBegin//\[/cfd\[};};"'{
    print >(sprintf("/dev/fd/%d",cfd[length($0)])) };' < "$1"

# Close all descriptors so the compressors see EOF, then wait for them.
for size in "${sizes[@]}"; do
    exec {cmpr[$size]}>&-
done
wait "${subpids[@]}"

# Parts are zero-padded by line length, so the glob expands in length order.
zstdcat "$tdir"/part_*

rm "$tdir"/part_*
rmdir "$tdir"
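
# ---------------------------------------------------------------------------
# Example usage (a minimal sketch; the script name "sort-by-line-length.sh"
# and the sample data are illustrative assumptions, not part of the original):
#
#   printf '%s\n' ccc a dddd bb a > /tmp/sample.txt
#   ./sort-by-line-length.sh /tmp/sample.txt
#
# Each distinct line length gets its own zero-padded part file
# (part_00000001, part_00000002, ...), so zstdcat emits the groups in
# ascending length order while preserving the original order within a group:
#
#   a
#   a
#   bb
#   ccc
#   dddd
# ---------------------------------------------------------------------------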