#!/bin/bash
#
# Problem is: Some UTF-8 characters could need, sometime more, sometime
# less than one character width [em] in monospaced font.
# Run this again, with any (dummy) argument for best formated result:
#     $0 workaround

# NOTE: the header lines above that start with "# " are printed verbatim as
# the help text when the script is run without arguments (see the end of the
# file); reading stops at the first empty line.  Keep that block and the blank
# line after it intact.

# Sample strings mixing 1-, 2-, 3- and 4-byte UTF-8 characters.
testStrings=(
  Généralités Language Théorème Février "Left: ←" "Yin Yang ☯"
  "I'm 􏹤􏸎􏸫􏸫􏸲" "󰽞 󱀸" "🀽🁗🁊🁓" "𝙎𝙤𝙧𝙧𝙮" "  " "對不起"
  "죄송합니" "ⷨⷪⷢ" "ⶦⶰⶑ" "⸺⸻" "⽄⾃"
)

#######################################
# Compare the byte length and the character length of a string.
# Syntax:    strU8DiffLen <string> <diffVarName> <detailVarName>
# Arguments:
#   $1 - string to analyse (split into characters using the caller's locale)
#   $2 - name of the variable receiving (byte length - character length)
#   $3 - name of the variable receiving a space separated summary with one
#        entry per distinct multi-byte character, in order of first
#        appearance, formatted as <count>x<first byte, hex>+<extra bytes>(<char>)
# Notes:
#   Internal names start with '_' so they do not shadow the caller's output
#   variables.  The locale override is local to the function and is undone
#   when it returns.
#######################################
strU8DiffLen() {
  local _str=$1 _nchars=${#1} _i _ch _q _nbytes _out=''
  local -a _chars=() _order=()
  local -A _count=() _info=()

  # Split into characters while the caller's locale is still in effect.
  for ((_i = 0; _i < _nchars; _i++)); do
    _chars+=("${_str:_i:1}")
  done

  # From here on everything is byte oriented.  LC_ALL is overridden as well
  # as LANG because LC_ALL, when set by the caller, takes precedence.
  local LC_ALL=C LANG=C

  printf -v "$2" '%d' "$((${#_str} - _nchars))"

  for _ch in "${_chars[@]}"; do
    _nbytes=${#_ch}
    # Single-byte characters (ASCII, including glob and control characters)
    # are never reported.
    ((_nbytes > 1)) || continue

    if [[ -z ${_info[$_ch]-} ]]; then
      # In the C locale %q renders a multi-byte character as $'\ooo\ooo...';
      # the first octal triplet is the lead byte.
      printf -v _q '%q' "$_ch"
      # Not expected to fail for a multi-byte character; skip rather than
      # feed a non-number to the arithmetic below.
      [[ $_q == "\$'\\"[0-7][0-7][0-7]* ]] || continue
      printf -v _q '%02X+%d(%s)' "$((8#${_q:3:3}))" "$((_nbytes - 1))" "$_ch"
      _info[$_ch]=$_q
      _count[$_ch]=0
      _order+=("$_ch")
    fi
    # Expand the stored value first so the subscript is never parsed by the
    # arithmetic evaluator.
    _count[$_ch]=$((${_count[$_ch]} + 1))
  done

  for _ch in "${_order[@]}"; do
    _out+="${_count[$_ch]}x${_info[$_ch]} "
  done
  printf -v "$3" '%s' "${_out% }"
}

for string in "${testStrings[@]}"; do
  strU8DiffLen "$string" diff detail
  if [[ -z ${1-} ]]; then
    # printf pads by bytes, so widen the field by the byte/char difference.
    printf " - %-$((14 + diff))s%2d chrs, %2d bts [%s]\n" \
      "'$string'" "${#string}" "$((${#string} + diff))" "$detail"
  else
    # Workaround: let the terminal do the alignment (carriage return, then
    # move the cursor 17 columns to the right).
    printf " - %s\r\e[17C%2d chrs, %2d bts [%s]\n" \
      "'$string'" "${#string}" "$((${#string} + diff))" "$detail"
  fi
done

# Without arguments, print the header comment of this file as a hint,
# replacing the literal "$0" with the script's name.
if (($# == 0)); then
  while IFS= read -r line && [[ -n $line ]]; do
    if [[ ${line:1:1} == ' ' ]]; then
      line=${line:2}
      printf '%s\n' "${line//\$0/$0}"
    fi
  done <"$0"
fi