In a Bash script I would like to split a line into pieces and store them in an array.
The line:
Paris, France, Europe
I would like
The key to splitting your string into an array is the multi character delimiter of ", "
. Any solution using IFS
for multi character delimiters is inherently wrong since IFS is a set of those characters, not a string.
If you assign IFS=", "
then the string will break on EITHER ","
OR " "
or any combination of them which is not an accurate representation of the two character delimiter of ", "
.
You can use awk
or sed
to split the string, with process substitution:
#!/bin/bash
str="Paris, France, Europe"
array=()
while read -r -d $'\0' each; do # use a NUL terminated field separator
array+=("$each")
done < <(printf "%s" "$str" | awk '{ gsub(/,[ ]+|$/,"\0"); print }')
declare -p array
# declare -a array=([0]="Paris" [1]="France" [2]="Europe") output
It is more efficient to use a regex you directly in Bash:
#!/bin/bash
str="Paris, France, Europe"
array=()
while [[ $str =~ ([^,]+)(,[ ]+|$) ]]; do
array+=("${BASH_REMATCH[1]}") # capture the field
i=${#BASH_REMATCH} # length of field + delimiter
str=${str:i} # advance the string by that length
done # the loop deletes $str, so make a copy if needed
declare -p array
# declare -a array=([0]="Paris" [1]="France" [2]="Europe") output...
With the second form, there is no sub shell and it will be inherently faster.
Edit by bgoldst: Here are some benchmarks comparing my readarray
solution to dawg's regex solution, and I also included the read
solution for the heck of it (note: I slightly modified the regex solution for greater harmony with my solution) (also see my comments below the post):
## competitors
function c_readarray { readarray -td '' a < <(awk '{ gsub(/, /,"\0"); print; };' <<<"$1, "); unset 'a[-1]'; };
function c_read { a=(); local REPLY=''; while read -r -d ''; do a+=("$REPLY"); done < <(awk '{ gsub(/, /,"\0"); print; };' <<<"$1, "); };
function c_regex { a=(); local s="$1, "; while [[ $s =~ ([^,]+),\ ]]; do a+=("${BASH_REMATCH[1]}"); s=${s:${#BASH_REMATCH}}; done; };
## helper functions
function rep {
local -i i=-1;
for ((i = 0; i<$1; ++i)); do
printf %s "$2";
done;
}; ## end rep()
function testAll {
local funcs=();
local args=();
local func='';
local -i rc=-1;
while [[ "$1" != ':' ]]; do
func="$1";
if [[ ! "$func" =~ ^[_a-zA-Z][_a-zA-Z0-9]*$ ]]; then
echo "bad function name: $func" >&2;
return 2;
fi;
funcs+=("$func");
shift;
done;
shift;
args=("$@");
for func in "${funcs[@]}"; do
echo -n "$func ";
{ time $func "${args[@]}" >/dev/null 2>&1; } 2>&1| tr '\n' '/';
rc=${PIPESTATUS[0]}; if [[ $rc -ne 0 ]]; then echo "[$rc]"; else echo; fi;
done| column -ts/;
}; ## end testAll()
function makeStringToSplit {
local -i n=$1; ## number of fields
if [[ $n -lt 0 ]]; then echo "bad field count: $n" >&2; return 2; fi;
if [[ $n -eq 0 ]]; then
echo;
elif [[ $n -eq 1 ]]; then
echo 'first field';
elif [[ "$n" -eq 2 ]]; then
echo 'first field, last field';
else
echo "first field, $(rep $[$1-2] 'mid field, ')last field";
fi;
}; ## end makeStringToSplit()
function testAll_splitIntoArray {
local -i n=$1; ## number of fields in input string
local s='';
echo "===== $n field$(if [[ $n -ne 1 ]]; then echo 's'; fi;) =====";
s="$(makeStringToSplit "$n")";
testAll c_readarray c_read c_regex : "$s";
}; ## end testAll_splitIntoArray()
## results
testAll_splitIntoArray 1;
## ===== 1 field =====
## c_readarray real 0m0.067s user 0m0.000s sys 0m0.000s
## c_read real 0m0.064s user 0m0.000s sys 0m0.000s
## c_regex real 0m0.000s user 0m0.000s sys 0m0.000s
##
testAll_splitIntoArray 10;
## ===== 10 fields =====
## c_readarray real 0m0.067s user 0m0.000s sys 0m0.000s
## c_read real 0m0.064s user 0m0.000s sys 0m0.000s
## c_regex real 0m0.001s user 0m0.000s sys 0m0.000s
##
testAll_splitIntoArray 100;
## ===== 100 fields =====
## c_readarray real 0m0.069s user 0m0.000s sys 0m0.062s
## c_read real 0m0.065s user 0m0.000s sys 0m0.046s
## c_regex real 0m0.005s user 0m0.000s sys 0m0.000s
##
testAll_splitIntoArray 1000;
## ===== 1000 fields =====
## c_readarray real 0m0.084s user 0m0.031s sys 0m0.077s
## c_read real 0m0.092s user 0m0.031s sys 0m0.046s
## c_regex real 0m0.125s user 0m0.125s sys 0m0.000s
##
testAll_splitIntoArray 10000;
## ===== 10000 fields =====
## c_readarray real 0m0.209s user 0m0.093s sys 0m0.108s
## c_read real 0m0.333s user 0m0.234s sys 0m0.109s
## c_regex real 0m9.095s user 0m9.078s sys 0m0.000s
##
testAll_splitIntoArray 100000;
## ===== 100000 fields =====
## c_readarray real 0m1.460s user 0m0.326s sys 0m1.124s
## c_read real 0m2.780s user 0m1.686s sys 0m1.092s
## c_regex real 17m38.208s user 15m16.359s sys 2m19.375s
##