From ff0f245fc108b99dde3bd671504ef74976179d5e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Sat, 18 Oct 2025 17:44:49 +0100 Subject: [PATCH 6/6] numfmt: support multi-byte --delimiter * bootstrap.conf: Depend on mbsstr() to robustly search for a multi-byte delimiter character (string) within a multi-byte string. * src/numfmt.c (main): Accept a valid multi-byte delimiter character. (next_field): Adjust delimiter search from single byte to multi-byte aware. Use mbsstr to find the first match. * tests/misc/numfmt.pl: Add test case. * NEWS: Mention the improvement. --- NEWS | 3 ++- bootstrap.conf | 1 + src/numfmt.c | 46 +++++++++++++++++++++++--------------------- tests/misc/numfmt.pl | 7 +++++++ 4 files changed, 34 insertions(+), 23 deletions(-) diff --git a/NEWS b/NEWS index f80363f87..c5b94d792 100644 --- a/NEWS +++ b/NEWS @@ -44,7 +44,8 @@ GNU coreutils NEWS -*- outline -*- ** Improvements numfmt now parses numbers with a non-breaking space character before a unit, - and numbers containing grouping characters from the current locale. + and parses numbers containing grouping characters from the current locale. + It also supports a multi-byte --delimiter character. wc -l now operates 10% faster on hosts that support AVX512 instructions. diff --git a/bootstrap.conf b/bootstrap.conf index 8f9194341..5125d6697 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -176,6 +176,7 @@ gnulib_modules=" mbs_endswith mbschr mbslen + mbsstr mbswidth mbszero mcel-prefer diff --git a/src/numfmt.c b/src/numfmt.c index cc80ccc5d..0f0a8770b 100644 --- a/src/numfmt.c +++ b/src/numfmt.c @@ -156,9 +156,6 @@ static struct option const longopts[] = {nullptr, 0, nullptr, 0} }; -/* If delimiter has this value, blanks separate fields. */ -enum { DELIMITER_DEFAULT = CHAR_MAX + 1 }; - /* Maximum number of digits we can safely handle without precision loss, if scaling is 'none'.
*/ enum { MAX_UNSCALED_DIGITS = LDBL_DIG }; @@ -194,8 +191,8 @@ static int conv_exit_code = EXIT_CONVERSION_WARNINGS; /* auto-pad each line based on skipped whitespace. */ static int auto_padding = 0; -/* field delimiter */ -static int delimiter = DELIMITER_DEFAULT; +/* field delimiter - if nullptr, blanks separate fields. */ +static char const *delimiter = nullptr; /* line delimiter. */ static unsigned char line_delim = '\n'; @@ -1374,14 +1371,10 @@ next_field (char **line) char *field_start = *line; char *field_end = field_start; - if (delimiter != DELIMITER_DEFAULT) + if (delimiter) { - if (*field_start != delimiter) - { - while (*field_end && *field_end != delimiter) - ++field_end; - } - /* else empty field */ + if (! *delimiter || ! (field_end = mbsstr (field_start, delimiter))) + field_end = strchr (field_start, '\0'); } else { @@ -1462,11 +1455,13 @@ process_line (char *line, bool newline) if (! process_field (next, field)) valid_number = false; - fputc ((delimiter == DELIMITER_DEFAULT) ? - ' ' : delimiter, stdout); + if (delimiter != nullptr) + fputs (delimiter, stdout); + else + fputc (' ', stdout); - if (delimiter != DELIMITER_DEFAULT) - line++; + if (delimiter) + line += MAX (strlen (delimiter), 1); else { *line = end_field; @@ -1573,10 +1568,17 @@ main (int argc, char **argv) case 'd': /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ - if (optarg[0] != '\0' && optarg[1] != '\0') - error (EXIT_FAILURE, 0, - _("the delimiter must be a single character")); - delimiter = optarg[0]; + if (optarg[0] != '\0') + { + mcel_t g = mcel_scanz (optarg); + /* Note we always allow single bytes, especially since mcel + explicitly does not avoid https://sourceware.org/PR29511 + I.e., we ignore g.err, and rely on g.len==1 with g.err. 
*/ + if (optarg[g.len] != '\0') + error (EXIT_FAILURE, 0, + _("the delimiter must be a single character")); + } + delimiter = optarg; break; case 'z': @@ -1642,7 +1644,7 @@ main (int argc, char **argv) && !grouping && (padding_width == 0) && (format_str == nullptr)) error (0, 0, _("no conversion option specified")); - if (debug && unit_separator && delimiter == DELIMITER_DEFAULT) + if (debug && unit_separator && delimiter == nullptr) error (0, 0, _("field delimiters have higher precedence than unit separators")); @@ -1657,7 +1659,7 @@ main (int argc, char **argv) error (0, 0, _("grouping has no effect in this locale")); } - auto_padding = (padding_width == 0 && delimiter == DELIMITER_DEFAULT); + auto_padding = (padding_width == 0 && delimiter == nullptr); if (inval_style != inval_abort) conv_exit_code = 0; diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl index 2f03efd1c..75de1a9f9 100755 --- a/tests/misc/numfmt.pl +++ b/tests/misc/numfmt.pl @@ -283,6 +283,9 @@ my @Tests = ['delim-4', '--delimiter=: --from=auto 40M:60M', {OUT=>'40000000:60M'}], ['delim-5', '-d: --field=2 --from=auto :40M:60M', {OUT=>':40000000:60M'}], ['delim-6', '-d: --field 3 --from=auto 40M:60M', {OUT=>"40M:60M"}], + # Ensure we don't hit https://sourceware.org/PR29511 + ['delim-7', "-d '\xc2' --field=2 --invalid=ignore '1\xc2\xb72K'", + {OUT => "1\xc2\xb72K"}], ['delim-err-1', '-d,, --to=si 1', {EXIT=>1}, {ERR => "$prog: the delimiter must be a single character\n"}], @@ -1187,6 +1190,10 @@ my @Locale_Tests = ['lcl-suf-11', "--field=2 '1 \xe2\x80\x832'", {OUT => "1 2"}, {ENV=>"LC_ALL=$locale"}], + # Support multi-byte delimiter + ['lcl-delim-1', "-d '\xc2\xb7' --field=2 --from=auto '1\xc2\xb72K'", + {OUT => "1\xc2\xb72000"}, {ENV=>"LC_ALL=$locale"}], + ); if ($locale ne 'C') { -- 2.51.0