From 8bc11f80a3eff1afec437383a63953e21a2063cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Tue, 14 Oct 2025 16:17:56 +0100 Subject: [PATCH 1/6] numfmt: support reading numbers with NBSP before unit * src/numfmt.c (simple_strtod_human): Accept (multi-byte) non-breaking space character between number and unit. Note we restrict this to a single character between number and unit, to allow less ambiguous parsing if multiple blanks are used to delimit fields. * tests/misc/numfmt.pl: Add test cases. * doc/coreutils.texi (numfmt invocation): Fix stale description of --delimiter skipping whitespace. * NEWS: Mention the improvement. --- NEWS | 2 ++ doc/coreutils.texi | 2 +- src/numfmt.c | 31 +++++++++++++++++++++---------- tests/misc/numfmt.pl | 23 +++++++++++++++++++++++ 4 files changed, 47 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index e6053a04b..40d443942 100644 --- a/NEWS +++ b/NEWS @@ -35,6 +35,8 @@ GNU coreutils NEWS -*- outline -*- ** Improvements + numfmt now parses numbers with a non-breaking space character before a unit. + wc -l now operates 10% faster on hosts that support AVX512 instructions. diff --git a/doc/coreutils.texi b/doc/coreutils.texi index 26c9209a3..b50e5f724 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -19447,7 +19447,7 @@ Print (to standard error) warning messages about possible erroneous usage. @itemx --delimiter=@var{d} @opindex -d @opindex --delimiter -Use the character @var{d} as input field separator (default: whitespace). +Use the character @var{d} as input field separator (default: newline or blank). Using non-default delimiter turns off automatic padding. 
@item --field=@var{fields} diff --git a/src/numfmt.c b/src/numfmt.c index 0cc12689e..fbf104b51 100644 --- a/src/numfmt.c +++ b/src/numfmt.c @@ -25,6 +25,7 @@ #include "argmatch.h" #include "c-ctype.h" #include "mbswidth.h" +#include "mcel.h" #include "quote.h" #include "skipchars.h" #include "system.h" @@ -210,6 +211,11 @@ static int decimal_point_length; /* debugging for developers. Enables devmsg(). */ static bool dev_debug = false; +static bool +newline_or_blank (mcel_t g) +{ + return g.ch == '\n' || c32isblank (g.ch); +} static inline int default_scale_base (enum scale_type scale) @@ -645,15 +651,23 @@ simple_strtod_human (char const *input_str, { /* process suffix. */ - /* Skip any blanks between the number and suffix. */ - while (isblank (to_uchar (**endptr))) - (*endptr)++; + /* Skip a single blank or NBSP between the number and suffix. */ + mcel_t g = mcel_scanz (*endptr); + if (c32isblank (g.ch) || c32isnbspace (g.ch)) + (*endptr) += g.len; if (**endptr == '\0') break; /* Treat as no suffix. */ if (!valid_suffix (**endptr)) - return SSE_INVALID_SUFFIX; + { + /* Trailing blanks are allowed. */ + *endptr = skip_str_matching (*endptr, newline_or_blank, true); + if (**endptr == '\0') + break; + + return SSE_INVALID_SUFFIX; + } if (allowed_scaling == scale_none) return SSE_VALID_BUT_FORBIDDEN_SUFFIX; @@ -680,6 +694,9 @@ simple_strtod_human (char const *input_str, *precision = 0; /* Reset, to select precision based on scale. */ + /* Trailing blanks are allowed. */ + *endptr = skip_str_matching (*endptr, newline_or_blank, true); + break; } @@ -1320,12 +1337,6 @@ process_suffixed_number (char *text, long double *result, return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS); } -static bool -newline_or_blank (mcel_t g) -{ - return g.ch == '\n' || c32isblank (g.ch); -} - /* Return a pointer to the beginning of the next field in line. The line pointer is moved to the end of the next field. 
*/ static char* diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl index 4dd9718c9..85c888cd8 100755 --- a/tests/misc/numfmt.pl +++ b/tests/misc/numfmt.pl @@ -164,6 +164,14 @@ my @Tests = '--suffix=Foo' . 'x' x 122 . 'y 0', {OUT => '0Foo' . 'x' x 122 . 'y'}], ['suf-21', "-d '' --from=si '4 '", {OUT => "4"}], + # Multiple spaces between number and suffix should be rejected + ['suf-22', "-d '' --from=auto '2 K'", + {ERR => "$prog: invalid suffix in input: '2 K'\n"}, + {EXIT => 2}], + # Trailing spaces should be accepted + ['suf-23', "-d '' --from=auto '2 '", {OUT=>'2'}], + ['suf-24', "-d '' --from=auto '2 '", {OUT=>'2'}], + ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}], ## GROUPING @@ -1067,6 +1075,21 @@ my @Locale_Tests = ['lcl-fmt-7', '--format="%0\'\'6f" 1234',{OUT=>"01${lg}234"}, {ENV=>"LC_ALL=$locale"}], + # Single blank/NBSP acceptance between number and suffix + ['lcl-suf-1', "-d '' --from=auto '2 K'", {OUT => "2000"}, + {ENV=>"LC_ALL=$locale"}], + ['lcl-suf-2', "-d '' --from=auto '2\tK'", {OUT => "2000"}, + {ENV=>"LC_ALL=$locale"}], + # NBSP characters: U+00A0, U+2007, U+202F, U+2060 + ['lcl-suf-3', "--from=auto '2\xc2\xa0K'", {OUT => "2000"}, + {ENV=>"LC_ALL=$locale"}], + ['lcl-suf-4', "--from=auto '2\xe2\x80\x87Ki'", {OUT => "2048"}, + {ENV=>"LC_ALL=$locale"}], + ['lcl-suf-5', "--from=auto '2\xe2\x80\xafK'", {OUT => "2000"}, + {ENV=>"LC_ALL=$locale"}], + ['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"}, + {ENV=>"LC_ALL=$locale"}], + ); if ($locale ne 'C') { -- 2.51.0