From 8bc11f80a3eff1afec437383a63953e21a2063cd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <P@draigBrady.com>
Date: Tue, 14 Oct 2025 16:17:56 +0100
Subject: [PATCH 1/6] numfmt: support reading numbers with NBSP before unit

* src/numfmt.c (simple_strtod_human): Accept (multi-byte)
non-breaking space character between number and unit.
Note we restrict this to a single character between number
and unit, to allow less ambiguous parsing if multiple blanks
are used to delimit fields.
* tests/misc/numfmt.pl: Add test cases.
* doc/coreutils.texi (numfmt invocation): Fix stale description
of --delimiter skipping whitespace.
* NEWS: Mention the improvement.
---
 NEWS                 |  2 ++
 doc/coreutils.texi   |  2 +-
 src/numfmt.c         | 31 +++++++++++++++++++++----------
 tests/misc/numfmt.pl | 23 +++++++++++++++++++++++
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/NEWS b/NEWS
index e6053a04b..40d443942 100644
--- a/NEWS
+++ b/NEWS
@@ -35,6 +35,8 @@ GNU coreutils NEWS                                    -*- outline -*-
 
 ** Improvements
 
+  numfmt now parses numbers with a non-breaking space character before a unit.
+
   wc -l now operates 10% faster on hosts that support AVX512 instructions.
 
 
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index 26c9209a3..b50e5f724 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19447,7 +19447,7 @@ Print (to standard error) warning messages about possible erroneous usage.
 @itemx --delimiter=@var{d}
 @opindex -d
 @opindex --delimiter
-Use the character @var{d} as input field separator (default: whitespace).
+Use the character @var{d} as input field separator (default: newline or blank).
 Using non-default delimiter turns off automatic padding.
 
 @item --field=@var{fields}
diff --git a/src/numfmt.c b/src/numfmt.c
index 0cc12689e..fbf104b51 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -25,6 +25,7 @@
 #include "argmatch.h"
 #include "c-ctype.h"
 #include "mbswidth.h"
+#include "mcel.h"
 #include "quote.h"
 #include "skipchars.h"
 #include "system.h"
@@ -210,6 +211,11 @@ static int decimal_point_length;
 /* debugging for developers.  Enables devmsg().  */
 static bool dev_debug = false;
 
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
 
 static inline int
 default_scale_base (enum scale_type scale)
@@ -645,15 +651,23 @@ simple_strtod_human (char const *input_str,
     {
       /* process suffix.  */
 
-      /* Skip any blanks between the number and suffix.  */
-      while (isblank (to_uchar (**endptr)))
-        (*endptr)++;
+      /* Skip a single blank or NBSP between the number and suffix.  */
+      mcel_t g = mcel_scanz (*endptr);
+      if (c32isblank (g.ch) || c32isnbspace (g.ch))
+        (*endptr) += g.len;
 
       if (**endptr == '\0')
         break;  /* Treat as no suffix.  */
 
       if (!valid_suffix (**endptr))
-        return SSE_INVALID_SUFFIX;
+        {
+          /* Trailing blanks are allowed.  */
+          *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+          if (**endptr == '\0')
+            break;
+
+          return SSE_INVALID_SUFFIX;
+        }
 
       if (allowed_scaling == scale_none)
         return SSE_VALID_BUT_FORBIDDEN_SUFFIX;
@@ -680,6 +694,9 @@ simple_strtod_human (char const *input_str,
 
       *precision = 0;  /* Reset, to select precision based on scale.  */
 
+      /* Trailing blanks are allowed.  */
+      *endptr = skip_str_matching (*endptr, newline_or_blank, true);
+
       break;
     }
 
@@ -1320,12 +1337,6 @@ process_suffixed_number (char *text, long double *result,
   return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
 }
 
-static bool
-newline_or_blank (mcel_t g)
-{
-  return g.ch == '\n' || c32isblank (g.ch);
-}
-
 /* Return a pointer to the beginning of the next field in line.
    The line pointer is moved to the end of the next field. */
 static char*
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 4dd9718c9..85c888cd8 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -164,6 +164,14 @@ my @Tests =
       '--suffix=Foo' . 'x' x 122 . 'y 0',
       {OUT => '0Foo' . 'x' x 122 . 'y'}],
      ['suf-21', "-d '' --from=si '4  '",         {OUT => "4"}],
+     # Multiple spaces between number and suffix should be rejected
+     ['suf-22', "-d '' --from=auto '2  K'",
+             {ERR => "$prog: invalid suffix in input: '2  K'\n"},
+             {EXIT => 2}],
+     # Trailing spaces should be accepted
+     ['suf-23', "-d '' --from=auto '2 '",  {OUT=>'2'}],
+     ['suf-24', "-d '' --from=auto '2  '", {OUT=>'2'}],
+     ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
 
      ## GROUPING
 
@@ -1067,6 +1075,21 @@ my @Locale_Tests =
      ['lcl-fmt-7', '--format="%0\'\'6f" 1234',{OUT=>"01${lg}234"},
              {ENV=>"LC_ALL=$locale"}],
 
+     # Single blank/NBSP acceptance between number and suffix
+     ['lcl-suf-1', "-d '' --from=auto '2 K'",      {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-2', "-d '' --from=auto '2\tK'",      {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     # NBSP characters: U+00A0, U+2007, U+202F, U+2060
+     ['lcl-suf-3', "--from=auto '2\xc2\xa0K'", {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-4', "--from=auto '2\xe2\x80\x87Ki'", {OUT => "2048"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-5', "--from=auto '2\xe2\x80\xafK'", {OUT => "2000"},
+             {ENV=>"LC_ALL=$locale"}],
+     ['lcl-suf-6', "--from=auto '2\xe2\x81\xa0Ki'", {OUT => "2048"},
+             {ENV=>"LC_ALL=$locale"}],
+
   );
 if ($locale ne 'C')
   {
-- 
2.51.0