From 0f7babad383547c182bd85e97ff497edd11f6c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?= <P@draigBrady.com>
Date: Thu, 9 Oct 2025 14:24:12 +0100
Subject: [PATCH 3/6] numfmt: add --unit-separator

Output, accept, or disallow a string between the number and unit
as recommended in <https://physics.nist.gov/cuu/Units/checklist.html>.
I.e., support outputting numbers of the form: "1234 M".

* src/numfmt.c (simple_strtod_human): Skip unit separator if present,
or disallow a unit separator if empty.
(double_to_human): Output unit separator if specified.
(main): Accept --unit-separator.
* tests/misc/numfmt.pl: Add test cases.
* doc/coreutils.texi: Describe the new option,
giving examples of interaction with --delimiter.
* NEWS: Mention the new feature.
* THANKS.in: Add Johannes Schauer Marin Rodrigues,
who provided a preliminary patch.
---
 NEWS                 |  5 ++++
 THANKS.in            |  1 +
 doc/coreutils.texi   | 17 +++++++++++
 src/numfmt.c         | 44 ++++++++++++++++++++++-----
 tests/misc/numfmt.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 131 insertions(+), 7 deletions(-)

diff --git a/NEWS b/NEWS
index a07fe298c..b34513271 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,11 @@ GNU coreutils NEWS                                    -*- outline -*-
   that use the GNU extension /NUM or +NUM formats.
   [bug introduced in coreutils-8.28]
 
+** New Features
+
+  'numfmt' now accepts the --unit-separator=SEP option, to output or accept
+  a separator between the number and unit.  For example: "1234 M".
+
 ** Improvements
 
   numfmt now parses numbers with a non-breaking space character before a unit,
diff --git a/THANKS.in b/THANKS.in
index 8c97a8138..8f6af1b61 100644
--- a/THANKS.in
+++ b/THANKS.in
@@ -315,6 +315,7 @@ Joey Hess                           joeyh@debian.org
 Johan Boule                         bohan@bohan.dyndns.org
 Johan Danielsson                    joda@pdc.kth.se
 Johannes Altmanninger               aclopte@gmail.com
+Johannes Schauer Marin Rodrigues    josch@debian.org
 John Bley                           jbb6@acpub.duke.edu
 John Gatewood Ham                   zappaman@alphabox.compsci.buu.ac.th
 John Gotts                          jgotts@umich.edu
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index b50e5f724..89534db72 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19544,6 +19544,23 @@ the output numbers represent other units (e.g. to represent @samp{4,000,000}
 bytes in blocks of 1kB, use @samp{--to=si --to-unit=1000}).
 Suffixes are handled as with @samp{--from=auto}.
 
+@item --unit-separator=@var{sep}
+@opindex --unit-separator
+Support a separator @var{sep} between the number and unit,
+with @option{--from} or @option{--to} auto-scaled units.
+By default a blank or non-breaking space character is accepted on input,
+and no separator is printed on output.
+When parsing input, the specified unit separator has lower precedence
+than field delimiters.  See the @option{--delimiter} option above.
+
+Examples:
+@example
+Add a space on output: @option{--unit-separator=' '}
+Disable blanks on input: @option{--unit-separator=''}
+Support blanks on input: @option{--delimiter=''}
+Ditto and output non-breaking space: @option{-d '' --unit-separator=$'\u00A0'}
+@end example
+
 @optZeroTerminated
 @newlineFieldSeparator
 
diff --git a/src/numfmt.c b/src/numfmt.c
index 1a744770f..26f918054 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -60,7 +60,8 @@ enum
   DEV_DEBUG_OPTION,
   HEADER_OPTION,
   FORMAT_OPTION,
-  INVALID_OPTION
+  INVALID_OPTION,
+  UNIT_SEPARATOR_OPTION
 };
 
 enum scale_type
@@ -140,6 +141,7 @@ static struct option const longopts[] =
   {"round", required_argument, nullptr, ROUND_OPTION},
   {"padding", required_argument, nullptr, PADDING_OPTION},
   {"suffix", required_argument, nullptr, SUFFIX_OPTION},
+  {"unit-separator", required_argument, nullptr, UNIT_SEPARATOR_OPTION},
   {"grouping", no_argument, nullptr, GROUPING_OPTION},
   {"delimiter", required_argument, nullptr, 'd'},
   {"field", required_argument, nullptr, FIELD_OPTION},
@@ -172,6 +174,7 @@ static enum scale_type scale_to = scale_none;
 static enum round_type round_style = round_from_zero;
 static enum inval_type inval_style = inval_abort;
 static char const *suffix = nullptr;
+static char const *unit_separator = nullptr;
 static uintmax_t from_unit_size = 1;
 static uintmax_t to_unit_size = 1;
 static int grouping = 0;
@@ -658,10 +661,24 @@ simple_strtod_human (char const *input_str,
     {
       /* process suffix.  */
 
-      /* Skip a single blank or NBSP between the number and suffix.  */
-      mcel_t g = mcel_scanz (*endptr);
-      if (c32isblank (g.ch) || c32isnbspace (g.ch))
-        (*endptr) += g.len;
+      /* Skip a single blank, NBSP or specified unit separator.
+         Note an explicit empty --unit-sep should disable blank matching. */
+      bool matched_unit_sep = false;
+      if (unit_separator)
+        {
+          size_t sep_len = strlen (unit_separator);
+          if (STREQ_LEN (*endptr, unit_separator, sep_len))
+            {
+              matched_unit_sep = true;
+              (*endptr) += sep_len;
+            }
+        }
+      if (!matched_unit_sep)
+        {
+          mcel_t g = mcel_scanz (*endptr);
+          if (c32isblank (g.ch) || c32isnbspace (g.ch))
+            (*endptr) += g.len;
+        }
 
       if (**endptr == '\0')
         break;  /* Treat as no suffix.  */
@@ -768,7 +785,7 @@ double_to_human (long double val, int precision,
                  char *buf, idx_t buf_size,
                  enum scale_type scale, int group, enum round_type round)
 {
-  char fmt[sizeof "%'0.*Lfi%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
+  char fmt[sizeof "%'0.*Lfi%s%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
   char *pfmt = fmt;
   *pfmt++ = '%';
 
@@ -835,11 +852,12 @@ double_to_human (long double val, int precision,
 
   devmsg ("  after rounding, value=%Lf * %0.f ^ %d\n", val, scale_base, power);
 
-  strcpy (pfmt, ".*Lf%s%s%s");
+  strcpy (pfmt, ".*Lf%s%s%s%s");
 
   int prec = user_precision == -1 ? show_decimal_point : user_precision;
 
   return snprintf (buf, buf_size, fmt, prec, val,
+                   (power > 0 && unit_separator) ? unit_separator : "",
                    power == 1 && scale == scale_SI
                    ? "k" : suffix_power_char (power),
                    &"i"[! (scale == scale_IEC_I && 0 < power)],
@@ -954,6 +972,10 @@ Reformat NUMBER(s), or the numbers from standard input if none are specified.\n\
       fputs (_("\
       --suffix=SUFFIX  add SUFFIX to output numbers, and accept optional\n\
                          SUFFIX in input numbers\n\
+"), stdout);
+      fputs (_("\
+      --unit-separator=SEP  insert SEP between number and unit on output,\n\
+                         and accept optional SEP in input numbers\n\
 "), stdout);
       fputs (_("\
       --to=UNIT        auto-scale output numbers to UNITs; see UNIT below\n\
@@ -1556,6 +1578,10 @@ main (int argc, char **argv)
           suffix = optarg;
           break;
 
+        case UNIT_SEPARATOR_OPTION:
+          unit_separator = optarg;
+          break;
+
         case DEBUG_OPTION:
           debug = true;
           break;
@@ -1607,6 +1633,10 @@ main (int argc, char **argv)
       && !grouping && (padding_width == 0) && (format_str == nullptr))
     error (0, 0, _("no conversion option specified"));
 
+  if (debug && unit_separator && delimiter == DELIMITER_DEFAULT)
+    error (0, 0,
+           _("field delimiters have higher precedence than unit separators"));
+
   if (format_str)
     parse_format_string (format_str);
 
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 1d3c4202c..ff22c7303 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -173,6 +173,77 @@ my @Tests =
      ['suf-24', "-d '' --from=auto '2  '", {OUT=>'2'}],
      ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
 
+     ## Unit Separator
+     # Output with space separator
+     ['unit-sep-1', '--to=si --unit-separator=" " 1000',  {OUT=>"1.0 k"}],
+     ['unit-sep-2', '--to=iec --unit-separator=" " 1024', {OUT=>"1.0 K"}],
+     ['unit-sep-3', '--to=iec-i --unit-separator=" " 2048', {OUT=>"2.0 Ki"}],
+
+     # Output with multi-character separator
+     ['unit-sep-4', '--to=si --unit-separator="__" 1000', {OUT=>"1.0__k"}],
+     ['unit-sep-5', '--to=iec --unit-separator="::" 2048', {OUT=>"2.0::K"}],
+
+     # Input with space separator
+     ['unit-sep-6', '-d "" --from=si --unit-sep=" " "1 K"', {OUT=>"1000"}],
+     ['unit-sep-7', '-d "" --from=iec --unit-sep=" " "2 M"', {OUT=>"2097152"}],
+
+     # Input with multi-character or multi-byte separator
+     ['unit-sep-8', '-d "" --from=si --unit-separator="  "',
+      {IN_PIPE=>"1  K\n2  M\n3  G\n"},
+      {OUT=>"1000\n2000000\n3000000000"}],
+     ['unit-sep-9', '--from=iec --unit-separator="'."\xC2\xA0".'"',
+      {IN_PIPE=>"4\xC2\xA0K\n"}, {OUT=>"4096"}],
+     ['unit-sep-10', '--from=iec --unit-separator="::"',
+      {IN_PIPE=>"4::K\n"}, {OUT=>"4096"}],
+
+     # Input with empty separator
+     ['unit-sep-11', '-d "" --from=si --unit-separator=""',
+      {IN_PIPE=>"1K\n2M\n3G\n"},
+      {OUT=>"1000\n2000000\n3000000000"}],
+     ['unit-sep-12', '-d "" --from=si --unit-separator="" "1 K"',
+      {ERR=>"$prog: invalid suffix in input: '1 K'\n"},
+      {EXIT=>2}],
+
+     # Combined with suffix
+     ['unit-sep-13', '--to=si --unit-separator=" " --suffix=B 1000',
+      {OUT=>"1.0 kB"}],
+     ['unit-sep-14', '--to=si --unit-separator=" " --suffix=" B" 1000',
+      {OUT=>"1.0 k B"}],
+     ['unit-sep-15', '-d "" --from=si --unit-separator=" " --suffix=B',
+      {IN_PIPE=>"5 KB\n"}, {OUT=>"5000B"}],
+
+     # No separator when there's no unit (power=0)
+     ['unit-sep-16', '--to=si --unit-separator=" " 500', {OUT=>"500"}],
+
+     # Round-trip test
+     ['unit-sep-17', '--from=iec --to=iec --unit-separator="_"',
+      {IN_PIPE=>"1_K\n"}, {OUT=>"1.0_K"}],
+
+     # Currently field delimiters have higher precedence than unit separators.
+     # Even if this is changed in future, the following should hold.
+
+     # The space should act as a field delimiter here
+     ['unit-sep-18', '--from=si --unit-separator=" " "1 K_Field2"',
+      {OUT=>"1 K_Field2"}],
+     # Same as above but with 'i' suffix - should split at space with --from=si
+     ['unit-sep-19', '--from=si --unit-separator=" " "5 Ki_Field2"',
+      {OUT=>"5 Ki_Field2"}],
+     # With --from=auto, Ki followed by invalid char should also split
+     ['unit-sep-20', '--from=auto --unit-separator=" " "5 Ki_Field2"',
+      {OUT=>"5 Ki_Field2"}],
+     # With custom delimiter, space after K should not be treated as delimiter
+     ['unit-sep-21', '-d: --from=si --unit-separator=" " "5 K:Field2"',
+      {OUT=>"5000:Field2"}],
+     # Fail case: space after K with custom delimiter should error
+     ['unit-sep-22-fail', '-d: --from=si --unit-separator=" " "5 K Field2"',
+      {ERR=>"$prog: invalid suffix in input '5 K Field2': 'Field2'\n"},
+      {EXIT=>2}],
+
+     # If Unit separator consumed before delimiter char,
+     # this would change to outputting "5000 2"
+     ['unit-sep-23', '--from=si --field=1 --unit-separator=" " -d " " "5 K 2"',
+      {OUT=>"5 K 2"}],
+
      ## GROUPING
 
      # "C" locale - no grouping (locale-specific tests, below)
-- 
2.51.0