From 0f7babad383547c182bd85e97ff497edd11f6c20 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?P=C3=A1draig=20Brady?=
Date: Thu, 9 Oct 2025 14:24:12 +0100
Subject: [PATCH 3/6] numfmt: add --unit-separator
Output, accept, or disallow a string between the number and unit
as recommended by SI style conventions (a space between the number and unit).
I.e. support outputting numbers of the form: "1234 M"
* src/numfmt.c (simple_strtod_human): Skip unit separator if present,
or disallow a unit separator if empty.
(double_to_human): Output unit separator if specified.
(main): Accept --unit-separator.
* tests/misc/numfmt.pl: Add test cases.
* doc/coreutils.texi: Describe the new option,
giving examples of interaction with --delimiter.
* NEWS: Mention the new feature.
* THANKS.in: Add Johannes Schauer Marin Rodrigues,
who provided a preliminary patch.
---
NEWS | 5 ++++
THANKS.in | 1 +
doc/coreutils.texi | 17 +++++++++++
src/numfmt.c | 44 ++++++++++++++++++++++-----
tests/misc/numfmt.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++
5 files changed, 131 insertions(+), 7 deletions(-)
diff --git a/NEWS b/NEWS
index a07fe298c..b34513271 100644
--- a/NEWS
+++ b/NEWS
@@ -33,6 +33,11 @@ GNU coreutils NEWS -*- outline -*-
that use the GNU extension /NUM or +NUM formats.
[bug introduced in coreutils-8.28]
+** New Features
+
+ 'numfmt' now accepts the --unit-separator=SEP option, to output or accept
+ a separator between the number and unit. For example: "1234 M".
+
** Improvements
numfmt now parses numbers with a non-breaking space character before a unit,
diff --git a/THANKS.in b/THANKS.in
index 8c97a8138..8f6af1b61 100644
--- a/THANKS.in
+++ b/THANKS.in
@@ -315,6 +315,7 @@ Joey Hess joeyh@debian.org
Johan Boule bohan@bohan.dyndns.org
Johan Danielsson joda@pdc.kth.se
Johannes Altmanninger aclopte@gmail.com
+Johannes Schauer Marin Rodrigues josch@debian.org
John Bley jbb6@acpub.duke.edu
John Gatewood Ham zappaman@alphabox.compsci.buu.ac.th
John Gotts jgotts@umich.edu
diff --git a/doc/coreutils.texi b/doc/coreutils.texi
index b50e5f724..89534db72 100644
--- a/doc/coreutils.texi
+++ b/doc/coreutils.texi
@@ -19544,6 +19544,23 @@ the output numbers represent other units (e.g. to represent @samp{4,000,000}
bytes in blocks of 1kB, use @samp{--to=si --to-unit=1000}).
Suffixes are handled as with @samp{--from=auto}.
+@item --unit-separator=@var{sep}
+@opindex --unit-separator
+Support a separator @var{sep} between the number and unit,
+with @option{--from} or @option{--to} auto-scaled units.
+By default a blank or non-breaking space character is accepted on input,
+and no separator is printed on output.
+When parsing input, the specified unit separator has lower precedence
+than field delimiters. See the @option{--delimiter} option above.
+
+Examples:
+@example
+Add a space on output: @option{--unit-separator=' '}
+Disable blanks on input: @option{--unit-separator=''}
+Support blanks on input: @option{--delimiter=''}
+Ditto and output non-breaking space: @option{-d '' --unit-separator=$'\u00A0'}
+@end example
+
@optZeroTerminated
@newlineFieldSeparator
diff --git a/src/numfmt.c b/src/numfmt.c
index 1a744770f..26f918054 100644
--- a/src/numfmt.c
+++ b/src/numfmt.c
@@ -60,7 +60,8 @@ enum
DEV_DEBUG_OPTION,
HEADER_OPTION,
FORMAT_OPTION,
- INVALID_OPTION
+ INVALID_OPTION,
+ UNIT_SEPARATOR_OPTION
};
enum scale_type
@@ -140,6 +141,7 @@ static struct option const longopts[] =
{"round", required_argument, nullptr, ROUND_OPTION},
{"padding", required_argument, nullptr, PADDING_OPTION},
{"suffix", required_argument, nullptr, SUFFIX_OPTION},
+ {"unit-separator", required_argument, nullptr, UNIT_SEPARATOR_OPTION},
{"grouping", no_argument, nullptr, GROUPING_OPTION},
{"delimiter", required_argument, nullptr, 'd'},
{"field", required_argument, nullptr, FIELD_OPTION},
@@ -172,6 +174,7 @@ static enum scale_type scale_to = scale_none;
static enum round_type round_style = round_from_zero;
static enum inval_type inval_style = inval_abort;
static char const *suffix = nullptr;
+static char const *unit_separator = nullptr;
static uintmax_t from_unit_size = 1;
static uintmax_t to_unit_size = 1;
static int grouping = 0;
@@ -658,10 +661,24 @@ simple_strtod_human (char const *input_str,
{
/* process suffix. */
- /* Skip a single blank or NBSP between the number and suffix. */
- mcel_t g = mcel_scanz (*endptr);
- if (c32isblank (g.ch) || c32isnbspace (g.ch))
- (*endptr) += g.len;
+ /* Skip a single blank, NBSP or specified unit separator.
+ Note an explicit empty --unit-sep should disable blank matching. */
+ bool matched_unit_sep = false;
+ if (unit_separator)
+ {
+ size_t sep_len = strlen (unit_separator);
+ if (STREQ_LEN (*endptr, unit_separator, sep_len))
+ {
+ matched_unit_sep = true;
+ (*endptr) += sep_len;
+ }
+ }
+ if (!matched_unit_sep)
+ {
+ mcel_t g = mcel_scanz (*endptr);
+ if (c32isblank (g.ch) || c32isnbspace (g.ch))
+ (*endptr) += g.len;
+ }
if (**endptr == '\0')
break; /* Treat as no suffix. */
@@ -768,7 +785,7 @@ double_to_human (long double val, int precision,
char *buf, idx_t buf_size,
enum scale_type scale, int group, enum round_type round)
{
- char fmt[sizeof "%'0.*Lfi%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
+ char fmt[sizeof "%'0.*Lfi%s%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)];
char *pfmt = fmt;
*pfmt++ = '%';
@@ -835,11 +852,12 @@ double_to_human (long double val, int precision,
devmsg (" after rounding, value=%Lf * %0.f ^ %d\n", val, scale_base, power);
- strcpy (pfmt, ".*Lf%s%s%s");
+ strcpy (pfmt, ".*Lf%s%s%s%s");
int prec = user_precision == -1 ? show_decimal_point : user_precision;
return snprintf (buf, buf_size, fmt, prec, val,
+ (power > 0 && unit_separator) ? unit_separator : "",
power == 1 && scale == scale_SI
? "k" : suffix_power_char (power),
&"i"[! (scale == scale_IEC_I && 0 < power)],
@@ -954,6 +972,10 @@ Reformat NUMBER(s), or the numbers from standard input if none are specified.\n\
fputs (_("\
--suffix=SUFFIX add SUFFIX to output numbers, and accept optional\n\
SUFFIX in input numbers\n\
+"), stdout);
+ fputs (_("\
+ --unit-separator=SEP insert SEP between number and unit on output,\n\
+ and accept optional SEP in input numbers\n\
"), stdout);
fputs (_("\
--to=UNIT auto-scale output numbers to UNITs; see UNIT below\n\
@@ -1556,6 +1578,10 @@ main (int argc, char **argv)
suffix = optarg;
break;
+ case UNIT_SEPARATOR_OPTION:
+ unit_separator = optarg;
+ break;
+
case DEBUG_OPTION:
debug = true;
break;
@@ -1607,6 +1633,10 @@ main (int argc, char **argv)
&& !grouping && (padding_width == 0) && (format_str == nullptr))
error (0, 0, _("no conversion option specified"));
+ if (debug && unit_separator && delimiter == DELIMITER_DEFAULT)
+ error (0, 0,
+ _("field delimiters have higher precedence than unit separators"));
+
if (format_str)
parse_format_string (format_str);
diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl
index 1d3c4202c..ff22c7303 100755
--- a/tests/misc/numfmt.pl
+++ b/tests/misc/numfmt.pl
@@ -173,6 +173,77 @@ my @Tests =
['suf-24', "-d '' --from=auto '2 '", {OUT=>'2'}],
['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}],
+ ## Unit Separator
+ # Output with space separator
+ ['unit-sep-1', '--to=si --unit-separator=" " 1000', {OUT=>"1.0 k"}],
+ ['unit-sep-2', '--to=iec --unit-separator=" " 1024', {OUT=>"1.0 K"}],
+ ['unit-sep-3', '--to=iec-i --unit-separator=" " 2048', {OUT=>"2.0 Ki"}],
+
+ # Output with multi-character separator
+ ['unit-sep-4', '--to=si --unit-separator="__" 1000', {OUT=>"1.0__k"}],
+ ['unit-sep-5', '--to=iec --unit-separator="::" 2048', {OUT=>"2.0::K"}],
+
+ # Input with space separator
+ ['unit-sep-6', '-d "" --from=si --unit-sep=" " "1 K"', {OUT=>"1000"}],
+ ['unit-sep-7', '-d "" --from=iec --unit-sep=" " "2 M"', {OUT=>"2097152"}],
+
+ # Input with multi-character separator
+ ['unit-sep-8', '-d "" --from=si --unit-separator=" "',
+ {IN_PIPE=>"1 K\n2 M\n3 G\n"},
+ {OUT=>"1000\n2000000\n3000000000"}],
+ ['unit-sep-9', '--from=iec --unit-separator="'."\xC2\xA0".'"',
+ {IN_PIPE=>"4\xC2\xA0K\n"}, {OUT=>"4096"}],
+ ['unit-sep-10', '--from=iec --unit-separator="::"',
+ {IN_PIPE=>"4::K\n"}, {OUT=>"4096"}],
+
+ # Input with empty separator
+ ['unit-sep-11', '-d "" --from=si --unit-separator=""',
+ {IN_PIPE=>"1K\n2M\n3G\n"},
+ {OUT=>"1000\n2000000\n3000000000"}],
+ ['unit-sep-12', '-d "" --from=si --unit-separator="" "1 K"',
+ {ERR=>"$prog: invalid suffix in input: '1 K'\n"},
+ {EXIT=>2}],
+
+ # Combined with suffix
+ ['unit-sep-13', '--to=si --unit-separator=" " --suffix=B 1000',
+ {OUT=>"1.0 kB"}],
+ ['unit-sep-14', '--to=si --unit-separator=" " --suffix=" B" 1000',
+ {OUT=>"1.0 k B"}],
+ ['unit-sep-15', '-d "" --from=si --unit-separator=" " --suffix=B',
+ {IN_PIPE=>"5 KB\n"}, {OUT=>"5000B"}],
+
+ # No separator when there's no unit (power=0)
+ ['unit-sep-16', '--to=si --unit-separator=" " 500', {OUT=>"500"}],
+
+ # Round-trip test
+ ['unit-sep-17', '--from=iec --to=iec --unit-separator="_"',
+ {IN_PIPE=>"1_K\n"}, {OUT=>"1.0_K"}],
+
+ # Currently field delimiters have higher precedence than unit separators.
+ # Even if this is changed in future, the following should hold.
+
+ # The space should act as a field delimiter here
+ ['unit-sep-18', '--from=si --unit-separator=" " "1 K_Field2"',
+ {OUT=>"1 K_Field2"}],
+ # Same as above but with 'i' suffix - should split at space with --from=si
+ ['unit-sep-19', '--from=si --unit-separator=" " "5 Ki_Field2"',
+ {OUT=>"5 Ki_Field2"}],
+ # With --from=auto, Ki followed by invalid char should also split
+ ['unit-sep-20', '--from=auto --unit-separator=" " "5 Ki_Field2"',
+ {OUT=>"5 Ki_Field2"}],
+ # With custom delimiter, space after K should not be treated as delimiter
+ ['unit-sep-21', '-d: --from=si --unit-separator=" " "5 K:Field2"',
+ {OUT=>"5000:Field2"}],
+ # Fail case: space after K with custom delimiter should error
+ ['unit-sep-22-fail', '-d: --from=si --unit-separator=" " "5 K Field2"',
+ {ERR=>"$prog: invalid suffix in input '5 K Field2': 'Field2'\n"},
+ {EXIT=>2}],
+
+ # If Unit separator consumed before delimiter char,
+ # this would change to outputting "5000 2"
+ ['unit-sep-23', '--from=si --field=1 --unit-separator=" " -d " " "5 K 2"',
+ {OUT=>"5 K 2"}],
+
## GROUPING
# "C" locale - no grouping (locale-specific tests, below)
--
2.51.0