From 0f7babad383547c182bd85e97ff497edd11f6c20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A1draig=20Brady?= Date: Thu, 9 Oct 2025 14:24:12 +0100 Subject: [PATCH 3/6] numfmt: add --unit-separator Output, accept, or disallow a string between the number and unit, as recommended by SI style conventions. I.e. support outputting numbers of the form: "1234 M" * src/numfmt.c (simple_strtod_human): Skip unit separator if present, or disallow a unit separator if empty. (double_to_human): Output unit separator if specified. (main): Accept --unit-separator. * tests/misc/numfmt.pl: Add test cases. * doc/coreutils.texi: Describe the new option, giving examples of interaction with --delimiter. * NEWS: Mention the new feature. * THANKS.in: Add Johannes Schauer Marin Rodrigues, who provided a preliminary patch. --- NEWS | 5 ++++ THANKS.in | 1 + doc/coreutils.texi | 17 +++++++++++ src/numfmt.c | 44 ++++++++++++++++++++++----- tests/misc/numfmt.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 131 insertions(+), 7 deletions(-) diff --git a/NEWS b/NEWS index a07fe298c..b34513271 100644 --- a/NEWS +++ b/NEWS @@ -33,6 +33,11 @@ GNU coreutils NEWS -*- outline -*- that use the GNU extension /NUM or +NUM formats. [bug introduced in coreutils-8.28] +** New Features + + 'numfmt' now accepts the --unit-separator=SEP option, to output or accept + a separator between the number and unit. E.g. "1234 M". 
+ ** Improvements numfmt now parses numbers with a non-breaking space character before a unit, diff --git a/THANKS.in b/THANKS.in index 8c97a8138..8f6af1b61 100644 --- a/THANKS.in +++ b/THANKS.in @@ -315,6 +315,7 @@ Joey Hess joeyh@debian.org Johan Boule bohan@bohan.dyndns.org Johan Danielsson joda@pdc.kth.se Johannes Altmanninger aclopte@gmail.com +Johannes Schauer Marin Rodrigues josch@debian.org John Bley jbb6@acpub.duke.edu John Gatewood Ham zappaman@alphabox.compsci.buu.ac.th John Gotts jgotts@umich.edu diff --git a/doc/coreutils.texi b/doc/coreutils.texi index b50e5f724..89534db72 100644 --- a/doc/coreutils.texi +++ b/doc/coreutils.texi @@ -19544,6 +19544,23 @@ the output numbers represent other units (e.g. to represent @samp{4,000,000} bytes in blocks of 1kB, use @samp{--to=si --to-unit=1000}). Suffixes are handled as with @samp{--from=auto}. +@item --unit-separator=@var{sep} +@opindex --unit-separator +Support a separator @var{sep} between the number and unit, +with @option{--from} or @option{--to} auto-scaled units. +By default a blank or non-breaking space character is accepted on input, +and no separator is printed on output. +When parsing input, the specified unit separator has lower precedence +than field delimiters. See the @option{--delimiter} option above. 
+ +Examples: +@example +Add a space on output: @option{--unit-separator=' '} +Disable blanks on input: @option{--unit-separator=''} +Support blanks on input: @option{--delimiter=''} +Ditto and output non-breaking space: @option{-d '' --unit-separator=$'\u00A0'} +@end example + @optZeroTerminated @newlineFieldSeparator diff --git a/src/numfmt.c b/src/numfmt.c index 1a744770f..26f918054 100644 --- a/src/numfmt.c +++ b/src/numfmt.c @@ -60,7 +60,8 @@ enum DEV_DEBUG_OPTION, HEADER_OPTION, FORMAT_OPTION, - INVALID_OPTION + INVALID_OPTION, + UNIT_SEPARATOR_OPTION }; enum scale_type @@ -140,6 +141,7 @@ static struct option const longopts[] = {"round", required_argument, nullptr, ROUND_OPTION}, {"padding", required_argument, nullptr, PADDING_OPTION}, {"suffix", required_argument, nullptr, SUFFIX_OPTION}, + {"unit-separator", required_argument, nullptr, UNIT_SEPARATOR_OPTION}, {"grouping", no_argument, nullptr, GROUPING_OPTION}, {"delimiter", required_argument, nullptr, 'd'}, {"field", required_argument, nullptr, FIELD_OPTION}, @@ -172,6 +174,7 @@ static enum scale_type scale_to = scale_none; static enum round_type round_style = round_from_zero; static enum inval_type inval_style = inval_abort; static char const *suffix = nullptr; +static char const *unit_separator = nullptr; static uintmax_t from_unit_size = 1; static uintmax_t to_unit_size = 1; static int grouping = 0; @@ -658,10 +661,24 @@ simple_strtod_human (char const *input_str, { /* process suffix. */ - /* Skip a single blank or NBSP between the number and suffix. */ - mcel_t g = mcel_scanz (*endptr); - if (c32isblank (g.ch) || c32isnbspace (g.ch)) - (*endptr) += g.len; + /* Skip a single blank, NBSP or specified unit separator. + Note an explicit empty --unit-sep should disable blank matching. 
*/ + bool matched_unit_sep = false; + if (unit_separator) + { + size_t sep_len = strlen (unit_separator); + if (STREQ_LEN (*endptr, unit_separator, sep_len)) + { + matched_unit_sep = true; + (*endptr) += sep_len; + } + } + if (!matched_unit_sep) + { + mcel_t g = mcel_scanz (*endptr); + if (c32isblank (g.ch) || c32isnbspace (g.ch)) + (*endptr) += g.len; + } if (**endptr == '\0') break; /* Treat as no suffix. */ @@ -768,7 +785,7 @@ double_to_human (long double val, int precision, char *buf, idx_t buf_size, enum scale_type scale, int group, enum round_type round) { - char fmt[sizeof "%'0.*Lfi%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)]; + char fmt[sizeof "%'0.*Lfi%s%s%s%s" + INT_STRLEN_BOUND (zero_padding_width)]; char *pfmt = fmt; *pfmt++ = '%'; @@ -835,11 +852,12 @@ double_to_human (long double val, int precision, devmsg (" after rounding, value=%Lf * %0.f ^ %d\n", val, scale_base, power); - strcpy (pfmt, ".*Lf%s%s%s"); + strcpy (pfmt, ".*Lf%s%s%s%s"); int prec = user_precision == -1 ? show_decimal_point : user_precision; return snprintf (buf, buf_size, fmt, prec, val, + (power > 0 && unit_separator) ? unit_separator : "", power == 1 && scale == scale_SI ? "k" : suffix_power_char (power), &"i"[! 
(scale == scale_IEC_I && 0 < power)], @@ -954,6 +972,10 @@ Reformat NUMBER(s), or the numbers from standard input if none are specified.\n\ fputs (_("\ --suffix=SUFFIX add SUFFIX to output numbers, and accept optional\n\ SUFFIX in input numbers\n\ +"), stdout); + fputs (_("\ + --unit-separator=SEP insert SEP between number and unit on output,\n\ + and accept optional SEP in input numbers\n\ "), stdout); fputs (_("\ --to=UNIT auto-scale output numbers to UNITs; see UNIT below\n\ @@ -1556,6 +1578,10 @@ main (int argc, char **argv) suffix = optarg; break; + case UNIT_SEPARATOR_OPTION: + unit_separator = optarg; + break; + case DEBUG_OPTION: debug = true; break; @@ -1607,6 +1633,10 @@ main (int argc, char **argv) && !grouping && (padding_width == 0) && (format_str == nullptr)) error (0, 0, _("no conversion option specified")); + if (debug && unit_separator && delimiter == DELIMITER_DEFAULT) + error (0, 0, + _("field delimiters have higher precedence than unit separators")); + if (format_str) parse_format_string (format_str); diff --git a/tests/misc/numfmt.pl b/tests/misc/numfmt.pl index 1d3c4202c..ff22c7303 100755 --- a/tests/misc/numfmt.pl +++ b/tests/misc/numfmt.pl @@ -173,6 +173,77 @@ my @Tests = ['suf-24', "-d '' --from=auto '2 '", {OUT=>'2'}], ['suf-25', "-d '' --from=auto '2K '", {OUT=>'2000'}], + ## Unit Separator + # Output with space separator + ['unit-sep-1', '--to=si --unit-separator=" " 1000', {OUT=>"1.0 k"}], + ['unit-sep-2', '--to=iec --unit-separator=" " 1024', {OUT=>"1.0 K"}], + ['unit-sep-3', '--to=iec-i --unit-separator=" " 2048', {OUT=>"2.0 Ki"}], + + # Output with multi-character separator + ['unit-sep-4', '--to=si --unit-separator="__" 1000', {OUT=>"1.0__k"}], + ['unit-sep-5', '--to=iec --unit-separator="::" 2048', {OUT=>"2.0::K"}], + + # Input with space separator + ['unit-sep-6', '-d "" --from=si --unit-sep=" " "1 K"', {OUT=>"1000"}], + ['unit-sep-7', '-d "" --from=iec --unit-sep=" " "2 M"', {OUT=>"2097152"}], + + # Input with multi-character 
separator + ['unit-sep-8', '-d "" --from=si --unit-separator=" "', + {IN_PIPE=>"1 K\n2 M\n3 G\n"}, + {OUT=>"1000\n2000000\n3000000000"}], + ['unit-sep-9', '--from=iec --unit-separator="'."\xC2\xA0".'"', + {IN_PIPE=>"4\xC2\xA0K\n"}, {OUT=>"4096"}], + ['unit-sep-10', '--from=iec --unit-separator="::"', + {IN_PIPE=>"4::K\n"}, {OUT=>"4096"}], + + # input with empty separator + ['unit-sep-11', '-d "" --from=si --unit-separator=""', + {IN_PIPE=>"1K\n2M\n3G\n"}, + {OUT=>"1000\n2000000\n3000000000"}], + ['unit-sep-12', '-d "" --from=si --unit-separator="" "1 K"', + {ERR=>"$prog: invalid suffix in input: '1 K'\n"}, + {EXIT=>2}], + + # Combined with suffix + ['unit-sep-13', '--to=si --unit-separator=" " --suffix=B 1000', + {OUT=>"1.0 kB"}], + ['unit-sep-14', '--to=si --unit-separator=" " --suffix=" B" 1000', + {OUT=>"1.0 k B"}], + ['unit-sep-15', '-d "" --from=si --unit-separator=" " --suffix=B', + {IN_PIPE=>"5 KB\n"}, {OUT=>"5000B"}], + + # No separator when there's no unit (power=0) + ['unit-sep-16', '--to=si --unit-separator=" " 500', {OUT=>"500"}], + + # Round-trip test + ['unit-sep-17', '--from=iec --to=iec --unit-separator="_"', + {IN_PIPE=>"1_K\n"}, {OUT=>"1.0_K"}], + + # Currently field delimiters have higher precedence than unit separators. + # Even if this is changed in future, the following should hold. 
+ + # The space should act as a field delimiter here + ['unit-sep-18', '--from=si --unit-separator=" " "1 K_Field2"', + {OUT=>"1 K_Field2"}], + # Same as above but with 'i' suffix - should split at space with --from=si + ['unit-sep-19', '--from=si --unit-separator=" " "5 Ki_Field2"', + {OUT=>"5 Ki_Field2"}], + # With --from=auto, Ki followed by invalid char should also split + ['unit-sep-20', '--from=auto --unit-separator=" " "5 Ki_Field2"', + {OUT=>"5 Ki_Field2"}], + # With custom delimiter, space before K should not be treated as delimiter + ['unit-sep-21', '-d: --from=si --unit-separator=" " "5 K:Field2"', + {OUT=>"5000:Field2"}], + # Fail case: space after K with custom delimiter should error + ['unit-sep-22-fail', '-d: --from=si --unit-separator=" " "5 K Field2"', + {ERR=>"$prog: invalid suffix in input '5 K Field2': 'Field2'\n"}, + {EXIT=>2}], + + # If the unit separator were consumed before the delimiter char, + # this would change to outputting "5000 2" + ['unit-sep-23', '--from=si --field=1 --unit-separator=" " -d " " "5 K 2"', + {OUT=>"5 K 2"}], + ## GROUPING # "C" locale - no grouping (locale-specific tests, below) -- 2.51.0