join,uniq: support multi-byte separators

* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth.  All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int.  All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code will
need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh.
2026-01-26 15:29:07 +00:00 · 2023-10-30 00:32:51 -07:00 · 2023-10-30 00:32:51 -07:00 · 11b01fc21f
commit 11b01fc21f
parent 2709bea0f4
14 changed files with 244 additions and 128 deletions
--- a/NEWS
+++ b/NEWS
@ -8,6 +8,11 @@ GNU coreutils NEWS                                    -*- outline -*-
  to preserve ownership" when copying to GNU/Linux CIFS file systems.
  They do this by working around some Linux CIFS bugs.

+  join and uniq now support multi-byte characters better.
+  For example, 'join -tX' now works even if X is a multi-byte character,
+  and both programs now treat multi-byte characters like U+3000
+  IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
+
  numfmt options like --suffix no longer have an arbitrary 127-byte limit.
  [bug introduced with numfmt in coreutils-8.21]

--- a/bootstrap.conf
+++ b/bootstrap.conf
@ -70,7 +70,6 @@ gnulib_modules="
  crypto/sha256
  crypto/sha512
  crypto/sm3
-  cu-ctype
  cycle-check
  d-ino
  d-type
@ -241,6 +240,7 @@ gnulib_modules="
  settime
  sig2str
  sigaction
+  skipchars
  smack
  ssize_t
  stat-macros
--- a/gl/lib/cu-ctype.c
+++ b/gl/lib/cu-ctype.c
@ -1,3 +0,0 @@
-#include <config.h>
-#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
-#include <cu-ctype.h>
--- a/gl/lib/cu-ctype.h
+++ b/gl/lib/cu-ctype.h
@ -1,35 +0,0 @@
-/* Character type definitions for coreutils
-
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This program is free software: you can redistribute it and/or modify
-   it under the terms of the GNU General Public License as published by
-   the Free Software Foundation, either version 3 of the License, or
-   (at your option) any later version.
-
-   This program is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU General Public License for more details.
-
-   You should have received a copy of the GNU General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-#include <ctype.h>
-
-#ifndef _GL_INLINE_HEADER_BEGIN
-# error "Please include config.h first."
-#endif
-_GL_INLINE_HEADER_BEGIN
-#ifndef CU_CTYPE_INLINE
-# define CU_CTYPE_INLINE _GL_INLINE
-#endif
-
-/* '\n' is considered a field separator with  --zero-terminated.  */
-CU_CTYPE_INLINE bool
-field_sep (unsigned char ch)
-{
-  return isblank (ch) || ch == '\n';
-}
-
-_GL_INLINE_HEADER_END
--- a/gl/lib/skipchars.c
+++ b/gl/lib/skipchars.c
@ -0,0 +1,3 @@
+#include <config.h>
+#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
+#include <skipchars.h>
--- a/gl/lib/skipchars.h
+++ b/gl/lib/skipchars.h
@ -0,0 +1,56 @@
+/* Skipping sequences of characters satisfying a predicate
+
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include "mcel.h"
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef SKIPCHARS_INLINE
+# define SKIPCHARS_INLINE _GL_INLINE
+#endif
+
+/* Return the address just past the leading sequence of possibly
+   multi-byte characters or encoding errors G in STR that satisfy
+   PREDICATE (G) if OK is true, or that do not satisfy the predicate
+   call if OK is false.  */
+
+SKIPCHARS_INLINE char *
+skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = str;
+  for (mcel_t g; *s && predicate (g = mcel_scanz (s)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+/* Return the address just past the leading sequence of possibly
+   multi-byte characters or encoding errors G in BUF (which ends at LIM)
+   that satisfy PREDICATE (G) if OK is true, or that do not satisfy
+   the predicate call if OK is false.  */
+
+SKIPCHARS_INLINE char *
+skip_buf_matching (char const *buf, char const *lim,
+                   bool (*predicate) (mcel_t), bool ok)
+{
+  char const *s = buf;
+  for (mcel_t g; s < lim && predicate (g = mcel_scan (s, lim)) == ok;
+       s += g.len)
+    continue;
+  return (char *) s;
+}
+
+_GL_INLINE_HEADER_END
--- a/gl/modules/cu-ctype
+++ b/gl/modules/cu-ctype
@ -1,24 +0,0 @@
-Description:
-ctype.h-like definitions for coreutils
-
-Files:
-lib/cu-ctype.c
-lib/cu-ctype.h
-
-Depends-on:
-ctype
-extern-inline
-
-configure.ac:
-
-Makefile.am:
-lib_SOURCES += cu-ctype.c
-
-Include:
-"cu-ctype.h"
-
-License:
-GPL
-
-Maintainer:
-all
--- a/gl/modules/skipchars
+++ b/gl/modules/skipchars
@ -0,0 +1,24 @@
+Description:
+Skip sequences of multi-byte characters or encoding errors
+
+Files:
+lib/skipchars.c
+lib/skipchars.h
+
+Depends-on:
+extern-inline
+mcel
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += skipchars.c
+
+Include:
+"skipchars.h"
+
+License:
+GPL
+
+Maintainer:
+all
--- a/src/join.c
+++ b/src/join.c
@ -23,12 +23,13 @@

 #include "system.h"
 #include "assure.h"
-#include "cu-ctype.h"
 #include "fadvise.h"
 #include "hard-locale.h"
 #include "linebuffer.h"
+#include "mcel.h"
 #include "memcasecmp.h"
 #include "quote.h"
+#include "skipchars.h"
 #include "stdio--.h"
 #include "xmemcoll.h"
 #include "xstrtol.h"
@ -135,10 +136,14 @@ static struct outlist outlist_head;
 /* Last element in 'outlist', where a new element can be added.  */
 static struct outlist *outlist_end = &outlist_head;

-/* Tab character separating fields.  If negative, fields are separated
-   by any nonempty string of blanks, otherwise by exactly one
-   tab character whose value (when cast to unsigned char) equals TAB.  */
-static int tab = -1;
+/* Tab character (or encoding error) separating fields.  If TAB.len == 0,
+   fields are separated by any nonempty string of blanks, otherwise by
+   exactly one tab character (or encoding error) equal to TAB.  */
+static mcel_t tab;
+
+/* The output separator to use, and its length in bytes.  */
+static char const *output_separator = " ";
+static idx_t output_seplen = 1;

 /* If nonzero, check that the input is correctly ordered. */
 static enum
@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
  ++(line->nfields);
 }

+static bool
+eq_tab (mcel_t g)
+{
+  return mcel_cmp (g, tab) == 0;
+}
+
+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
 /* Fill in the 'fields' structure in LINE.  */

 static void
@ -278,34 +295,29 @@ xfields (struct line *line)
  if (ptr == lim)
    return;

-  if (0 <= tab && tab != '\n')
+  if (!tab.len)
    {
-      char *sep;
-      for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
-        extract_field (line, ptr, sep - ptr);
-    }
-  else if (tab < 0)
-    {
-      /* Skip leading blanks before the first field.  */
-      while (field_sep (*ptr))
-        if (++ptr == lim)
-          return;
-
-      do
+      while (ptr < lim)
        {
-          char *sep;
-          for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
-            continue;
+          ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
+          if (ptr == lim)
+            break;  /* Only blanks remained; a NUL byte is ordinary data.  */
+          char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
          extract_field (line, ptr, sep - ptr);
-          if (sep == lim)
-            return;
-          for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
-            continue;
+          ptr = sep;
        }
-      while (ptr != lim);
    }
+  else
+    {
+      if (tab.ch != '\n')
+        for (char *sep;
+             ((sep = skip_buf_matching (ptr, lim, eq_tab, false))
+              < lim);
+             ptr = sep + mcel_scan (sep, lim).len)
+          extract_field (line, ptr, sep - ptr);

-  extract_field (line, ptr, lim - ptr);
+      extract_field (line, ptr, lim - ptr);
+    }
 }

 static void
@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
 {
  idx_t i;
  idx_t nfields = autoformat ? autocount : line->nfields;
-  char output_separator = tab < 0 ? ' ' : tab;

  for (i = 0; i < join_field && i < nfields; ++i)
    {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
      prfield (i, line);
    }
  for (i = join_field + 1; i < nfields; ++i)
    {
-      putchar (output_separator);
+      fwrite (output_separator, 1, output_seplen, stdout);
      prfield (i, line);
    }
 }
@ -588,7 +599,6 @@ static void
 prjoin (struct line const *line1, struct line const *line2)
 {
  const struct outlist *outlist;
-  char output_separator = tab < 0 ? ' ' : tab;
  idx_t field;
  struct line const *line;

@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
          o = o->next;
          if (o == nullptr)
            break;
-          putchar (output_separator);
+          fwrite (output_separator, 1, output_seplen, stdout);
        }
      putchar (eolchar);
    }
@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
    }
 }

+static bool
+comma_or_blank (mcel_t g)
+{
+  return g.ch == ',' || c32isblank (g.ch);
+}
+
 /* Add the comma or blank separated field spec(s) in STR to 'outlist'.  */

 static void
@ -898,14 +914,17 @@ add_field_list (char *str)
      int file_index;
      idx_t field_index;
      char const *spec_item = p;
-
-      p = strpbrk (p, ", \t");
-      if (p)
-        *p++ = '\0';
+      p = skip_str_matching (spec_item, comma_or_blank, false);
+      if (*p)
+        {
+          mcel_t g = mcel_scanz (p);
+          *p = '\0';
+          p += g.len;
+        }
      decode_field_spec (spec_item, &file_index, &field_index);
      add_field (file_index, field_index);
    }
-  while (p);
+  while (*p);
 }

 /* Set the join field *VAR to VAL, but report an error if *VAR is set
@ -1087,20 +1106,30 @@ main (int argc, char **argv)

        case 't':
          {
-            unsigned char newtab = optarg[0];
-            if (! newtab)
-              newtab = '\n'; /* '' => process the whole line.  */
-            else if (optarg[1])
+            mcel_t newtab;
+            if (!*optarg)
              {
-                if (STREQ (optarg, "\\0"))
-                  newtab = '\0';
-                else
+                /* '' => process the whole line.  */
+                newtab = mcel_ch ('\n', 1);
+                /* output_separator does not matter.  */
+              }
+            else if (STREQ (optarg, "\\0"))
+              {
+                newtab = mcel_ch ('\0', 1);
+                output_separator = "";
+              }
+            else
+              {
+                newtab = mcel_scanz (optarg);
+                if (optarg[newtab.len])
                  error (EXIT_FAILURE, 0, _("multi-character tab %s"),
                         quote (optarg));
+                output_separator = optarg;
              }
-            if (0 <= tab && tab != newtab)
+            if (tab.len && mcel_cmp (tab, newtab) != 0)
              error (EXIT_FAILURE, 0, _("incompatible tabs"));
            tab = newtab;
+            output_seplen = newtab.len;
          }
          break;

--- a/src/numfmt.c
+++ b/src/numfmt.c
@ -15,6 +15,7 @@
   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */

 #include <config.h>
+#include <ctype.h>
 #include <float.h>
 #include <getopt.h>
 #include <stdckdint.h>
@ -24,9 +25,9 @@

 #include "argmatch.h"
 #include "c-ctype.h"
-#include "cu-ctype.h"
 #include "mbswidth.h"
 #include "quote.h"
+#include "skipchars.h"
 #include "system.h"
 #include "xstrtol.h"

@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result,
  return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
 }

+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
 /* Return a pointer to the beginning of the next field in line.
   The line pointer is moved to the end of the next field. */
 static char*
@ -1334,11 +1341,8 @@ next_field (char **line)
  else
    {
      /* keep any space prefix in the returned field */
-      while (*field_end && field_sep (*field_end))
-        ++field_end;
-
-      while (*field_end && ! field_sep (*field_end))
-        ++field_end;
+      field_end = skip_str_matching (field_end, newline_or_blank, true);
+      field_end = skip_str_matching (field_end, newline_or_blank, false);
    }

  *line = field_end;
--- a/src/sort.c
+++ b/src/sort.c
@ -22,6 +22,7 @@

 #include <config.h>

+#include <ctype.h>
 #include <getopt.h>
 #include <pthread.h>
 #include <sys/resource.h>
@ -31,7 +32,6 @@
 #include "system.h"
 #include "argmatch.h"
 #include "assure.h"
-#include "cu-ctype.h"
 #include "fadvise.h"
 #include "filevercmp.h"
 #include "flexmember.h"
@ -1293,9 +1293,9 @@ inittables (void)

  for (i = 0; i < UCHAR_LIM; ++i)
    {
-      blanks[i] = field_sep (i);
+      blanks[i] = i == '\n' || isblank (i);
+      nondictionary[i] = ! blanks[i] && ! isalnum (i);
      nonprinting[i] = ! isprint (i);
-      nondictionary[i] = ! isalnum (i) && ! field_sep (i);
      fold_toupper[i] = toupper (i);
    }

--- a/src/uniq.c
+++ b/src/uniq.c
@ -23,10 +23,11 @@

 #include "system.h"
 #include "argmatch.h"
-#include "cu-ctype.h"
 #include "linebuffer.h"
 #include "fadvise.h"
+#include "mcel.h"
 #include "posixver.h"
+#include "skipchars.h"
 #include "stdio--.h"
 #include "xstrtol.h"
 #include "memcasecmp.h"
@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid)
  return MIN (size, SIZE_MAX);
 }

+static bool
+newline_or_blank (mcel_t g)
+{
+  return g.ch == '\n' || c32isblank (g.ch);
+}
+
 /* Given a linebuffer LINE,
   return a pointer to the beginning of the line's field to be compared. */

@ -256,21 +263,19 @@ static char *
 find_field (struct linebuffer const *line)
 {
  size_t count;
-  char const *lp = line->buffer;
-  size_t size = line->length - 1;
-  size_t i = 0;
+  char *lp = line->buffer;
+  char const *lim = lp + line->length - 1;

-  for (count = 0; count < skip_fields && i < size; count++)
+  for (count = 0; count < skip_fields && lp < lim; count++)
    {
-      while (i < size && field_sep (lp[i]))
-        i++;
-      while (i < size && !field_sep (lp[i]))
-        i++;
+      lp = skip_buf_matching (lp, lim, newline_or_blank, true);
+      lp = skip_buf_matching (lp, lim, newline_or_blank, false);
    }

-  i += MIN (skip_chars, size - i);
+  for (size_t s = skip_chars; lp < lim && s; s--)
+    lp += mcel_scan (lp, lim).len;

-  return line->buffer + i;
+  return lp;
 }

 /* Return false if two strings OLD and NEW match, true if not.
--- a/tests/local.mk
+++ b/tests/local.mk
@ -271,6 +271,7 @@ all_tests =					\
  tests/misc/mktemp.pl				\
  tests/misc/arch.sh				\
  tests/misc/join.pl				\
+  tests/misc/join-utf8.sh			\
  tests/pr/pr-tests.pl				\
  tests/pwd/pwd-option.sh			\
  tests/chcon/chcon-fail.sh			\
--- a/tests/misc/join-utf8.sh
+++ b/tests/misc/join-utf8.sh
@ -0,0 +1,51 @@
+#!/bin/sh
+# Test join in a UTF-8 locale.
+
+# Copyright 2023 Free Software Foundation, Inc.
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
+print_ver_ join
+
+test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
+
+LC_ALL=$LOCALE_FR_UTF8
+export LC_ALL
+
+fail=0
+
+vertical_line='|'
+multiplication_sign='×'
+en_dash='–'
+old_Persian_word_divider='𐏐'
+
+for s in \
+    "$vertical_line" \
+    "$multiplication_sign" \
+    "$en_dash" \
+    "$old_Persian_word_divider"
+do  # separators that are 1, 2, 3 and 4 bytes long in UTF-8, respectively
+  printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
+    framework_failure_
+  printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
+    framework_failure_
+  join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
+  printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
+         "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
+    framework_failure_
+  compare exp out || fail=1
+done
+
+Exit $fail