join,uniq: support multi-byte separators

* NEWS: Mention this.
* bootstrap.conf (gnulib_modules): Remove cu-ctype, as this module
is now more trouble than it’s worth.  All uses removed.
Add skipchars.
* gl/lib/cu-ctype.c, gl/lib/cu-ctype.h, gl/modules/cu-ctype:
Remove.
* gl/lib/skipchars.c, gl/lib/skipchars.h, gl/modules/skipchars:
* tests/misc/join-utf8.sh:
New files.
* src/join.c: Include skipchars.h and mcel.h instead of cu-ctype.h.
(tab): Now mcel_t, not int.  All uses changed.
(output_separator, output_seplen): New static vars.
(eq_tab, newline_or_blank, comma_or_blank): New functions.
(xfields, prfields, prjoin, add_field_list, main):
Support multi-byte characters.
* src/numfmt.c: Include ctype.h, skipchars.h.
Do not include cu-ctype.h.
(newline_or_blank): New function.
(next_field): Support multi-byte characters.
* src/sort.c: Include ctype.h instead of cu-ctype.h.
(inittables): Open-code field_sep since it no longer exists.
‘sort’ is not multi-byte safe yet, but when it is this code
will need revamping anyway.
* src/uniq.c: Include mcel.h and skipchars.h instead of cu-ctype.h.
(newline_or_blank): New function.
(find_field): Support multi-byte characters.
* tests/local.mk (all_tests): Add tests/misc/join-utf8.sh
This commit is contained in:
Paul Eggert 2023-10-30 00:32:51 -07:00
parent 2709bea0f4
commit 11b01fc21f
14 changed files with 244 additions and 128 deletions

5
NEWS
View File

@ -8,6 +8,11 @@ GNU coreutils NEWS -*- outline -*-
to preserve ownership" when copying to GNU/Linux CIFS file systems.
They do this by working around some Linux CIFS bugs.
join and uniq now support multi-byte characters better.
For example, 'join -tX' now works even if X is a multi-byte character,
and both programs now treat multi-byte characters like U+3000
IDEOGRAPHIC SPACE as blanks if the current locale treats them so.
numfmt options like --suffix no longer have an arbitrary 127-byte limit.
[bug introduced with numfmt in coreutils-8.21]

View File

@ -70,7 +70,6 @@ gnulib_modules="
crypto/sha256
crypto/sha512
crypto/sm3
cu-ctype
cycle-check
d-ino
d-type
@ -241,6 +240,7 @@ gnulib_modules="
settime
sig2str
sigaction
skipchars
smack
ssize_t
stat-macros

View File

@ -1,3 +0,0 @@
#include <config.h>
#define CU_CTYPE_INLINE _GL_EXTERN_INLINE
#include <cu-ctype.h>

View File

@ -1,35 +0,0 @@
/* Character type definitions for coreutils
Copyright 2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <ctype.h>
#ifndef _GL_INLINE_HEADER_BEGIN
# error "Please include config.h first."
#endif
_GL_INLINE_HEADER_BEGIN
#ifndef CU_CTYPE_INLINE
# define CU_CTYPE_INLINE _GL_INLINE
#endif
/* '\n' is considered a field separator with --zero-terminated. */
CU_CTYPE_INLINE bool
field_sep (unsigned char ch)
{
return isblank (ch) || ch == '\n';
}
_GL_INLINE_HEADER_END

3
gl/lib/skipchars.c Normal file
View File

@ -0,0 +1,3 @@
#include <config.h>
#define SKIPCHARS_INLINE _GL_EXTERN_INLINE
#include <skipchars.h>

56
gl/lib/skipchars.h Normal file
View File

@ -0,0 +1,56 @@
/* Skipping sequences of characters satisfying a predicate
Copyright 2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include "mcel.h"
_GL_INLINE_HEADER_BEGIN
#ifndef SKIPCHARS_INLINE
# define SKIPCHARS_INLINE _GL_INLINE
#endif
/* Scan the NUL-terminated string STR one possibly multi-byte character
   (or encoding error) G at a time, and return a pointer just past the
   longest leading run for which PREDICATE (G) equals OK.  The scan
   stops at the terminating NUL, which is never passed to PREDICATE.  */
SKIPCHARS_INLINE char *
skip_str_matching (char const *str, bool (*predicate) (mcel_t), bool ok)
{
  char const *p = str;
  while (*p)
    {
      mcel_t g = mcel_scanz (p);
      if (predicate (g) != ok)
        break;
      p += g.len;
    }
  return (char *) p;
}
/* Scan BUF, whose end is LIM, one possibly multi-byte character (or
   encoding error) G at a time, and return a pointer just past the
   longest leading run for which PREDICATE (G) equals OK.  Scanning
   stops once the position is no longer before LIM, so bytes at or
   after LIM are never passed to PREDICATE.  */
SKIPCHARS_INLINE char *
skip_buf_matching (char const *buf, char const *lim,
                   bool (*predicate) (mcel_t), bool ok)
{
  char const *p = buf;
  while (p < lim)
    {
      mcel_t g = mcel_scan (p, lim);
      if (predicate (g) != ok)
        break;
      p += g.len;
    }
  return (char *) p;
}
_GL_INLINE_HEADER_END

View File

@ -1,24 +0,0 @@
Description:
ctype.h-like definitions for coreutils
Files:
lib/cu-ctype.c
lib/cu-ctype.h
Depends-on:
ctype
extern-inline
configure.ac:
Makefile.am:
lib_SOURCES += cu-ctype.c
Include:
"cu-ctype.h"
License:
GPL
Maintainer:
all

24
gl/modules/skipchars Normal file
View File

@ -0,0 +1,24 @@
Description:
Skip sequences of multi-byte characters or encoding errors
Files:
lib/skipchars.c
lib/skipchars.h
Depends-on:
extern-inline
mcel
configure.ac:
Makefile.am:
lib_SOURCES += skipchars.c
Include:
"skipchars.h"
License:
GPL
Maintainer:
all

View File

@ -23,12 +23,13 @@
#include "system.h"
#include "assure.h"
#include "cu-ctype.h"
#include "fadvise.h"
#include "hard-locale.h"
#include "linebuffer.h"
#include "mcel.h"
#include "memcasecmp.h"
#include "quote.h"
#include "skipchars.h"
#include "stdio--.h"
#include "xmemcoll.h"
#include "xstrtol.h"
@ -135,10 +136,14 @@ static struct outlist outlist_head;
/* Last element in 'outlist', where a new element can be added. */
static struct outlist *outlist_end = &outlist_head;
/* Tab character separating fields. If negative, fields are separated
by any nonempty string of blanks, otherwise by exactly one
tab character whose value (when cast to unsigned char) equals TAB. */
static int tab = -1;
/* Tab character (or encoding error) separating fields. If TAB.len == 0,
fields are separated by any nonempty string of blanks, otherwise by
exactly one tab character (or encoding error) equal to TAB. */
static mcel_t tab;
/* The output separator to use, and its length in bytes. */
static char const *output_separator = " ";
static idx_t output_seplen = 1;
/* If nonzero, check that the input is correctly ordered. */
static enum
@ -267,6 +272,18 @@ extract_field (struct line *line, char *field, idx_t len)
++(line->nfields);
}
/* Return true if G compares equal to the field separator TAB
   according to mcel_cmp.  */
static bool
eq_tab (mcel_t g)
{
  int order = mcel_cmp (g, tab);
  return order == 0;
}
/* Return true if G is a newline, or is a blank according to
   c32isblank.  */
static bool
newline_or_blank (mcel_t g)
{
  if (g.ch == '\n')
    return true;
  return c32isblank (g.ch) != 0;
}
/* Fill in the 'fields' structure in LINE. */
static void
@ -278,34 +295,29 @@ xfields (struct line *line)
if (ptr == lim)
return;
if (0 <= tab && tab != '\n')
if (!tab.len)
{
char *sep;
for (; (sep = memchr (ptr, tab, lim - ptr)) != nullptr; ptr = sep + 1)
extract_field (line, ptr, sep - ptr);
}
else if (tab < 0)
{
/* Skip leading blanks before the first field. */
while (field_sep (*ptr))
if (++ptr == lim)
return;
do
while (ptr < lim)
{
char *sep;
for (sep = ptr + 1; sep != lim && ! field_sep (*sep); sep++)
continue;
ptr = skip_buf_matching (ptr, lim, newline_or_blank, true);
/* Stop when only separators remained.  skip_buf_matching never
   examines bytes at or past LIM, so compare against LIM rather than
   testing for NUL: the byte at LIM is outside the scanned range and
   need not be NUL, and a NUL inside the line is ordinary data.
   Testing *PTR here would extract a spurious empty field for a line
   with trailing blanks.  */
if (ptr == lim)
  break;
char *sep = skip_buf_matching (ptr, lim, newline_or_blank, false);
extract_field (line, ptr, sep - ptr);
if (sep == lim)
return;
for (ptr = sep + 1; ptr != lim && field_sep (*ptr); ptr++)
continue;
ptr = sep;
}
while (ptr != lim);
}
else
{
if (tab.ch != '\n')
for (char *sep;
((sep = skip_buf_matching (ptr, lim, eq_tab, false))
< lim);
ptr = sep + mcel_scan (sep, lim).len)
extract_field (line, ptr, sep - ptr);
extract_field (line, ptr, lim - ptr);
extract_field (line, ptr, lim - ptr);
}
}
static void
@ -568,16 +580,15 @@ prfields (struct line const *line, idx_t join_field, idx_t autocount)
{
idx_t i;
idx_t nfields = autoformat ? autocount : line->nfields;
char output_separator = tab < 0 ? ' ' : tab;
for (i = 0; i < join_field && i < nfields; ++i)
{
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
for (i = join_field + 1; i < nfields; ++i)
{
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
prfield (i, line);
}
}
@ -588,7 +599,6 @@ static void
prjoin (struct line const *line1, struct line const *line2)
{
const struct outlist *outlist;
char output_separator = tab < 0 ? ' ' : tab;
idx_t field;
struct line const *line;
@ -622,7 +632,7 @@ prjoin (struct line const *line1, struct line const *line2)
o = o->next;
if (o == nullptr)
break;
putchar (output_separator);
fwrite (output_separator, 1, output_seplen, stdout);
}
putchar (eolchar);
}
@ -886,6 +896,12 @@ decode_field_spec (char const *s, int *file_index, idx_t *field_index)
}
}
/* Return true if G is a comma, or is a blank according to
   c32isblank.  */
static bool
comma_or_blank (mcel_t g)
{
  if (g.ch == ',')
    return true;
  return c32isblank (g.ch) != 0;
}
/* Add the comma or blank separated field spec(s) in STR to 'outlist'. */
static void
@ -898,14 +914,17 @@ add_field_list (char *str)
int file_index;
idx_t field_index;
char const *spec_item = p;
p = strpbrk (p, ", \t");
if (p)
*p++ = '\0';
p = skip_str_matching (spec_item, comma_or_blank, false);
if (*p)
{
mcel_t g = mcel_scanz (p);
*p = '\0';
p += g.len;
}
decode_field_spec (spec_item, &file_index, &field_index);
add_field (file_index, field_index);
}
while (p);
while (*p);
}
/* Set the join field *VAR to VAL, but report an error if *VAR is set
@ -1087,20 +1106,30 @@ main (int argc, char **argv)
case 't':
{
unsigned char newtab = optarg[0];
if (! newtab)
newtab = '\n'; /* '' => process the whole line. */
else if (optarg[1])
mcel_t newtab;
if (!*optarg)
{
if (STREQ (optarg, "\\0"))
newtab = '\0';
else
/* '' => process the whole line. */
newtab = mcel_ch ('\n', 1);
/* output_separator does not matter. */
}
else if (STREQ (optarg, "\\0"))
{
newtab = mcel_ch ('\0', 1);
output_separator = "";
}
else
{
newtab = mcel_scanz (optarg);
if (optarg[newtab.len])
error (EXIT_FAILURE, 0, _("multi-character tab %s"),
quote (optarg));
output_separator = optarg;
}
if (0 <= tab && tab != newtab)
if (tab.len && mcel_cmp (tab, newtab) != 0)
error (EXIT_FAILURE, 0, _("incompatible tabs"));
tab = newtab;
output_seplen = newtab.len;
}
break;

View File

@ -15,6 +15,7 @@
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
#include <ctype.h>
#include <float.h>
#include <getopt.h>
#include <stdckdint.h>
@ -24,9 +25,9 @@
#include "argmatch.h"
#include "c-ctype.h"
#include "cu-ctype.h"
#include "mbswidth.h"
#include "quote.h"
#include "skipchars.h"
#include "system.h"
#include "xstrtol.h"
@ -1314,6 +1315,12 @@ process_suffixed_number (char *text, long double *result,
return (e == SSE_OK || e == SSE_OK_PRECISION_LOSS);
}
/* Return true if G is a newline, or is a blank according to
   c32isblank.  */
static bool
newline_or_blank (mcel_t g)
{
  if (g.ch == '\n')
    return true;
  return c32isblank (g.ch) != 0;
}
/* Return a pointer to the beginning of the next field in line.
The line pointer is moved to the end of the next field. */
static char*
@ -1334,11 +1341,8 @@ next_field (char **line)
else
{
/* keep any space prefix in the returned field */
while (*field_end && field_sep (*field_end))
++field_end;
while (*field_end && ! field_sep (*field_end))
++field_end;
field_end = skip_str_matching (field_end, newline_or_blank, true);
field_end = skip_str_matching (field_end, newline_or_blank, false);
}
*line = field_end;

View File

@ -22,6 +22,7 @@
#include <config.h>
#include <ctype.h>
#include <getopt.h>
#include <pthread.h>
#include <sys/resource.h>
@ -31,7 +32,6 @@
#include "system.h"
#include "argmatch.h"
#include "assure.h"
#include "cu-ctype.h"
#include "fadvise.h"
#include "filevercmp.h"
#include "flexmember.h"
@ -1293,9 +1293,9 @@ inittables (void)
for (i = 0; i < UCHAR_LIM; ++i)
{
blanks[i] = field_sep (i);
blanks[i] = i == '\n' || isblank (i);
nondictionary[i] = ! blanks[i] && ! isalnum (i);
nonprinting[i] = ! isprint (i);
nondictionary[i] = ! isalnum (i) && ! field_sep (i);
fold_toupper[i] = toupper (i);
}

View File

@ -23,10 +23,11 @@
#include "system.h"
#include "argmatch.h"
#include "cu-ctype.h"
#include "linebuffer.h"
#include "fadvise.h"
#include "mcel.h"
#include "posixver.h"
#include "skipchars.h"
#include "stdio--.h"
#include "xstrtol.h"
#include "memcasecmp.h"
@ -248,6 +249,12 @@ size_opt (char const *opt, char const *msgid)
return MIN (size, SIZE_MAX);
}
/* Return true if G is a newline, or is a blank according to
   c32isblank.  */
static bool
newline_or_blank (mcel_t g)
{
  if (g.ch == '\n')
    return true;
  return c32isblank (g.ch) != 0;
}
/* Given a linebuffer LINE,
return a pointer to the beginning of the line's field to be compared. */
@ -256,21 +263,19 @@ static char *
find_field (struct linebuffer const *line)
{
size_t count;
char const *lp = line->buffer;
size_t size = line->length - 1;
size_t i = 0;
char *lp = line->buffer;
char const *lim = lp + line->length - 1;
for (count = 0; count < skip_fields && i < size; count++)
for (count = 0; count < skip_fields && lp < lim; count++)
{
while (i < size && field_sep (lp[i]))
i++;
while (i < size && !field_sep (lp[i]))
i++;
lp = skip_buf_matching (lp, lim, newline_or_blank, true);
lp = skip_buf_matching (lp, lim, newline_or_blank, false);
}
i += MIN (skip_chars, size - i);
for (size_t s = skip_chars; lp < lim && s; s--)
lp += mcel_scan (lp, lim).len;
return line->buffer + i;
return lp;
}
/* Return false if two strings OLD and NEW match, true if not.

View File

@ -271,6 +271,7 @@ all_tests = \
tests/misc/mktemp.pl \
tests/misc/arch.sh \
tests/misc/join.pl \
tests/misc/join-utf8.sh \
tests/pr/pr-tests.pl \
tests/pwd/pwd-option.sh \
tests/chcon/chcon-fail.sh \

51
tests/misc/join-utf8.sh Executable file
View File

@ -0,0 +1,51 @@
#!/bin/sh
# Test join in a UTF-8 locale.
# Copyright 2023 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ join
test "${LOCALE_FR_UTF8+set}" = set || skip_ "French UTF-8 locale not available"
LC_ALL=$LOCALE_FR_UTF8
export LC_ALL
fail=0
# Field separators to exercise.  Their UTF-8 encodings are 1, 2, 3 and
# 4 bytes long respectively, so every multi-byte length is covered.
vertical_line='|'
multiplication_sign='×'
en_dash='–'
old_Persian_word_divider='𐏐'
for s in \
  "$vertical_line" \
  "$multiplication_sign" \
  "$en_dash" \
  "$old_Persian_word_divider"
do
  # Two inputs keyed on the first field; keys 2 and 3 are unpairable.
  printf '0%sA\n1%sa\n2%sb\n4%sc\n' "$s" "$s" "$s" "$s" >a ||
    framework_failure_
  printf '0%sB\n1%sd\n3%se\n4%sf\n' "$s" "$s" "$s" "$s" >b ||
    framework_failure_
  join -t"$s" -a1 -a2 -eouch -o0,1.2,2.2 a b >out || fail=1
  # Expected output uses the same separator, with "ouch" filling the
  # missing field of each unpairable line.
  printf '0%sA%sB\n1%sa%sd\n2%sb%souch\n3%souch%se\n4%sc%sf\n' \
    "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" "$s" >exp ||
    framework_failure_
  compare exp out || fail=1
done
Exit $fail