paste: support multi-byte delimiters

* src/paste.c (collapse_escapes): This is the central --delimiters
parsing function, so adjust to handle multi-byte chars with
mcel_scanz().  Populate a delimiters length array to support
characters of differing lengths.
(paste_serial): Use the delimiters length array to output
the appropriate delimiter.
(paste_parallel): Likewise.
* tests/paste/multi-byte.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
This commit is contained in:
Pádraig Brady 2026-01-12 23:41:29 +00:00
parent e326d0df65
commit 06d228043f
5 changed files with 213 additions and 76 deletions

2
NEWS
View File

@ -81,6 +81,8 @@ GNU coreutils NEWS -*- outline -*-
'du' now processes directories with 10,000 or more entries up to 9 times
faster on the Lustre file system.
'paste' now supports multi-byte --delimiters characters.
'pinky' will now exit immediately upon receiving a write error, which is
significant when reading large plan or project files.

View File

@ -42,6 +42,7 @@
#include <sys/types.h>
#include "system.h"
#include "fadvise.h"
#include "mcel.h"
/* The official name of this program (e.g., no 'g' prefix). */
#define PROGRAM_NAME "paste"
@ -50,9 +51,6 @@
proper_name ("David M. Ihnat"), \
proper_name ("David MacKenzie")
/* Indicates that no delimiter should be added in the current position. */
#define EMPTY_DELIM '\0'
/* If nonzero, we have read standard input at some point. */
static bool have_read_stdin;
@ -60,11 +58,16 @@ static bool have_read_stdin;
corresponding lines from each file in parallel. */
static bool serial_merge;
/* The delimiters between lines of input files (used cyclically). */
/* The delimiters between lines of input files (used cyclically).
This stores the raw bytes of all delimiters concatenated. */
static char *delims;
/* A pointer to the character after the end of 'delims'. */
static char const *delim_end;
/* Length of each delimiter in bytes (supports multi-byte characters).
A length of 0 indicates no delimiter at this position (from \0 escape). */
static size_t *delim_lens;
/* Number of delimiters. */
static idx_t num_delims;
static unsigned char line_delim = '\n';
@ -78,10 +81,10 @@ static struct option const longopts[] =
{nullptr, 0, nullptr, 0}
};
/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting
backslash representations of special characters in STRPTR to their actual
values. The set of possible backslash characters has been expanded beyond
that recognized by the Unix version.
/* Set globals delims, delim_end, delim_lens, and num_delims.
Process STRPTR converting backslash representations of special characters
to their actual values. The set of possible backslash characters has been
expanded beyond that recognized by the Unix version.
Return 0 upon success.
If the string ends in an odd number of backslashes, ignore the
final backslash and return nonzero. */
@ -93,62 +96,65 @@ collapse_escapes (char const *strptr)
bool backslash_at_end = false;
delims = strout;
delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
while (*strptr)
char const *s = strptr;
idx_t idx = 0;
while (*s)
{
if (*strptr != '\\') /* Is it an escape character? */
*strout++ = *strptr++; /* No, just transfer it. */
else
if (*s == '\\')
{
switch (*++strptr)
s++;
if (*s == '\0')
{
case '0':
*strout++ = EMPTY_DELIM;
break;
case 'b':
*strout++ = '\b';
break;
case 'f':
*strout++ = '\f';
break;
case 'n':
*strout++ = '\n';
break;
case 'r':
*strout++ = '\r';
break;
case 't':
*strout++ = '\t';
break;
case 'v':
*strout++ = '\v';
break;
case '\\':
*strout++ = '\\';
break;
case '\0':
backslash_at_end = true;
goto done;
default:
*strout++ = *strptr;
break;
}
strptr++;
else if (*s == '0')
{
/* Empty delimiter at this position. */
s++;
delim_lens[idx++] = 0;
}
else
{
switch (*s)
{
case 'b': *strout++ = '\b'; break;
case 'f': *strout++ = '\f'; break;
case 'n': *strout++ = '\n'; break;
case 'r': *strout++ = '\r'; break;
case 't': *strout++ = '\t'; break;
case 'v': *strout++ = '\v'; break;
case '\\': *strout++ = '\\'; break;
default: goto copy_character;
}
s++;
delim_lens[idx++] = 1;
}
continue;
}
copy_character:
mcel_t g = mcel_scanz (s);
strout = mempcpy (strout, s, g.len);
s += g.len;
delim_lens[idx++] = g.len;
}
done:
*strout = '\0';
if (idx == 0)
{
delim_lens[0] = 0;
idx = 1;
}
num_delims = idx;
delim_end = strout;
return backslash_at_end ? 1 : 0;
}
@ -161,6 +167,16 @@ xputchar (char c)
write_error ();
}
/* Write the LEN bytes of the delimiter starting at DELIMPTR to stdout.
   A LEN of zero denotes the empty delimiter (from the \0 escape), in
   which case nothing is written.  Call write_error () if fwrite does
   not report all LEN bytes as written.  */
static inline void
output_delim (char const *delimptr, size_t len)
{
  if (len == 0)
    return;

  size_t written = fwrite (delimptr, 1, len, stdout);
  if (written != len)
    write_error ();
}
/* Perform column paste on the NFILES files named in FNAMPTR.
Return true if successful, false if one or more files could not be
opened or read. */
@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
bool ok = true;
/* If all files are just ready to be closed, or will be on this
round, the string of delimiters must be preserved.
delbuf[0] through delbuf[nfiles]
store the delimiters for closed files. */
char *delbuf = xmalloc (nfiles + 2);
delbuf stores the delimiter bytes for closed files.
Size it to hold up to (nfiles - 1) delimiters. */
char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
/* Streams open to the files to process; null if the corresponding
stream is closed. */
@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
{
/* Set up for the next line. */
bool somedone = false;
char const *delimptr = delims;
size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */
idx_t delimidx = 0; /* Current delimiter index. */
idx_t delimoff = 0; /* Current offset into delims. */
idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */
for (size_t i = 0; i < nfiles && files_open; i++)
{
@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr)
else
{
/* Closed file; add delimiter to 'delbuf'. */
if (*delimptr != EMPTY_DELIM)
delbuf[delims_saved++] = *delimptr;
if (++delimptr == delim_end)
delimptr = delims;
size_t len = delim_lens[delimidx];
if (len > 0)
{
memcpy (delbuf + delims_saved, delims + delimoff, len);
delims_saved += len;
}
delimoff += len;
if (++delimidx == num_delims)
{
delimidx = 0;
delimoff = 0;
}
}
}
else
@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr)
{
if (chr != line_delim && chr != EOF)
xputchar (chr);
if (*delimptr != EMPTY_DELIM)
xputchar (*delimptr);
if (++delimptr == delim_end)
delimptr = delims;
output_delim (delims + delimoff, delim_lens[delimidx]);
delimoff += delim_lens[delimidx];
if (++delimidx == num_delims)
{
delimidx = 0;
delimoff = 0;
}
}
else
{
@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr)
{
bool ok = true; /* false if open or read errors occur. */
int charnew, charold; /* Current and previous char read. */
char const *delimptr; /* Current delimiter char. */
FILE *fileptr; /* Open for reading current file. */
for (; nfiles; nfiles--, fnamptr++)
@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr)
fadvise (fileptr, FADVISE_SEQUENTIAL);
}
delimptr = delims; /* Set up for delimiter string. */
idx_t delimidx = 0; /* Current delimiter index. */
idx_t delimoff = 0; /* Current offset into delims. */
charold = getc (fileptr);
saved_errno = errno;
@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr)
/* Process the old character. */
if (charold == line_delim)
{
if (*delimptr != EMPTY_DELIM)
xputchar (*delimptr);
if (++delimptr == delim_end)
delimptr = delims;
output_delim (delims + delimoff, delim_lens[delimidx]);
delimoff += delim_lens[delimidx];
if (++delimidx == num_delims)
{
delimidx = 0;
delimoff = 0;
}
}
else
xputchar (charold);
@ -520,6 +550,7 @@ main (int argc, char **argv)
(nfiles, &argv[optind]));
free (delims);
free (delim_lens);
if (have_read_stdin && fclose (stdin) == EOF)
error (EXIT_FAILURE, errno, "-");

View File

@ -377,9 +377,10 @@ all_tests = \
tests/od/od-j.sh \
tests/od/od-multiple-t.sh \
tests/od/od-x8.sh \
tests/misc/paste.pl \
tests/misc/pathchk.sh \
tests/misc/printenv.sh \
tests/paste/paste.pl \
tests/paste/multi-byte.sh \
tests/printf/printf.sh \
tests/printf/printf-cov.pl \
tests/printf/printf-hex.sh \

103
tests/paste/multi-byte.sh Executable file
View File

@ -0,0 +1,103 @@
#!/bin/sh
# Test multi-byte delimiter handling in paste
# Copyright (C) 2026 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ paste printf
test "$LOCALE_FR_UTF8" != none || skip_ 'French UTF-8 locale not available'
LC_ALL=$LOCALE_FR_UTF8
export LC_ALL
# UTF-8 test: 2-byte character (e.g., cent sign)
delim_cent=$(env printf '\xc2\xa2')
# UTF-8 test: 3-byte character (e.g., euro sign)
delim_euro=$(env printf '\xe2\x82\xac')
# UTF-8 test: 4-byte character (e.g., emoji: U+1F600)
delim_emoji=$(env printf '\xf0\x9f\x98\x80')
printf '1\n2\n' > f1 || framework_failure_
printf 'a\nb\n' > f2 || framework_failure_
# Test parallel mode with multi-byte delimiters
for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
paste -d "$delim" f1 f2 > out || fail=1
printf "1${delim}a\n2${delim}b\n" > exp || framework_failure_
compare exp out || fail=1
done
# Test serial mode with multi-byte delimiters.
# Exercise 2-, 3- and 4-byte characters, the same set as the parallel
# mode test, so that the serial code path is covered for every length.
printf '1\n2\n3\n' > f3 || framework_failure_
for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
  # In serial mode the lines of a single file are joined by the delimiter.
  paste -s -d "$delim" f3 > out || fail=1
  printf "1${delim}2${delim}3\n" > exp || framework_failure_
  compare exp out || fail=1
done
# Test multiple multi-byte delimiters cycling
printf 'a\nb\nc\n' > f4 || framework_failure_
printf '1\n2\n3\n' > f5 || framework_failure_
printf 'x\ny\nz\n' > f6 || framework_failure_
paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1
printf "a${delim_cent}1${delim_euro}x\n" > exp || framework_failure_
printf "b${delim_cent}2${delim_euro}y\n" >> exp || framework_failure_
printf "c${delim_cent}3${delim_euro}z\n" >> exp || framework_failure_
compare exp out || fail=1
# Test multi-byte delimiters mixed with empty delimiter (\0)
paste -s -d "${delim_euro}\\0" f3 > out || fail=1
printf "1${delim_euro}23\n" > exp || framework_failure_
compare exp out || fail=1
# Test invalid UTF-8 sequences are still passed through
# bad_unicode is not defined in this file (presumably provided by the
# test framework sourced above); it is expected to emit invalid UTF-8.
delims_invalid=$(bad_unicode)
# Keep only the first byte: with two input files only the first delimiter
# position is used, and the expectation below is that this first byte is
# output on its own rather than as part of a longer sequence.
delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1)
paste -d "$delims_invalid" f1 f2 > out || fail=1
printf "1${delim_invalid}a\n2${delim_invalid}b\n" > exp || framework_failure_
compare exp out || fail=1
# Test that \<multi-byte char> is treated like <multi-byte char>
# (unknown escapes pass through the escaped character)
paste -d "\\${delim_euro}" f1 f2 > out || fail=1
paste -d "$delim_euro" f1 f2 > exp || fail=1
compare exp out || fail=1
# Test GB18030 encoding if available
# Switch locale unconditionally, then run the checks only if the locale
# really took effect: 'locale charmap' must report GB18030 (sed normalizes
# a lower-case "gb" prefix).  Otherwise this whole section is skipped.
export LC_ALL=zh_CN.gb18030
if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then
# GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char)
delim_gb18030=$(env printf '\xa2\xe3')
# Parallel mode: the 2-byte delimiter separates corresponding lines.
paste -d "$delim_gb18030" f1 f2 > out || fail=1
printf "1${delim_gb18030}a\n2${delim_gb18030}b\n" > exp || framework_failure_
compare exp out || fail=1
# Serial mode: the 2-byte delimiter joins the lines of one file.
paste -s -d "$delim_gb18030" f3 > out || fail=1
printf "1${delim_gb18030}2${delim_gb18030}3\n" > exp || framework_failure_
compare exp out || fail=1
# Note 0xFF is invalid in GB18030, but we support all single byte delimiters
# so the lone byte must be output unchanged as the delimiter.
delim_ff=$(env printf '\xff')
paste -d "$delim_ff" f1 f2 > out || fail=1
printf "1${delim_ff}a\n2${delim_ff}b\n" > exp || framework_failure_
compare exp out || fail=1
fi
Exit $fail