mirror of
https://https.git.savannah.gnu.org/git/coreutils.git
synced 2026-01-27 01:44:21 +00:00
paste: support multi-byte delimiters
* src/paste.c (collapse_escapes): This is the central --delimiters
parsing function, so adjust to handle multi-byte chars with mcel_scanz().
Populate a delimiters length array to support characters of differing lengths.
(paste_serial): Use the delimiters length array to output the appropriate delimiter.
(paste_parallel): Likewise.
* tests/paste/multi-byte.sh: A new test.
* tests/local.mk: Reference the new test.
* NEWS: Mention the improvement.
This commit is contained in:
parent
e326d0df65
commit
06d228043f
2
NEWS
2
NEWS
@ -81,6 +81,8 @@ GNU coreutils NEWS -*- outline -*-
|
||||
'du' now processes directories with 10,000 or more entries up to 9 times
|
||||
faster on the Lustre file system.
|
||||
|
||||
'paste' now supports multi-byte --delimiters characters.
|
||||
|
||||
'pinky' will now exit immediately upon receiving a write error, which is
|
||||
significant when reading large plan or project files.
|
||||
|
||||
|
||||
181
src/paste.c
181
src/paste.c
@ -42,6 +42,7 @@
|
||||
#include <sys/types.h>
|
||||
#include "system.h"
|
||||
#include "fadvise.h"
|
||||
#include "mcel.h"
|
||||
|
||||
/* The official name of this program (e.g., no 'g' prefix). */
|
||||
#define PROGRAM_NAME "paste"
|
||||
@ -50,9 +51,6 @@
|
||||
proper_name ("David M. Ihnat"), \
|
||||
proper_name ("David MacKenzie")
|
||||
|
||||
/* Indicates that no delimiter should be added in the current position. */
|
||||
#define EMPTY_DELIM '\0'
|
||||
|
||||
/* If nonzero, we have read standard input at some point. */
|
||||
static bool have_read_stdin;
|
||||
|
||||
@ -60,11 +58,16 @@ static bool have_read_stdin;
|
||||
corresponding lines from each file in parallel. */
|
||||
static bool serial_merge;
|
||||
|
||||
/* The delimiters between lines of input files (used cyclically). */
|
||||
/* The delimiters between lines of input files (used cyclically).
|
||||
This stores the raw bytes of all delimiters concatenated. */
|
||||
static char *delims;
|
||||
|
||||
/* A pointer to the character after the end of 'delims'. */
|
||||
static char const *delim_end;
|
||||
/* Length of each delimiter in bytes (supports multi-byte characters).
|
||||
A length of 0 indicates no delimiter at this position (from \0 escape). */
|
||||
static size_t *delim_lens;
|
||||
|
||||
/* Number of delimiters. */
|
||||
static idx_t num_delims;
|
||||
|
||||
static unsigned char line_delim = '\n';
|
||||
|
||||
@ -78,10 +81,10 @@ static struct option const longopts[] =
|
||||
{nullptr, 0, nullptr, 0}
|
||||
};
|
||||
|
||||
/* Set globals delims and delim_end. Copy STRPTR to DELIMS, converting
|
||||
backslash representations of special characters in STRPTR to their actual
|
||||
values. The set of possible backslash characters has been expanded beyond
|
||||
that recognized by the Unix version.
|
||||
/* Set globals delims, delim_lens, and num_delims.
|
||||
Process STRPTR converting backslash representations of special characters
|
||||
to their actual values. The set of possible backslash characters has been
|
||||
expanded beyond that recognized by the Unix version.
|
||||
Return 0 upon success.
|
||||
If the string ends in an odd number of backslashes, ignore the
|
||||
final backslash and return nonzero. */
|
||||
@ -93,62 +96,65 @@ collapse_escapes (char const *strptr)
|
||||
bool backslash_at_end = false;
|
||||
|
||||
delims = strout;
|
||||
delim_lens = xnmalloc (MAX (1, strlen (strptr)), sizeof *delim_lens);
|
||||
|
||||
while (*strptr)
|
||||
char const *s = strptr;
|
||||
idx_t idx = 0;
|
||||
|
||||
while (*s)
|
||||
{
|
||||
if (*strptr != '\\') /* Is it an escape character? */
|
||||
*strout++ = *strptr++; /* No, just transfer it. */
|
||||
else
|
||||
if (*s == '\\')
|
||||
{
|
||||
switch (*++strptr)
|
||||
s++;
|
||||
if (*s == '\0')
|
||||
{
|
||||
case '0':
|
||||
*strout++ = EMPTY_DELIM;
|
||||
break;
|
||||
|
||||
case 'b':
|
||||
*strout++ = '\b';
|
||||
break;
|
||||
|
||||
case 'f':
|
||||
*strout++ = '\f';
|
||||
break;
|
||||
|
||||
case 'n':
|
||||
*strout++ = '\n';
|
||||
break;
|
||||
|
||||
case 'r':
|
||||
*strout++ = '\r';
|
||||
break;
|
||||
|
||||
case 't':
|
||||
*strout++ = '\t';
|
||||
break;
|
||||
|
||||
case 'v':
|
||||
*strout++ = '\v';
|
||||
break;
|
||||
|
||||
case '\\':
|
||||
*strout++ = '\\';
|
||||
break;
|
||||
|
||||
case '\0':
|
||||
backslash_at_end = true;
|
||||
goto done;
|
||||
|
||||
default:
|
||||
*strout++ = *strptr;
|
||||
break;
|
||||
}
|
||||
strptr++;
|
||||
else if (*s == '0')
|
||||
{
|
||||
/* Empty delimiter at this position. */
|
||||
s++;
|
||||
delim_lens[idx++] = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
switch (*s)
|
||||
{
|
||||
case 'b': *strout++ = '\b'; break;
|
||||
case 'f': *strout++ = '\f'; break;
|
||||
case 'n': *strout++ = '\n'; break;
|
||||
case 'r': *strout++ = '\r'; break;
|
||||
case 't': *strout++ = '\t'; break;
|
||||
case 'v': *strout++ = '\v'; break;
|
||||
case '\\': *strout++ = '\\'; break;
|
||||
default: goto copy_character;
|
||||
}
|
||||
|
||||
s++;
|
||||
delim_lens[idx++] = 1;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
copy_character:
|
||||
mcel_t g = mcel_scanz (s);
|
||||
strout = mempcpy (strout, s, g.len);
|
||||
s += g.len;
|
||||
delim_lens[idx++] = g.len;
|
||||
}
|
||||
|
||||
done:
|
||||
*strout = '\0';
|
||||
|
||||
if (idx == 0)
|
||||
{
|
||||
delim_lens[0] = 0;
|
||||
idx = 1;
|
||||
}
|
||||
|
||||
num_delims = idx;
|
||||
|
||||
delim_end = strout;
|
||||
return backslash_at_end ? 1 : 0;
|
||||
}
|
||||
|
||||
@ -161,6 +167,16 @@ xputchar (char c)
|
||||
write_error ();
|
||||
}
|
||||
|
||||
/* Write the LEN bytes of the delimiter that starts at DELIMPTR to stdout.
   A LEN of 0 denotes the empty delimiter (the \0 escape), in which case
   nothing is written.  If fwrite reports fewer than LEN bytes written,
   call write_error ().  */

static inline void
output_delim (char const *delimptr, size_t len)
{
  /* Empty delimiter: nothing to emit.  */
  if (len == 0)
    return;

  size_t written = fwrite (delimptr, 1, len, stdout);
  if (written != len)
    write_error ();
}
|
||||
|
||||
/* Perform column paste on the NFILES files named in FNAMPTR.
|
||||
Return true if successful, false if one or more files could not be
|
||||
opened or read. */
|
||||
@ -171,9 +187,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
|
||||
bool ok = true;
|
||||
/* If all files are just ready to be closed, or will be on this
|
||||
round, the string of delimiters must be preserved.
|
||||
delbuf[0] through delbuf[nfiles]
|
||||
store the delimiters for closed files. */
|
||||
char *delbuf = xmalloc (nfiles + 2);
|
||||
delbuf stores the delimiter bytes for closed files.
|
||||
Size it to hold up to (nfiles - 1) delimiters. */
|
||||
char *delbuf = xmalloc ((nfiles - 1) * MB_CUR_MAX + 1);
|
||||
|
||||
/* Streams open to the files to process; null if the corresponding
|
||||
stream is closed. */
|
||||
@ -218,8 +234,9 @@ paste_parallel (size_t nfiles, char **fnamptr)
|
||||
{
|
||||
/* Set up for the next line. */
|
||||
bool somedone = false;
|
||||
char const *delimptr = delims;
|
||||
size_t delims_saved = 0; /* Number of delims saved in 'delbuf'. */
|
||||
idx_t delimidx = 0; /* Current delimiter index. */
|
||||
idx_t delimoff = 0; /* Current offset into delims. */
|
||||
idx_t delims_saved = 0; /* Bytes saved in 'delbuf'. */
|
||||
|
||||
for (size_t i = 0; i < nfiles && files_open; i++)
|
||||
{
|
||||
@ -292,10 +309,18 @@ paste_parallel (size_t nfiles, char **fnamptr)
|
||||
else
|
||||
{
|
||||
/* Closed file; add delimiter to 'delbuf'. */
|
||||
if (*delimptr != EMPTY_DELIM)
|
||||
delbuf[delims_saved++] = *delimptr;
|
||||
if (++delimptr == delim_end)
|
||||
delimptr = delims;
|
||||
size_t len = delim_lens[delimidx];
|
||||
if (len > 0)
|
||||
{
|
||||
memcpy (delbuf + delims_saved, delims + delimoff, len);
|
||||
delims_saved += len;
|
||||
}
|
||||
delimoff += len;
|
||||
if (++delimidx == num_delims)
|
||||
{
|
||||
delimidx = 0;
|
||||
delimoff = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
@ -308,10 +333,13 @@ paste_parallel (size_t nfiles, char **fnamptr)
|
||||
{
|
||||
if (chr != line_delim && chr != EOF)
|
||||
xputchar (chr);
|
||||
if (*delimptr != EMPTY_DELIM)
|
||||
xputchar (*delimptr);
|
||||
if (++delimptr == delim_end)
|
||||
delimptr = delims;
|
||||
output_delim (delims + delimoff, delim_lens[delimidx]);
|
||||
delimoff += delim_lens[delimidx];
|
||||
if (++delimidx == num_delims)
|
||||
{
|
||||
delimidx = 0;
|
||||
delimoff = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -337,7 +365,6 @@ paste_serial (size_t nfiles, char **fnamptr)
|
||||
{
|
||||
bool ok = true; /* false if open or read errors occur. */
|
||||
int charnew, charold; /* Current and previous char read. */
|
||||
char const *delimptr; /* Current delimiter char. */
|
||||
FILE *fileptr; /* Open for reading current file. */
|
||||
|
||||
for (; nfiles; nfiles--, fnamptr++)
|
||||
@ -361,7 +388,8 @@ paste_serial (size_t nfiles, char **fnamptr)
|
||||
fadvise (fileptr, FADVISE_SEQUENTIAL);
|
||||
}
|
||||
|
||||
delimptr = delims; /* Set up for delimiter string. */
|
||||
idx_t delimidx = 0; /* Current delimiter index. */
|
||||
idx_t delimoff = 0; /* Current offset into delims. */
|
||||
|
||||
charold = getc (fileptr);
|
||||
saved_errno = errno;
|
||||
@ -378,11 +406,13 @@ paste_serial (size_t nfiles, char **fnamptr)
|
||||
/* Process the old character. */
|
||||
if (charold == line_delim)
|
||||
{
|
||||
if (*delimptr != EMPTY_DELIM)
|
||||
xputchar (*delimptr);
|
||||
|
||||
if (++delimptr == delim_end)
|
||||
delimptr = delims;
|
||||
output_delim (delims + delimoff, delim_lens[delimidx]);
|
||||
delimoff += delim_lens[delimidx];
|
||||
if (++delimidx == num_delims)
|
||||
{
|
||||
delimidx = 0;
|
||||
delimoff = 0;
|
||||
}
|
||||
}
|
||||
else
|
||||
xputchar (charold);
|
||||
@ -520,6 +550,7 @@ main (int argc, char **argv)
|
||||
(nfiles, &argv[optind]));
|
||||
|
||||
free (delims);
|
||||
free (delim_lens);
|
||||
|
||||
if (have_read_stdin && fclose (stdin) == EOF)
|
||||
error (EXIT_FAILURE, errno, "-");
|
||||
|
||||
@ -377,9 +377,10 @@ all_tests = \
|
||||
tests/od/od-j.sh \
|
||||
tests/od/od-multiple-t.sh \
|
||||
tests/od/od-x8.sh \
|
||||
tests/misc/paste.pl \
|
||||
tests/misc/pathchk.sh \
|
||||
tests/misc/printenv.sh \
|
||||
tests/paste/paste.pl \
|
||||
tests/paste/multi-byte.sh \
|
||||
tests/printf/printf.sh \
|
||||
tests/printf/printf-cov.pl \
|
||||
tests/printf/printf-hex.sh \
|
||||
|
||||
103
tests/paste/multi-byte.sh
Executable file
103
tests/paste/multi-byte.sh
Executable file
@ -0,0 +1,103 @@
|
||||
#!/bin/sh
# Test multi-byte delimiter handling in paste

# Copyright (C) 2026 Free Software Foundation, Inc.

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src
print_ver_ paste printf

test "$LOCALE_FR_UTF8" != none || skip_ 'French UTF-8 locale not available'

LC_ALL=$LOCALE_FR_UTF8
export LC_ALL

# Note on building expected output: delimiters are always handed to printf
# as '%s' arguments, never spliced into the format string.  Trail bytes of
# multi-byte encodings may coincide with printf metacharacters (e.g., 0x5C
# '\' is a valid GB18030 trail byte), and would then be parsed as format
# syntax instead of being copied verbatim.

# UTF-8 test: 2-byte character (e.g., cent sign)
delim_cent=$(env printf '\xc2\xa2')
# UTF-8 test: 3-byte character (e.g., euro sign)
delim_euro=$(env printf '\xe2\x82\xac')
# UTF-8 test: 4-byte character (e.g., emoji: U+1F600)
delim_emoji=$(env printf '\xf0\x9f\x98\x80')

printf '1\n2\n' > f1 || framework_failure_
printf 'a\nb\n' > f2 || framework_failure_

# Test parallel mode with multi-byte delimiters
for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
  paste -d "$delim" f1 f2 > out || fail=1
  printf '1%sa\n2%sb\n' "$delim" "$delim" > exp || framework_failure_
  compare exp out || fail=1
done

# Test serial mode with multi-byte delimiters
# (same 2-, 3- and 4-byte delimiters as the parallel mode test above)
printf '1\n2\n3\n' > f3 || framework_failure_
for delim in "$delim_cent" "$delim_euro" "$delim_emoji"; do
  paste -s -d "$delim" f3 > out || fail=1
  printf '1%s2%s3\n' "$delim" "$delim" > exp || framework_failure_
  compare exp out || fail=1
done

# Test multiple multi-byte delimiters cycling
printf 'a\nb\nc\n' > f4 || framework_failure_
printf '1\n2\n3\n' > f5 || framework_failure_
printf 'x\ny\nz\n' > f6 || framework_failure_
paste -d "${delim_cent}${delim_euro}" f4 f5 f6 > out || fail=1
printf 'a%s1%sx\n' "$delim_cent" "$delim_euro" > exp || framework_failure_
printf 'b%s2%sy\n' "$delim_cent" "$delim_euro" >> exp || framework_failure_
printf 'c%s3%sz\n' "$delim_cent" "$delim_euro" >> exp || framework_failure_
compare exp out || fail=1

# Test multi-byte delimiters mixed with empty delimiter (\0)
paste -s -d "${delim_euro}\\0" f3 > out || fail=1
printf '1%s23\n' "$delim_euro" > exp || framework_failure_
compare exp out || fail=1

# Test that invalid UTF-8 sequences are still passed through.
# Only the first delimiter is used with two input files, and it is
# expected to be the first byte of the invalid sequence.
delims_invalid=$(bad_unicode)
delim_invalid=$(env printf '%s' "$delims_invalid" | cut -b1)
paste -d "$delims_invalid" f1 f2 > out || fail=1
printf '1%sa\n2%sb\n' "$delim_invalid" "$delim_invalid" > exp \
  || framework_failure_
compare exp out || fail=1

# Test that \<multi-byte char> is treated like <multi-byte char>
# (unknown escapes pass through the escaped character)
paste -d "\\${delim_euro}" f1 f2 > out || fail=1
paste -d "$delim_euro" f1 f2 > exp || fail=1
compare exp out || fail=1


# Test GB18030 encoding if available
LC_ALL=zh_CN.gb18030
export LC_ALL

if test "$(locale charmap 2>/dev/null | sed 's/gb/GB/')" = GB18030; then
  # GB18030 2-byte character (e.g., 0xA2 0xE3 is a valid GB18030 char)
  delim_gb18030=$(env printf '\xa2\xe3')

  paste -d "$delim_gb18030" f1 f2 > out || fail=1
  printf '1%sa\n2%sb\n' "$delim_gb18030" "$delim_gb18030" > exp \
    || framework_failure_
  compare exp out || fail=1

  paste -s -d "$delim_gb18030" f3 > out || fail=1
  printf '1%s2%s3\n' "$delim_gb18030" "$delim_gb18030" > exp \
    || framework_failure_
  compare exp out || fail=1

  # Note 0xFF is invalid in GB18030, but we support all single byte delimiters
  delim_ff=$(env printf '\xff')
  paste -d "$delim_ff" f1 f2 > out || fail=1
  printf '1%sa\n2%sb\n' "$delim_ff" "$delim_ff" > exp || framework_failure_
  compare exp out || fail=1
fi

Exit $fail
|
||||
Loading…
x
Reference in New Issue
Block a user