diff: modularize and tune mcel code

Go back to a single mcel module, instead of trying to break it up
into ucore and mcel pieces, as breaking it up hurt performance.
Use gnulib-tool’s --local-dir to create diffutils-specific modules
for mcel; the idea is that this will eventually migrate into Gnulib.
* bootstrap.conf (avoided_gnulib_modules): Add mbuiterf.
(gnulib_modules): Add mbscasecmp, mcel-prefer.
(gnulib_tool_option_extras): Add --local-dir=gl to pick up new files.
* cfg.mk (exclude_file_name_regexp--sc_prohibit_doubled_word):
Do not exclude now-removed files lib/ucore.c, lib/ucore.h.
* lib/Makefile.am: Adjust to use of modules.
(noinst_HEADERS): Remove mcel.h, ucore.h.
(libdiffutils_a_SOURCES): Remove mcel.c, mcel-casecmp.c, ucore.c
* lib/mcel-casecmp.c, lib/ucore.c, lib/ucore.h: Remove.
* lib/mcel.h: Switch to LGPLv2.1+.  Do not include ucore.h.
All uses of ucore_t changed back to using char32_t.
Do what ucore.h used to do: include verify.h, limits.h, stddef.h,
uchar.h; require config.h, define _GL_LIKELY, _GL_UNLIKELY.
(MCEL_CHAR_MAX, MCEL_ERR_MIN, MCEL_ERR_MAX): New constants.
(mcel_t): Switch from single ucore_t c to a char32_t ch and
unsigned char err.  This has significantly better performance on
Fedora 38 x86-64.  All uses changed.  Check that unsigned char
promotes to int.
(mcel_ch, mcel_err, mcel_cmp, mcel_tocmp): New functions.
(MCEL_ERR_SHIFT): Rename from MCEL_ENCODING_ERROR_SHIFT.
All uses changed.
(mcel_isbasic): Add a _GL_LIKELY to help compilers.  All uses changed.
(mcel_scan, mcel_scant): Simplify by using mcel_ch, mcel_err.
(mcel_casecmp): Remove decl.  Callers changed to use mbscasecmp.
* gl/lib/mcel.c, gl/lib/mcel.h: Rename from lib/mcel.c, lib/mcel.h.
* gl/lib/mbscasecmp.c: New file.
* gl/modules/mcel, gl/modules/mcel-prefer, gl/modules/mcel-tests:
* gl/tests/test-mcel.c:
New files.
* src/io.c: Revert use of ucore API.  Use plain c32isspace etc.
instead of ucore_is.  Use .err instead of ucore_iserr.
(same_ch_err): Bring back, and use it instead of ucore_cmp.
* src/side.c (print_half_line):  Use .err instead of ucore_iserr.
This commit is contained in:
Paul Eggert 2023-08-21 08:38:16 -07:00
parent 574e81bff2
commit ae1cdc7239
15 changed files with 517 additions and 322 deletions

View File

@ -18,6 +18,7 @@
avoided_gnulib_modules='
--avoid=localename
--avoid=lock-tests
--avoid=mbuiterf
--avoid=setlocale
'
@ -73,7 +74,8 @@ largefile
lstat
maintainer-makefile
manywarnings
mbrtoc32
mbscasecmp
mcel-prefer
mempcpy
minmax
mkstemp
@ -140,6 +142,7 @@ XGETTEXT_OPTIONS=$XGETTEXT_OPTIONS'\\\
'
gnulib_tool_option_extras="--tests-base=gnulib-tests
--local-dir=gl
--with-tests
--symlink
--makefile-name=gnulib.mk

3
cfg.mk
View File

@ -74,8 +74,7 @@ config-save:
cp lib/config.h config.status $(_cf_state_dir)/latest
exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$
exclude_file_name_regexp--sc_prohibit_doubled_word = \
^(tests/y2038-vs-32bit|lib/ucore\.h)$$
exclude_file_name_regexp--sc_prohibit_doubled_word = ^tests/y2038-vs-32bit$$
# Tell gnulib's tight_scope rule that we mark externs with XTERN
export _gl_TS_extern = extern|XTERN|DIFF_INLINE|SYSTEM_INLINE|SYSTEM_EXTERN

112
gl/lib/mbscasecmp.c Normal file
View File

@ -0,0 +1,112 @@
/* Case-insensitive string comparison function.
Copyright (C) 1998-1999, 2005-2023 Free Software Foundation, Inc.
Written by Bruno Haible <bruno@clisp.org>, 2005,
based on earlier glibc code.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
/* Specification. */
#include <string.h>
#include <ctype.h>
#include <limits.h>
#include <stdlib.h>
#if GNULIB_MCEL_PREFER
# include "mcel.h"
#else
# include "mbuiterf.h"
#endif
/* Compare the character strings S1 and S2, ignoring case, returning less than,
   equal to or greater than zero if S1 is lexicographically less than, equal to
   or greater than S2.
   Note: This function may, in multibyte locales, return 0 for strings of
   different lengths!  */
int
mbscasecmp (const char *s1, const char *s2)
{
  if (s1 == s2)
    return 0;

  char const *p1 = s1;
  char const *p2 = s2;

  /* Be careful not to look at the entire extent of s1 or s2 until needed.
     This is useful because when two strings differ, the difference is
     most often already in the very few first characters.  */
  if (MB_CUR_MAX > 1)
    {
#if GNULIB_MCEL_PREFER
      while (true)
        {
          mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
          mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
          int cmp = mcel_tocmp (c32tolower, g1, g2);
          /* Stop at the first difference, or when S1's terminating null
             has been reached.  An encoding error also has ch == 0, so
             the end-of-string test must check err as well; otherwise two
             strings sharing an encoding-error byte would compare equal
             without the rest of them being examined.  */
          if (cmp | !(g1.ch | g1.err))
            return cmp;
        }
#else
      mbuif_state_t state1;
      const char *iter1;
      mbuif_init (state1);
      iter1 = s1;

      mbuif_state_t state2;
      const char *iter2;
      mbuif_init (state2);
      iter2 = s2;

      while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
        {
          mbchar_t cur1 = mbuif_next (state1, iter1);
          mbchar_t cur2 = mbuif_next (state2, iter2);
          int cmp = mb_casecmp (cur1, cur2);

          if (cmp != 0)
            return cmp;

          iter1 += mb_len (cur1);
          iter2 += mb_len (cur2);
        }
      if (mbuif_avail (state1, iter1))
        /* s2 terminated before s1.  */
        return 1;
      if (mbuif_avail (state2, iter2))
        /* s1 terminated before s2.  */
        return -1;
      return 0;
#endif
    }
  else
    while (true)
      {
        unsigned char c1 = *p1++;
        unsigned char c2 = *p2++;
        /* On machines where 'char' and 'int' are types of the same size, the
           difference of two 'unsigned char' values - including the sign bit -
           doesn't fit in an 'int'.  */
        int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
        if (cmp)
          {
            /* The bytes differ; retry the comparison after folding case.  */
            c1 = tolower (c1);
            c2 = tolower (c2);
            cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
          }
        /* Stop at a difference or at the terminating null byte.  */
        if (cmp | !c1)
          return cmp;
      }
}

View File

@ -3,7 +3,7 @@
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 3 of the
published by the Free Software Foundation; either version 2.1 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
@ -16,7 +16,14 @@
/* Written by Paul Eggert. */
/* The mcel_scan function lets code iterate through an array of bytes,
/* The macros in this file implement multi-byte character representation
and forward iteration through a multi-byte string.
They are simpler and typically faster than the mbiter family.
However, they do not support obsolescent encodings like CP864,
EBCDIC, Johab, and Shift JIS that glibc also does not support,
and it is up to the caller to coalesce encoding-error bytes if desired.
The mcel_scan function lets code iterate through an array of bytes,
supporting character encodings in practical use
more simply than using plain mbrtoc32.
@ -35,8 +42,11 @@
process (g);
}
You can select from G using G.ch, G.err, and G.len.
G is an encoding error if G.err is nonzero, a character otherwise.
The mcel_scanz function is similar except it works with a
string of unknown length that is terminated with '\0'.
string of unknown but positive length that is terminated with '\0'.
Instead of this single-byte code:
char *p = ...;
@ -57,12 +67,16 @@
'\n', '.', '/' are safe, as they cannot be a part (even a trailing
byte) of a multi-byte character.
You can select from G using G.c and G.len.
You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c),
ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c).
mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.
mcel_strcasecmp compares two null-terminated multi-byte strings
lexicographically, ignoring case.
mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
character or by encoding byte value, with encoding bytes sorting
after characters.
Calls like c32isalpha (G.ch) test G; they return false for encoding
errors since calls like c32isalpha (0) return false. Calls like
mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
but transliterate first.
Although ISO C and POSIX allow encodings that have shift states or
that can produce multiple characters from an indivisible byte sequence,
@ -73,9 +87,20 @@
#ifndef _MCEL_H
#define _MCEL_H 1
/* This API is an extension of ucore.h. Programs that include this
file can assume ucore.h is included too. */
#include <ucore.h>
#if !_GL_CONFIG_H_INCLUDED
#error "Please include config.h first."
#endif
#include <verify.h>
#include <limits.h>
#include <stddef.h>
#include <uchar.h>
/* Pacify GCC re type limits. */
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
# pragma GCC diagnostic ignored "-Wtype-limits"
#endif
/* The maximum multi-byte character length supported on any platform.
This can be less than MB_LEN_MAX because many platforms have a
@ -85,24 +110,41 @@
0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. */
enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
/* mcel_t is a type representing a character or encoding error C,
along with a count of the LEN bytes that represent C.
1 <= LEN <= MB_LEN_MAX. */
/* Bounds for mcel_t members. */
enum { MCEL_CHAR_MAX = 0x10FFFF };
enum { MCEL_ERR_MIN = 0x80 };
enum { MCEL_ERR_MAX = UCHAR_MAX };
/* mcel_t is a type representing a character CH or an encoding error byte ERR,
along with a count of the LEN bytes that represent CH or ERR.
If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR <= MCEL_ERR_MAX,
CH == 0, and LEN == 1. */
typedef struct
{
ucore_t c;
char32_t ch;
unsigned char err;
unsigned char len;
} mcel_t;
/* Every multi-byte character length fits in mcel_t's LEN. */
static_assert (MB_LEN_MAX <= UCHAR_MAX);
/* Shifting an encoding error byte left by this value
suffices to sort encoding errors after characters. */
enum { MCEL_ERR_SHIFT = 14 };
static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
/* Unsigned char promotes to int. */
static_assert (UCHAR_MAX <= INT_MAX);
/* Bytes have 8 bits, as POSIX requires. */
static_assert (CHAR_BIT == 8);
/* Pacify GCC re 'c <= 0x7f' below. */
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
# pragma GCC diagnostic ignored "-Wtype-limits"
#ifndef _GL_LIKELY
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
#endif
_GL_INLINE_HEADER_BEGIN
@ -110,18 +152,44 @@ _GL_INLINE_HEADER_BEGIN
# define MCEL_INLINE _GL_INLINE
#endif
/* With mcel there should be no need for the performance overhead of
replacing glibc mbrtoc32, as callers shouldn't care whether the
C locale treats a byte with the high bit set as an encoding error. */
#ifdef __GLIBC__
# undef mbrtoc32
#endif
/* mcel_t constructors.  */

/* Return the mcel_t for the valid character CH, which is represented
   by LEN bytes.  LEN must be in the range 1 .. MCEL_LEN_MAX and CH
   must not exceed MCEL_CHAR_MAX.  The result's ERR member is zero.  */
MCEL_INLINE mcel_t
mcel_ch (char32_t ch, size_t len)
{
  assume (0 < len);
  assume (len <= MCEL_LEN_MAX);
  assume (ch <= MCEL_CHAR_MAX);
  /* Use ISO C designated initializers rather than the obsolete GNU
     'field: value' form; the unnamed ERR member is zero-initialized.  */
  return (mcel_t) {.ch = ch, .len = len};
}
/* Return the mcel_t for the encoding error byte ERR, which must be in
   the range MCEL_ERR_MIN .. MCEL_ERR_MAX.  The result has length 1
   and its CH member is zero.  */
MCEL_INLINE mcel_t
mcel_err (unsigned char err)
{
  assume (MCEL_ERR_MIN <= err);
  assume (err <= MCEL_ERR_MAX);
  /* Use ISO C designated initializers rather than the obsolete GNU
     'field: value' form; the unnamed CH member is zero-initialized.  */
  return (mcel_t) {.err = err, .len = 1};
}
/* Shifting an encoding error byte (at least 0x80) left by this value
yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
This suffices to sort encoding errors after characters. */
enum { MCEL_ENCODING_ERROR_SHIFT = 14 };
static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
/* Order C1 relative to C2.  Encoding errors sort after all characters,
   because the difference of the error bytes is scaled by
   1 << MCEL_ERR_SHIFT before the character difference is added.
   Return <0, 0, >0 for <, =, >.  */
MCEL_INLINE int
mcel_cmp (mcel_t c1, mcel_t c2)
{
  int err_delta = c1.err - c2.err;
  int left = c1.ch;
  int right = c2.ch;
  int ch_delta = left - right;
  return err_delta * (1 << MCEL_ERR_SHIFT) + ch_delta;
}
/* Compare C1 and C2 after applying the uchar translator TO to each,
   with encoding errors sorting after characters.
   Return <0, 0, >0 for <, =, >.  */
MCEL_INLINE int
mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
{
  int untranslated = mcel_cmp (c1, c2);
  int err_delta = c1.err - c2.err;

  /* The plain comparison already decides the result when the error
     bytes differ or when C1 and C2 are equal; TO is not called then.  */
  if (_GL_LIKELY (err_delta | !untranslated))
    return untranslated;

  int xlat1 = to (c1.ch);
  int xlat2 = to (c2.ch);
  return xlat1 - xlat2;
}
/* Whether C represents itself as a Unicode character
when it is the first byte of a single- or multi-byte character.
@ -130,9 +198,16 @@ static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
MCEL_INLINE bool
mcel_isbasic (char c)
{
return 0 <= c && c <= 0x7f;
return _GL_LIKELY (0 <= c && c <= 0x7f);
}
/* With mcel there should be no need for the performance overhead of
replacing glibc mbrtoc32, as callers shouldn't care whether the
C locale treats a byte with the high bit set as an encoding error. */
#ifdef __GLIBC__
# undef mbrtoc32
#endif
/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM.
Return the character or encoding error starting at P. */
MCEL_INLINE mcel_t
@ -141,8 +216,8 @@ mcel_scan (char const *p, char const *lim)
/* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
In supported encodings, the first byte of a multi-byte character
cannot be an ASCII byte. */
if (_GL_LIKELY (mcel_isbasic (*p)))
return (mcel_t) { .c = *p, .len = 1 };
if (mcel_isbasic (*p))
return mcel_ch (*p, 1);
/* An initial mbstate_t; initialization optimized for some platforms.
For details about these and other platforms, see wchar.in.h. */
@ -171,29 +246,17 @@ mcel_scan (char const *p, char const *lim)
mbstate_t mbs = {0};
#endif
char32_t c;
size_t len = mbrtoc32 (&c, p, lim - p, &mbs);
char32_t ch;
size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
/* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
is not supported and MB_LEN_MAX is small. */
if (_GL_LIKELY (len <= (size_t) -1 / 2))
{
/* A multi-byte character. LEN must be positive,
as *P != '\0' and shift sequences are not supported. */
assume (0 < len);
assume (len <= MB_LEN_MAX);
assume (c <= UCORE_CHAR_MAX);
return (mcel_t) { .c = c, .len = len };
}
else
{
/* An encoding error. */
unsigned char b = *p;
c = b << MCEL_ENCODING_ERROR_SHIFT;
assume (UCORE_ERR_MIN <= c);
assume (c <= UCORE_ERR_MAX);
return (mcel_t) { .c = c, .len = 1 };
}
if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
return mcel_err (*p);
/* A multi-byte character. LEN must be positive,
as *P != '\0' and shift sequences are not supported. */
return mcel_ch (ch, len);
}
/* Scan bytes from P, a byte sequence terminated by TERMINATOR.
@ -205,11 +268,11 @@ MCEL_INLINE mcel_t
mcel_scant (char const *p, char terminator)
{
/* Handle ASCII quickly for speed. */
if (_GL_LIKELY (mcel_isbasic (*p)))
return (mcel_t) { .c = *p, .len = 1 };
if (mcel_isbasic (*p))
return mcel_ch (*p, 1);
/* Defer to mcel_scan for non-ASCII. Compute length with code that
is typically branch-free and faster than memchr or strnlen. */
is typically faster than strnlen. */
char const *lim = p + 1;
for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
lim += *lim != terminator;
@ -226,11 +289,6 @@ mcel_scanz (char const *p)
return mcel_scant (p, '\0');
}
/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
Return <0, 0, >0 for <, =, >. Consider encoding errors to be
greater than characters and compare them byte by byte. */
int mcel_casecmp (char const *s1, char const *s2);
_GL_INLINE_HEADER_END
#endif /* _MCEL_H */

34
gl/modules/mcel Normal file
View File

@ -0,0 +1,34 @@
Description:
Multibyte Characters, Encoding errors, and Lengths
Files:
lib/mcel.c
lib/mcel.h
Depends-on:
assert-h
extern-inline
limits-h
mbrtoc32
stdbool
uchar
verify
configure.ac:
Makefile.am:
lib_SOURCES += mcel.c mcel.h
Include:
"mcel.h"
Link:
$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise
$(MBRTOWC_LIB)
$(LTLIBC32CONV) when linking with libtool, $(LIBC32CONV) otherwise
License:
LGPLv2+
Maintainer:
all

24
gl/modules/mcel-prefer Normal file
View File

@ -0,0 +1,24 @@
Description:
mcel is preferred to the mbiter family when either will do.
mcel is simpler and faster. However, it does not support some
obsolete encodings that are also not supported by glibc locales,
and the caller is responsible for coalescing sequences of
error-encoding bytes if that is desired.
Files:
Depends-on:
mcel
configure.ac:
gl_MODULE_INDICATOR([mcel-prefer])
Makefile.am:
Include:
License:
LGPLv2+
Maintainer:
Paul Eggert

12
gl/modules/mcel-tests Normal file
View File

@ -0,0 +1,12 @@
Files:
tests/test-mcel.c
Depends-on:
assert-h
setlocale
configure.ac:
Makefile.am:
TESTS += test-mcel
check_PROGRAMS += test-mcel

138
gl/tests/test-mcel.c Normal file
View File

@ -0,0 +1,138 @@
/* Test <mcel.h>
Copyright 2023 Free Software Foundation, Inc.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
#include <config.h>
#include <mcel.h>
#include <locale.h>
#include "macros.h"
/* A toy translator for exercising mcel_tocmp: keep only the low-order
   seven bits of C.  */
static wint_t
to_ascii (wint_t c)
{
  wint_t const low7 = 0x7f;
  return c & low7;
}
/* Return the sign of I: 1 if positive, -1 if negative, 0 if zero.  */
static int
sgn (int i)
{
  if (i > 0)
    return 1;
  if (i < 0)
    return -1;
  return 0;
}
/* Check that the mcel_t value C agrees with a direct mbrtoc32 call
   that returned N and stored the character CH.  UC is the byte that
   C.err should equal when C is an encoding error.  */
static void
test_mcel_vs_mbrtoc32 (unsigned char uc, mcel_t c, size_t n, char32_t ch)
{
/* C is an encoding error exactly when N exceeds MB_LEN_MAX.  */
ASSERT (!c.err == (n <= MB_LEN_MAX));
/* An error must carry byte UC with ch == 0 and len == 1; a character
   must equal CH, with length N, or 1 when N is zero.  */
ASSERT (c.err
? c.err == uc && c.ch == 0 && c.len == 1
: c.ch == ch && c.len == (n ? n : 1));
}
/* Exercise the mcel_t constructors, the comparison functions,
   mcel_isbasic, and the mcel_scan family, checking the scanners
   against mbrtoc32 on every three-byte sequence.  */
int
main (void)
{
  /* configure should already have checked that the locale is supported.  */
  if (setlocale (LC_ALL, "") == NULL)
    return 1;

  mcel_t prev;

  /* ASCII characters: one byte each, strictly increasing order.  */
  for (int ch = 0; ch < 0x80; ch++)
    {
      mcel_t c = mcel_ch (ch, 1);
      ASSERT (c.ch == ch);
      ASSERT (c.len == 1);
      ASSERT (!c.err);
      ASSERT (mcel_cmp (c, c) == 0);
      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
      if (ch)
        {
          ASSERT (mcel_cmp (prev, c) < 0);
          ASSERT (mcel_cmp (c, prev) > 0);
          ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
          ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
        }
      ASSERT (mcel_isbasic (ch));
      prev = c;
    }

  /* mcel_isbasic over every char value; the loop exits via 'break' so
     that CH never has to be incremented past CHAR_MAX.  */
  for (char ch = CHAR_MIN; ; ch++)
    {
      ASSERT (mcel_isbasic (ch) == (0 <= ch && ch <= 0x7f));
      if (ch == CHAR_MAX)
        break;
    }

  /* Non-ASCII characters, constructed here with length 2.  */
  for (int ch = 0x80; ch < 0x200; ch++)
    {
      mcel_t c = mcel_ch (ch, 2);
      ASSERT (c.ch == ch);
      ASSERT (c.len == 2);
      ASSERT (!c.err);
      ASSERT (mcel_cmp (c, c) == 0);
      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
      ASSERT (mcel_cmp (prev, c) < 0);
      ASSERT (mcel_cmp (c, prev) > 0);
      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
      /* to_ascii wraps to 0 at multiples of 0x80, which reverses the
         translated order relative to the previous character.  */
      int cmp = to_ascii (c.ch) ? -1 : 1;
      ASSERT (sgn (mcel_tocmp (to_ascii, prev, c)) == cmp);
      ASSERT (sgn (mcel_tocmp (to_ascii, c, prev)) == -cmp);
      prev = c;
    }

  /* Encoding errors: each sorts after every character and after every
     smaller error byte.  */
  for (unsigned char err = 0x80; ; err++)
    {
      mcel_t c = mcel_err (err);
      ASSERT (!c.ch);
      ASSERT (c.len == 1);
      ASSERT (c.err == err);
      ASSERT (mcel_cmp (c, c) == 0);
      ASSERT (mcel_cmp (prev, c) < 0);
      ASSERT (mcel_cmp (c, prev) > 0);
      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
      ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
      ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
      prev = c;
      if (err == (unsigned char) -1)
        break;
    }

  /* Compare the scanners with mbrtoc32 on every three-byte sequence.
     The middle loop must test and step J; testing and stepping I here
     would leave J fixed and end the outer loop after one pass.  */
  for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
    for (int j = CHAR_MIN; j <= CHAR_MAX; j++)
      for (int k = CHAR_MIN; k <= CHAR_MAX; k++)
        {
          char const ijk[] = {i, j, k};
          mbstate_t mbs = {0};
          char32_t ch;
          size_t n = mbrtoc32 (&ch, ijk, sizeof ijk, &mbs);
          mcel_t c = mcel_scan (ijk, ijk + sizeof ijk);
          test_mcel_vs_mbrtoc32 (i, c, n, ch);

          static char const terminator[] = "\r\n./";
          for (int ti = 0; ti < sizeof terminator; ti++)
            {
              char t = terminator[ti];
              if (i == t)
                continue;
              /* Scan the terminated copy: the terminator-based scanners
                 look for T beyond the three data bytes, so passing the
                 unterminated IJK could read past the end of the array.  */
              char const ijkt[] = {i, j, k, t};
              mcel_t d = mcel_scant (ijkt, t);
              ASSERT (c.ch == d.ch && c.err == d.err && c.len == d.len);
              if (!t)
                {
                  mcel_t z = mcel_scanz (ijkt);
                  ASSERT (d.ch == z.ch && d.err == z.err && d.len == z.len);
                }
            }
        }
}

View File

@ -29,7 +29,7 @@ noinst_HEADERS =
include gnulib.mk
noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c
noinst_HEADERS += cmpbuf.h diagnose.h
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c
AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)

View File

@ -1,60 +0,0 @@
/* Case-insensitive string comparison function.
Copyright 2023 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Paul Eggert. */
#include <config.h>
/* Specification. */
#include <mcel.h>
#include <ctype.h>
#include <stdlib.h>
int
mcel_casecmp (char const *s1, char const *s2)
{
char const *p1 = s1;
char const *p2 = s2;
/* Do not look at the entire extent of S1 or S2 until needed:
when two strings differ, the difference is typically early. */
if (MB_CUR_MAX == 1)
while (true)
{
static_assert (UCHAR_MAX <= INT_MAX);
unsigned char c1 = *p1++;
unsigned char c2 = *p2++;
int cmp = c1 - c2;
if (_GL_UNLIKELY (cmp))
{
c1 = tolower (c1);
c2 = tolower (c2);
cmp = c1 - c2;
}
if (_GL_UNLIKELY (cmp | !c1))
return cmp;
}
else
while (true)
{
mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
int cmp = ucore_tocmp (c32tolower, g1.c, g2.c);
if (_GL_UNLIKELY (cmp | !g1.c))
return cmp;
}
}

View File

@ -1,3 +0,0 @@
#include <config.h>
#define UCORE_INLINE _GL_EXTERN_INLINE
#include "ucore.h"

View File

@ -1,132 +0,0 @@
/* Unicode Characters OR Encoding errors (UCOREs)
Copyright 2023 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Paul Eggert. */
/* This API's fundamental type ucore_t represents
a single Unicode character or an encoding error.
ucore_iserr (C) tests whether C is an encoding error.
ucore_is (P, C) etc. test whether char class P accepts C.
ucore_to (TO, C) etc. use TO to convert C.
ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2,
with encoding errors sorting after characters. */
#ifndef _UCORE_H
#define _UCORE_H 1
#if !_GL_CONFIG_H_INCLUDED
#error "Please include config.h first."
#endif
#include <verify.h>
#include <limits.h>
#include <stddef.h>
#include <uchar.h>
/* ucore_t represents a Unicode Character OR Encoding error.
If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character.
If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error.
Other ucore_t values C are invalid. */
typedef int ucore_t;
enum {
UCORE_CHAR_MAX = 0x10FFFF,
UCORE_ERR_MIN = 0x200000,
UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1
};
/* Information is not lost by encoding errors as integers. */
static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN);
/* On glibc platforms, predicates like c32isalnum and c32tolower
do the right thing for char32_t values that are not valid characters.
POSIX says the behavior is undefined, so play it safe elsewhere.
Do not rely on UCORE_C32_SAFE for c32width. */
#ifdef __GLIBC__
enum { UCORE_C32_SAFE = true };
#else
enum { UCORE_C32_SAFE = false };
#endif
#ifndef _GL_LIKELY
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
#endif
_GL_INLINE_HEADER_BEGIN
#ifndef UCORE_INLINE
# define UCORE_INLINE _GL_INLINE
#endif
/* Return true if C represents an encoding error, false otherwise. */
UCORE_INLINE bool
ucore_iserr (ucore_t c)
{
/* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c)
with GCC 13 x86-64. */
if (_GL_UNLIKELY (c & UCORE_ERR_MIN))
{
assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX);
return true;
}
else
{
assume (0 <= c && c <= UCORE_CHAR_MAX);
return false;
}
}
/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C). */
UCORE_INLINE bool
ucore_is (int (*p) (wint_t), wint_t c)
{
/* When C is out of range, predicates based on glibc return false.
Behavior is undefined on other platforms, so play it safe. */
return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c);
}
/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C). */
UCORE_INLINE wint_t
ucore_to (wint_t (*to) (wint_t), ucore_t c)
{
return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c;
}
/* Compare C1 and C2, with encoding errors sorting after characters.
Return <0, 0, >0 for <, =, >. */
UCORE_INLINE int
ucore_cmp (ucore_t c1, ucore_t c2)
{
return c1 - c2;
}
/* Apply the uchar translater TO to C1 and C2 and compare the results,
with encoding errors sorting after characters,
Return <0, 0, >0 for <, =, >. */
UCORE_INLINE int
ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2)
{
if (c1 == c2)
return 0;
int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2);
return i1 - i2;
}
_GL_INLINE_HEADER_END
#endif /* _MCEL_H */

132
src/io.c
View File

@ -230,6 +230,14 @@ slurp (struct file_data *current)
}
}
/* Return true if the pair (CH1, ERR1) denotes the same character or
   encoding error as the pair (CH2, ERR2), false otherwise.  */
static bool
same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
{
  bool same_ch = ch1 == ch2;
  bool same_err = err1 == err2;
  return same_ch & same_err;
}
/* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
one line from each input file) according to the command line options.
Line lengths include the trailing newline.
@ -427,7 +435,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
{
char const *lim1 = s1 + s1len;
char const *lim2 = s2 + s2len;
ucore_t c1prev = 0;
char32_t ch1prev = 0;
while (true)
{
@ -435,27 +443,27 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
mcel_t g2 = mcel_scan (t2, lim2);
t1 += g1.len;
t2 += g2.len;
ucore_t c1 = g1.c;
ucore_t c2 = g2.c;
char32_t ch1 = g1.ch;
char32_t ch2 = g2.ch;
/* Test for exact equality first, since it's a common case. */
if (ucore_cmp (c1, c2) != 0)
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
{
switch (ignore_white_space)
{
case IGNORE_ALL_SPACE:
/* For -w, just skip past any white space. */
while (c1 != '\n' && ! ucore_is (c32isspace, c1))
while (ch1 != '\n' && c32isspace (ch1))
{
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
c1 = g1.c;
ch1 = g1.ch;
}
while (c2 != '\n' && ucore_is (c32isspace, c2))
while (ch2 != '\n' && c32isspace (ch2))
{
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
c2 = g2.c;
ch2 = g2.ch;
}
break;
@ -463,48 +471,46 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
/* For -b, advance past any sequence of white space in
line 1 and consider it just one space, or nothing at
all if it is at the end of the line. */
if (ucore_is (c32isspace, c1))
while (c1 != '\n')
if (c32isspace (ch1))
while (ch1 != '\n')
{
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
c1 = g1.c;
if (! ucore_is (c32isspace, c1))
ch1 = g1.ch;
if (! c32isspace (ch1))
{
t1 -= g1.len;
c1 = ' ';
ch1 = ' ';
break;
}
}
/* Likewise for line 2. */
if (ucore_is (c32isspace, c2))
while (c2 != '\n')
if (c32isspace (ch2))
while (ch2 != '\n')
{
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
c2 = g2.c;
if (! ucore_is (c32isspace, c2))
ch2 = g2.ch;
if (! c32isspace (ch2))
{
t2 -= g2.len;
c2 = ' ';
ch2 = ' ';
break;
}
}
if (c1 != c2)
if (ch1 != ch2)
{
/* If we went too far when doing the simple test
for equality, go back to the first non-white-space
character in both sides and try again. */
if (c2 == ' ' && c1 != '\n'
&& ucore_is (c32isspace, c1prev))
if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
{
t1 -= g1.len;
continue;
}
if (c1 == ' ' && c2 != '\n'
&& ucore_is (c32isspace, c1prev))
if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
{
t2 -= g2.len;
continue;
@ -515,28 +521,28 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
case IGNORE_TRAILING_SPACE:
case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2))
if (c32isspace (ch1) && c32isspace (ch2))
{
if (c1 != '\n')
if (ch1 != '\n')
{
char const *p = t1;
while (*p != '\n')
{
mcel_t g = mcel_scan (p, lim1);
if (! ucore_is (c32isspace, g.c))
/* Stop scanning at the first character that is not white space.  */
if (! c32isspace (g.ch))
break;
p += g.len;
}
if (*p != '\n')
break;
}
if (c2 != '\n')
if (ch2 != '\n')
{
char const *p = t2;
while (*p != '\n')
{
mcel_t g = mcel_scan (p, lim2);
if (! ucore_is (c32isspace, g.c))
if (! c32isspace (g.ch))
break;
p += g.len;
}
@ -550,45 +556,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
break;
FALLTHROUGH;
case IGNORE_TAB_EXPANSION:
if ((c1 == ' ' && c2 == '\t')
|| (c1 == '\t' && c2 == ' '))
if ((ch1 == ' ' && ch2 == '\t')
|| (ch1 == '\t' && ch2 == ' '))
{
intmax_t tab2 = tab, column2 = column;
while (true)
{
if (c1 == '\t'
|| (c1 == ' ' && column == tabsize - 1))
if (ch1 == '\t'
|| (ch1 == ' ' && column == tabsize - 1))
{
tab++;
column = 0;
}
else if (c1 == ' ')
else if (ch1 == ' ')
column++;
else
break;
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
c1 = g1.c;
ch1 = g1.ch;
}
while (true)
{
if (c2 == '\t'
|| (c2 == ' ' && column2 == tabsize - 1))
if (ch2 == '\t'
|| (ch2 == ' ' && column2 == tabsize - 1))
{
tab2++;
column2 = 0;
}
else if (c2 == ' ')
else if (ch2 == ' ')
column2++;
else
break;
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
c2 = g2.c;
ch2 = g2.ch;
}
if (tab != tab2 || column != column2)
@ -602,15 +608,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
if (ignore_case)
{
c1 = ucore_to (c32tolower, c1);
c2 = ucore_to (c32tolower, c2);
ch1 = c32tolower (ch1);
ch2 = c32tolower (ch2);
}
if (ucore_cmp (c1, c2) != 0)
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
break;
}
switch (c1)
switch (ch1)
{
case '\n':
return false;
@ -634,7 +640,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
default:
/* Assume that downcasing does not change print width. */
column += ucore_iserr (c1) ? 1 : c32width (c1);
column += g1.err ? 1 : c32width (ch1);
if (column < tabsize)
break;
FALLTHROUGH;
@ -644,7 +650,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
break;
}
c1prev = c1;
ch1prev = ch1;
}
}
@ -698,8 +704,8 @@ find_and_hash_each_line (struct file_data *current)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mcel_scan (p, suffix_begin);
if (! ucore_is (c32isspace, g.c))
h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c));
if (! c32isspace (g.ch))
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
}
break;
@ -727,7 +733,7 @@ find_and_hash_each_line (struct file_data *current)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mcel_scan (p, suffix_begin);
if (ucore_is (c32isspace, g.c))
if (c32isspace (g.ch))
{
do
{
@ -736,13 +742,13 @@ find_and_hash_each_line (struct file_data *current)
goto hashing_done;
g = mcel_scan (p, suffix_begin);
}
while (ucore_is (c32isspace, g.c));
while (c32isspace (g.ch));
h = hash (h, ' ');
}
/* G is now the first non-space. */
h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c);
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
}
break;
@ -818,13 +824,17 @@ find_and_hash_each_line (struct file_data *current)
intmax_t repetitions = 1;
g = mcel_scan (p, suffix_begin);
ucore_t c = g.c;
if (ucore_iserr (c))
column++;
char32_t ch;
if (g.err)
{
ch = -g.err;
column++;
}
else
{
ch = g.ch;
if (ig_white_space & IGNORE_TRAILING_SPACE
&& ucore_is (c32isspace, c))
&& c32isspace (ch))
{
char const *p1 = p + g.len;
for (mcel_t g1; ; p1 += g1.len)
@ -835,13 +845,13 @@ find_and_hash_each_line (struct file_data *current)
goto hashing_done;
}
g1 = mcel_scan (p1, suffix_begin);
if (! ucore_is (c32isspace, g1.c))
if (! c32isspace (g1.ch))
break;
}
}
if (ig_white_space & IGNORE_TAB_EXPANSION)
switch (c)
switch (ch)
{
case '\b':
if (0 < column)
@ -854,7 +864,7 @@ find_and_hash_each_line (struct file_data *current)
break;
case '\t':
c = ' ';
ch = ' ';
repetitions = tabsize - column % tabsize;
tab += column / tabsize + 1;
column = 0;
@ -868,16 +878,16 @@ find_and_hash_each_line (struct file_data *current)
break;
default:
column += c32width (c);
column += c32width (ch);
break;
}
if (ig_case)
c = c32tolower (c);
ch = c32tolower (ch);
}
do
h = hash (h, c);
h = hash (h, ch);
while (--repetitions != 0);
}
}
@ -899,13 +909,13 @@ find_and_hash_each_line (struct file_data *current)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mcel_scan (p, suffix_begin);
h = hash (h, ucore_to (c32tolower, g.c));
h = hash (h, c32tolower (g.ch) - g.err);
}
else
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mcel_scan (p, suffix_begin);
h = hash (h, g.c);
h = hash (h, g.ch - g.err);
}
}
break;

View File

@ -146,7 +146,7 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
Increase TEXT_POINTER, counting columns.
Assume encoding errors have print width 1. */
mcel_t g = mcel_scan (tp0, text_limit);
int width = ucore_iserr (g.c) ? 1 : c32width (g.c);
int width = g.err ? 1 : c32width (g.ch);
if (0 < width && ckd_add (&in_position, in_position, width))
return out_position;