diff: simplify multi-byte code (mbcel -> mcel)

* lib/Makefile.am: Adjust to file renamings and additions.
* lib/mbcel.c, lib/mbcel.h: Split into two APIs, replacing with ...
* lib/mcel.c, lib/mcel.h, lib/ucore.c, lib/ucore.h: ... these new files.
* lib/mcel.h: Simplify by assuming ucore.h is included.
Check that bytes have 8 bits.
(MCEL_LEN_MAX, mcel_t, MCEL_INLINE, MCEL_ENCODING_ERROR_SHIFT)
(mcel_scan, mcel_scant, mcel_scanz, mcel_casecmp):
Rename from MBCEL_LEN_MAX, mbcel_t, MBCEL_INLINE,
MBCEL_ENCODING_ERROR_SHIFT, mbcel_scan, mbcel_scanz, mbcel_scant,
mbcel_casecmp.
(mcel_t): New member c, replacing old members ch and err.
All uses changed.
(MBCEL_UCHAR_FITS, MBCEL_UCHAR_EASILY_FITS): Remove.
All uses removed.  No longer needed now 8-bit bytes are assumed.
(MCEL_ENCODING_ERROR_SHIFT): Check that it matches UCORE_ERR_MIN.
(mcel_isbasic): New function.  Use it where appropriate.
(mbcel_cmp, mbcel_casecmp): Remove; replaced by ucore_cmp,
ucore_tocmp.  All uses changed.
* lib/mcel-casecmp.c: Rename from lib/mbcel-strcasecmp.c.
Include mcel.h instead of mbcel.h.
(mcel_casecmp): Rename from mbcel_strcasecmp.  All uses changed.
Assert that UCHAR_MAX <= INT_MAX, as POSIX requires,
and simplify code accordingly.  Use mcel rather than mbcel.
* lib/ucore.h: Include verify.h.
(ucore_t): New type.
(UCORE_CHAR_MAX, UCORE_ERR_MIN, UCORE_ERR_MAX, UCORE_C32_SAFE):
New constants.  Check that information is not lost by encoding
errors as integers; this is a weaker test than CHAR_BIT == 8.
(ucore_iserr, ucore_is, ucore_to): New functions.
(ucore_cmp, ucore_tocmp): New functions, replacing the old
mbcel_cmp, mbcel_casecmp.  All uses changed.
* src/dir.c, src/io.c, src/side.c: Use mcel rather than mbcel.
* src/io.c (same_ch_err): Remove.  All uses replaced by ucore_cmp.
This commit is contained in:
Paul Eggert 2023-08-15 10:26:37 -07:00
parent e016d12581
commit 12bcf0bd50
11 changed files with 483 additions and 389 deletions

View File

@ -29,7 +29,7 @@ noinst_HEADERS =
include gnulib.mk
noinst_HEADERS += cmpbuf.h diagnose.h mbcel.h
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mbcel.c mbcel-strcasecmp.c
noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c
AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)

View File

@ -1,3 +0,0 @@
#include <config.h>
#define MBCEL_INLINE _GL_EXTERN_INLINE
#include "mbcel.h"

View File

@ -1,266 +0,0 @@
/* Multi-byte characters, error encodings, and lengths
Copyright 2023 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Paul Eggert. */
/* The mbcel_scan function lets code iterate through an array of bytes,
supporting character encodings in practical use
more simply than using plain mbrtoc32.
Instead of this single-byte code:
char *p = ..., *lim = ...;
for (; p < lim; p++)
process (*p);
You can use this multi-byte code:
char *p = ..., *lim = ...;
for (mbcel_t g; p < lim; p += g.len)
{
g = mbcel_scan (p, lim);
process (g);
}
You can select from G using G.ch, G.err, and G.len.
The mbcel_scanz function is similar except it works with a
string of unknown length that is terminated with '\0'.
Instead of this single-byte code:
char *p = ...;
for (; *p; p++)
process (*p);
You can use this multi-byte code:
char *p = ...;
for (mbcel_t g; *p; p += g.len)
{
g = mbcel_scanz (p);
process (g);
}
mbcel_scant (P, TERMINATOR) is like mbcel_scanz (P) except the
string is terminated by TERMINATOR. The TERMINATORs '\0', '\r',
'\n', '.', '/' are safe, as they cannot be a part (even a trailing
byte) of a multi-byte character.
mbcel_cmp (G1, G2) and mbcel_casecmp (G1, G2) compare two mbcel_t
values lexicographically by character or by encoding byte value,
with encoding bytes sorting after characters. mbcel_casecmp
ignores case in characters. mbcel_strcasecmp compares two
null-terminated strings lexicographically.
Although ISO C and POSIX allow encodings that have shift states or
that can produce multiple characters from an indivisible byte sequence,
POSIX does not require support for these encodings,
they are not in practical use on GNUish platforms,
and omitting support for them simplifies the API. */
#ifndef _MBCEL_H
#define _MBCEL_H 1
/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
_GL_ATTRIBUTE_MAY_ALIAS. */
#if !_GL_CONFIG_H_INCLUDED
#error "Please include config.h first."
#endif
#include <limits.h>
#include <stddef.h>
#include <uchar.h>
/* The maximum multibyte character length supported on any platform.
This can be less than MB_LEN_MAX because many platforms have a
large MB_LEN_MAX to allow for stateful encodings, and mbcel does
not need to support these encodings. MBCEL_LEN_MAX is enough for
UTF-8, EUC, Shift-JIS, GB18030, etc.
0 < MB_CUR_MAX <= MBCEL_LEN_MAX <= MB_LEN_MAX. */
enum { MBCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
/* mbcel_t is a type representing a character CH or an encoding error byte ERR,
along with a count of the LEN bytes that represent CH or ERR.
If ERR is zero, CH is a valid character and 1 <= LEN <= MB_LEN_MAX;
otherwise ERR is an encoding error byte, 0x80 <= ERR <= UCHAR_MAX,
CH == 0, and LEN == 1. */
typedef struct
{
char32_t ch;
unsigned char err;
unsigned char len;
} mbcel_t;
/* On all known platforms, every multi-byte character length fits in
mbcel_t's LEN. Check this. */
static_assert (MB_LEN_MAX <= UCHAR_MAX);
/* Pacify GCC re '*p <= 0x7f' below. */
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
# pragma GCC diagnostic ignored "-Wtype-limits"
#endif
_GL_INLINE_HEADER_BEGIN
#ifndef MBCEL_INLINE
# define MBCEL_INLINE _GL_INLINE
#endif
/* With mbcel there should be no need for the performance overhead of
replacing glibc mbrtoc32, as callers shouldn't care whether the
C locale treats a byte with the high bit set as an encoding error. */
#ifdef __GLIBC__
# undef mbrtoc32
#endif
/* Shifting an encoding error byte (which must be at least 2**7)
left by 14 yields at least 2**21 (0x200000), which is greater
than the maximum Unicode value 0x10FFFF. This suffices to sort
encoding errors after characters. */
enum { MBCEL_ENCODING_ERROR_SHIFT = 14 };
/* In the typical case where unsigned char easily fits in int,
optimizations are possible. */
enum {
MBCEL_UCHAR_FITS = UCHAR_MAX <= INT_MAX,
MBCEL_UCHAR_EASILY_FITS = UCHAR_MAX <= INT_MAX >> MBCEL_ENCODING_ERROR_SHIFT
};
#ifndef _GL_LIKELY
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
#endif
/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM.
Return either the valid character starting at P,
or the encoding error of length 1 at P.
On success the result has .ch set to the character and .len set to
the number of bytes it occupies (.err is left zero).
On an encoding error the result has .err set to the byte *P and
.len set to 1 (.ch is left zero). */
MBCEL_INLINE mbcel_t
mbcel_scan (char const *p, char const *lim)
{
/* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
In supported encodings, the first byte of a multi-byte character
cannot be an ASCII byte. */
if (_GL_LIKELY (0 <= *p && *p <= 0x7f))
return (mbcel_t) { .ch = *p, .len = 1 };
/* An initial mbstate_t; initialization optimized for some platforms.
For details about these and other platforms, see wchar.in.h.
Every branch below makes the name 'mbs' usable as an mbstate_t
lvalue, either as a real variable or as a macro for a union member. */
#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
/* Although only a trivial optimization, it's worth it for GNU. */
mbstate_t mbs; mbs.__count = 0;
#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
|| (defined __APPLE__ && defined __MACH__))
/* These platforms have 128-byte mbstate_t. What were they thinking?
Initialize just for supported encodings (UTF-8, EUC, etc.).
Avoid memset because some compilers generate function call code. */
struct mbhidden { char32_t ch; int utf8_want, euc_want; }
_GL_ATTRIBUTE_MAY_ALIAS;
union { mbstate_t m; struct mbhidden s; } u;
u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
/* NOTE(review): no matching #undef for 'mbs' is visible in this header,
so the macro presumably stays defined for includers -- confirm. */
# define mbs u.m
#elif defined __NetBSD__
/* Experiments on both 32- and 64-bit NetBSD platforms have
shown that it doesn't work to clear fewer than 24 bytes. */
struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
union { mbstate_t m; struct mbhidden s; } u;
u.s.a = u.s.b = u.s.c = 0;
# define mbs u.m
#else
/* mbstate_t has unknown structure or is not worth optimizing. */
mbstate_t mbs = {0};
#endif
/* Decode one character from the LIM - P bytes starting at P,
beginning in the initial conversion state set up above. */
char32_t ch;
size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
/* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
is not supported and MB_LEN_MAX is small. */
if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
return (mbcel_t) { .err = *p, .len = 1 };
/* Tell the compiler LEN is at most MB_LEN_MAX,
as this can help GCC generate better code. */
if (! (len <= MB_LEN_MAX))
unreachable ();
/* A multi-byte character. LEN must be positive,
as *P != '\0' and shift sequences are not supported. */
return (mbcel_t) { .ch = ch, .len = len };
}
/* Scan the character at P, where P points into a byte sequence that
   is ended by TERMINATOR.
   If *P == TERMINATOR, scan just that byte; otherwise scan
   bytes up to but not including a TERMINATOR byte.
   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
   Return either the valid character starting at P,
   or the encoding error of length 1 at P.  */
MBCEL_INLINE mbcel_t
mbcel_scant (char const *p, char terminator)
{
  /* ASCII fast path: the terminator need not be looked for at all.  */
  char first = *p;
  if (_GL_LIKELY (0 <= first && first <= 0x7f))
    return (mbcel_t) { .ch = first, .len = 1 };

  /* Non-ASCII: work out the scan limit, then let mbcel_scan decode.
     END moves forward one byte per step but sticks at the first
     TERMINATOR, so at most MBCEL_LEN_MAX bytes starting at P are
     offered.  Adding the comparison result is typically branch-free
     and faster than memchr or strnlen.  */
  char const *end = p + 1;
  for (int left = MBCEL_LEN_MAX - 1; 0 < left; left--)
    end += *end != terminator;
  return mbcel_scan (p, end);
}
/* Scan the character at P, where P points into a '\0'-terminated
   byte sequence.
   If *P == '\0', scan just that byte; otherwise scan
   bytes up to but not including a '\0'.
   Return either the valid character starting at P,
   or the encoding error of length 1 at P.  */
MBCEL_INLINE mbcel_t
mbcel_scanz (char const *p)
{
  /* A null-terminated string is simply the TERMINATOR == '\0' case.  */
  mbcel_t g = mbcel_scant (p, '\0');
  return g;
}
/* Compare G1 and G2, with encoding errors sorting after characters.
   Return <0, 0, >0 for <, =, >.

   The difference of the error bytes is scaled by
   2**MBCEL_ENCODING_ERROR_SHIFT and added to the difference of the
   characters.  The scaling is done by multiplication rather than by
   '<<', because the error-byte difference can be negative and
   left-shifting a negative int is undefined behavior in ISO C.  */
MBCEL_INLINE int
mbcel_cmp (mbcel_t g1, mbcel_t g2)
{
  int c1 = g1.ch, c2 = g2.ch, e1 = g1.err, e2 = g2.err, ccmp = c1 - c2,
    ecmp = MBCEL_UCHAR_EASILY_FITS ? e1 - e2 : _GL_CMP (e1, e2);
  return ecmp * (1 << MBCEL_ENCODING_ERROR_SHIFT) + ccmp;
}
/* Compare G1 and G2 ignoring case, with encoding errors sorting after
   characters.  Return <0, 0, >0 for <, =, >.  */
MBCEL_INLINE int
mbcel_casecmp (mbcel_t g1, mbcel_t g2)
{
  int exact = mbcel_cmp (g1, g2);

  /* Case folding is needed only in the uncommon case where the two
     values differ and neither one is an encoding error.  */
  if (_GL_UNLIKELY (exact != 0 && ! (g1.err | g2.err)))
    {
      int lower1 = c32tolower (g1.ch);
      int lower2 = c32tolower (g2.ch);
      return lower1 - lower2;
    }

  return exact;
}
/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
Return <0, 0, >0 for <, =, >. Consider encoding errors to be
greater than characters and compare them byte by byte. */
int mbcel_strcasecmp (char const *s1, char const *s2);
_GL_INLINE_HEADER_END
#endif /* _MBCEL_H */

View File

@ -19,17 +19,13 @@
#include <config.h>
/* Specification. */
#include <mbcel.h>
#include <mcel.h>
#include <ctype.h>
#include <stdlib.h>
/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
Return <0, 0, >0 for <, =, >. Consider encoding errors to be
greater than characters and compare them byte by byte. */
int
mbcel_strcasecmp (char const *s1, char const *s2)
mcel_casecmp (char const *s1, char const *s2)
{
char const *p1 = s1;
char const *p2 = s2;
@ -39,14 +35,15 @@ mbcel_strcasecmp (char const *s1, char const *s2)
if (MB_CUR_MAX == 1)
while (true)
{
static_assert (UCHAR_MAX <= INT_MAX);
unsigned char c1 = *p1++;
unsigned char c2 = *p2++;
int cmp = MBCEL_UCHAR_FITS ? c1 - c2 : _GL_CMP (c1, c2);
int cmp = c1 - c2;
if (_GL_UNLIKELY (cmp))
{
c1 = tolower (c1);
c2 = tolower (c2);
cmp = MBCEL_UCHAR_FITS ? c1 - c2 : _GL_CMP (c1, c2);
cmp = c1 - c2;
}
if (_GL_UNLIKELY (cmp | !c1))
return cmp;
@ -54,10 +51,10 @@ mbcel_strcasecmp (char const *s1, char const *s2)
else
while (true)
{
mbcel_t g1 = mbcel_scanz (p1); p1 += g1.len;
mbcel_t g2 = mbcel_scanz (p2); p2 += g2.len;
int cmp = mbcel_casecmp (g1, g2);
if (_GL_UNLIKELY (cmp | ! (g1.ch | g1.err)))
mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
int cmp = ucore_tocmp (c32tolower, g1.c, g2.c);
if (_GL_UNLIKELY (cmp | !g1.c))
return cmp;
}
}

3
lib/mcel.c Normal file
View File

@ -0,0 +1,3 @@
#include <config.h>
#define MCEL_INLINE _GL_EXTERN_INLINE
#include "mcel.h"

236
lib/mcel.h Normal file
View File

@ -0,0 +1,236 @@
/* Multi-byte characters, Error encodings, and Lengths (MCELs)
Copyright 2023 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Paul Eggert. */
/* The mcel_scan function lets code iterate through an array of bytes,
supporting character encodings in practical use
more simply than using plain mbrtoc32.
Instead of this single-byte code:
char *p = ..., *lim = ...;
for (; p < lim; p++)
process (*p);
You can use this multi-byte code:
char *p = ..., *lim = ...;
for (mcel_t g; p < lim; p += g.len)
{
g = mcel_scan (p, lim);
process (g);
}
The mcel_scanz function is similar except it works with a
string of unknown length that is terminated with '\0'.
Instead of this single-byte code:
char *p = ...;
for (; *p; p++)
process (*p);
You can use this multi-byte code:
char *p = ...;
for (mcel_t g; *p; p += g.len)
{
g = mcel_scanz (p);
process (g);
}
mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
string is terminated by TERMINATOR. The TERMINATORs '\0', '\r',
'\n', '.', '/' are safe, as they cannot be a part (even a trailing
byte) of a multi-byte character.
You can select from G using G.c and G.len.
You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c),
ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c).
mcel_casecmp compares two null-terminated multi-byte strings
lexicographically, ignoring case.
Although ISO C and POSIX allow encodings that have shift states or
that can produce multiple characters from an indivisible byte sequence,
POSIX does not require support for these encodings,
they are not in practical use on GNUish platforms,
and omitting support for them simplifies the API. */
#ifndef _MCEL_H
#define _MCEL_H 1
/* This API is an extension of ucore.h. Programs that include this
file can assume ucore.h is included too. */
#include <ucore.h>
/* The maximum multi-byte character length supported on any platform.
This can be less than MB_LEN_MAX because many platforms have a
large MB_LEN_MAX to allow for stateful encodings, and mcel does not
support these encodings. MCEL_LEN_MAX is enough for UTF-8, EUC,
Shift-JIS, GB18030, etc. In all multi-byte encodings supported by glibc,
0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. */
enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
/* mcel_t is a type representing a character or encoding error C,
along with a count of the LEN bytes that represent C.
1 <= LEN <= MB_LEN_MAX. */
typedef struct
{
/* The character or encoding error; ucore_iserr (c) tells which. */
ucore_t c;
/* Number of bytes that represent C; mcel_scan uses 1 for an
encoding error. */
unsigned char len;
} mcel_t;
/* Every multi-byte character length fits in mcel_t's LEN. */
static_assert (MB_LEN_MAX <= UCHAR_MAX);
/* Bytes have 8 bits, as POSIX requires. */
static_assert (CHAR_BIT == 8);
/* Pacify GCC re 'c <= 0x7f' below. */
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
# pragma GCC diagnostic ignored "-Wtype-limits"
#endif
_GL_INLINE_HEADER_BEGIN
#ifndef MCEL_INLINE
# define MCEL_INLINE _GL_INLINE
#endif
/* With mcel there should be no need for the performance overhead of
replacing glibc mbrtoc32, as callers shouldn't care whether the
C locale treats a byte with the high bit set as an encoding error. */
#ifdef __GLIBC__
# undef mbrtoc32
#endif
/* Shifting an encoding error byte (at least 0x80) left by this value
yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
This suffices to sort encoding errors after characters. */
enum { MCEL_ENCODING_ERROR_SHIFT = 14 };
static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
/* Whether C represents itself as a Unicode character
   when it is the first byte of a single- or multi-byte character.
   This is true exactly for the bytes 0x00 through 0x7f.
   These days it is safe to assume ASCII, so do not support
   obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS.  */
MCEL_INLINE bool
mcel_isbasic (char c)
{
  /* Viewed as an unsigned byte, a negative char lands at 0x80 or above,
     so one comparison covers both signed and unsigned plain char.  */
  unsigned char byte = c;
  return byte < 0x80;
}
/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM.
Return the character or encoding error starting at P.
For a character, the result's .c is the character and .len is the
number of bytes it occupies. For an encoding error, .c is the byte *P
shifted left by MCEL_ENCODING_ERROR_SHIFT and .len is 1. */
MCEL_INLINE mcel_t
mcel_scan (char const *p, char const *lim)
{
/* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
In supported encodings, the first byte of a multi-byte character
cannot be an ASCII byte. */
if (_GL_LIKELY (mcel_isbasic (*p)))
return (mcel_t) { .c = *p, .len = 1 };
/* An initial mbstate_t; initialization optimized for some platforms.
For details about these and other platforms, see wchar.in.h.
Every branch below makes the name 'mbs' usable as an mbstate_t
lvalue, either as a real variable or as a macro for a union member. */
#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
/* Although only a trivial optimization, it's worth it for GNU. */
mbstate_t mbs; mbs.__count = 0;
#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
|| (defined __APPLE__ && defined __MACH__))
/* These platforms have 128-byte mbstate_t. What were they thinking?
Initialize just for supported encodings (UTF-8, EUC, etc.).
Avoid memset because some compilers generate function call code. */
struct mbhidden { char32_t ch; int utf8_want, euc_want; }
_GL_ATTRIBUTE_MAY_ALIAS;
union { mbstate_t m; struct mbhidden s; } u;
u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
/* NOTE(review): no matching #undef for 'mbs' is visible in this header,
so the macro presumably stays defined for includers -- confirm. */
# define mbs u.m
#elif defined __NetBSD__
/* Experiments on both 32- and 64-bit NetBSD platforms have
shown that it doesn't work to clear fewer than 24 bytes. */
struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
union { mbstate_t m; struct mbhidden s; } u;
u.s.a = u.s.b = u.s.c = 0;
# define mbs u.m
#else
/* mbstate_t has unknown structure or is not worth optimizing. */
mbstate_t mbs = {0};
#endif
/* Decode one character from the LIM - P bytes starting at P,
beginning in the initial conversion state set up above. */
char32_t c;
size_t len = mbrtoc32 (&c, p, lim - p, &mbs);
/* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
is not supported and MB_LEN_MAX is small. */
if (_GL_LIKELY (len <= (size_t) -1 / 2))
{
/* A multi-byte character. LEN must be positive,
as *P != '\0' and shift sequences are not supported. */
assume (0 < len);
assume (len <= MB_LEN_MAX);
assume (c <= UCORE_CHAR_MAX);
return (mcel_t) { .c = c, .len = len };
}
else
{
/* An encoding error. Go through unsigned char so that the byte
value is nonnegative before it is shifted. */
unsigned char b = *p;
c = b << MCEL_ENCODING_ERROR_SHIFT;
assume (UCORE_ERR_MIN <= c);
assume (c <= UCORE_ERR_MAX);
return (mcel_t) { .c = c, .len = 1 };
}
}
/* Scan the character at P, where P points into a byte sequence that
   is ended by TERMINATOR.
   If *P == TERMINATOR, scan just that byte; otherwise scan
   bytes up to but not including TERMINATOR.
   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
   Return the character or encoding error starting at P.  */
MCEL_INLINE mcel_t
mcel_scant (char const *p, char terminator)
{
  /* ASCII fast path: the terminator need not be looked for at all.  */
  char first = *p;
  if (_GL_LIKELY (mcel_isbasic (first)))
    return (mcel_t) { .c = first, .len = 1 };

  /* Non-ASCII: work out the scan limit, then let mcel_scan decode.
     END moves forward one byte per step but sticks at the first
     TERMINATOR, so at most MCEL_LEN_MAX bytes starting at P are
     offered.  Adding the comparison result is typically branch-free
     and faster than memchr or strnlen.  */
  char const *end = p + 1;
  for (int left = MCEL_LEN_MAX - 1; 0 < left; left--)
    end += *end != terminator;
  return mcel_scan (p, end);
}
/* Scan the character at P, where P points into a '\0'-terminated
   byte sequence.
   If *P == '\0', scan just that byte; otherwise scan
   bytes up to but not including '\0'.
   Return the character or encoding error starting at P.  */
MCEL_INLINE mcel_t
mcel_scanz (char const *p)
{
  /* A null-terminated string is simply the TERMINATOR == '\0' case.  */
  mcel_t g = mcel_scant (p, '\0');
  return g;
}
/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
Return <0, 0, >0 for <, =, >. Consider encoding errors to be
greater than characters and compare them byte by byte. */
int mcel_casecmp (char const *s1, char const *s2);
_GL_INLINE_HEADER_END
#endif /* _MCEL_H */

3
lib/ucore.c Normal file
View File

@ -0,0 +1,3 @@
#include <config.h>
#define UCORE_INLINE _GL_EXTERN_INLINE
#include "ucore.h"

132
lib/ucore.h Normal file
View File

@ -0,0 +1,132 @@
/* Unicode Characters OR Encoding errors (UCOREs)
Copyright 2023 Free Software Foundation, Inc.
This file is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 3 of the
License, or (at your option) any later version.
This file is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>. */
/* Written by Paul Eggert. */
/* This API's fundamental type ucore_t represents
a single Unicode character or an encoding error.
ucore_iserr (C) tests whether C is an encoding error.
ucore_is (P, C) etc. test whether char class P accepts C.
ucore_to (TO, C) etc. use TO to convert C.
ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2,
with encoding errors sorting after characters. */
#ifndef _UCORE_H
#define _UCORE_H 1
#if !_GL_CONFIG_H_INCLUDED
#error "Please include config.h first."
#endif
#include <verify.h>
#include <limits.h>
#include <stddef.h>
#include <uchar.h>
/* ucore_t represents a Unicode Character OR Encoding error.
If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character.
If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error.
Other ucore_t values C are invalid. */
typedef int ucore_t;
enum {
/* Largest value that represents a Unicode character. */
UCORE_CHAR_MAX = 0x10FFFF,
/* Smallest value that represents an encoding error. It has a single
bit set, which is the bit that ucore_iserr tests. */
UCORE_ERR_MIN = 0x200000,
/* Largest value that represents an encoding error. */
UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1
};
/* Information is not lost by encoding errors as integers. */
static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN);
/* On glibc platforms, predicates like c32isalnum and c32tolower
do the right thing for char32_t values that are not valid characters.
POSIX says the behavior is undefined, so play it safe elsewhere.
Do not rely on UCORE_C32_SAFE for c32width. */
#ifdef __GLIBC__
enum { UCORE_C32_SAFE = true };
#else
enum { UCORE_C32_SAFE = false };
#endif
#ifndef _GL_LIKELY
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
#endif
_GL_INLINE_HEADER_BEGIN
#ifndef UCORE_INLINE
# define UCORE_INLINE _GL_INLINE
#endif
/* Return true if C represents an encoding error, false otherwise.
   C must be a valid ucore_t value.  */
UCORE_INLINE bool
ucore_iserr (ucore_t c)
{
  /* Testing the UCORE_ERR_MIN bit is a bit cheaper than comparing
     (UCORE_ERR_MIN <= c) with GCC 13 x86-64.  */
  bool err = (c & UCORE_ERR_MIN) != 0;

  /* Either way, tell the compiler which range C must lie in.  */
  if (_GL_UNLIKELY (err))
    assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX);
  else
    assume (0 <= c && c <= UCORE_CHAR_MAX);

  return err;
}
/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C). */
UCORE_INLINE bool
ucore_is (int (*p) (wint_t), wint_t c)
{
/* When C is out of range, predicates based on glibc return false.
Behavior is undefined on other platforms, so play it safe. */
return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c);
}
/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C).
   An encoding error is returned unchanged on platforms where the
   translators are not known to be safe for such values.  */
UCORE_INLINE wint_t
ucore_to (wint_t (*to) (wint_t), ucore_t c)
{
  /* Leave encoding errors alone unless TO is known to cope with them.  */
  if (! UCORE_C32_SAFE && ucore_iserr (c))
    return c;

  return to (c);
}
/* Compare C1 and C2, with encoding errors sorting after characters.
   Return <0, 0, >0 for <, =, >.  */
UCORE_INLINE int
ucore_cmp (ucore_t c1, ucore_t c2)
{
  /* Valid ucore_t values lie in 0 .. UCORE_ERR_MAX, so the difference
     fits in int, and errors (>= UCORE_ERR_MIN) exceed every character.  */
  int diff = c1 - c2;
  return diff;
}
/* Apply the uchar translator TO to C1 and C2 and compare the results,
   with encoding errors sorting after characters.
   Return <0, 0, >0 for <, =, >.  */
UCORE_INLINE int
ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2)
{
  /* Translation is needed only when the two values differ.  */
  if (c1 != c2)
    {
      int t1 = ucore_to (to, c1);
      int t2 = ucore_to (to, c2);
      return t1 - t2;
    }

  return 0;
}
_GL_INLINE_HEADER_END
#endif /* _UCORE_H */

View File

@ -25,7 +25,7 @@
#include <error.h>
#include <exclude.h>
#include <filenamecat.h>
#include <mbcel.h>
#include <mcel.h>
#include <quote.h>
#include <setjmp.h>
#include <xalloc.h>
@ -189,7 +189,7 @@ compare_collated (char const *name1, char const *name2)
{
int r;
if (ignore_file_name_case)
r = mbcel_strcasecmp (name1, name2); /* Best we can do. */
r = mcel_casecmp (name1, name2); /* Best we can do. */
else
{
errno = 0;

194
src/io.c
View File

@ -23,7 +23,7 @@
#include <cmpbuf.h>
#include <file-type.h>
#include <ialloc.h>
#include <mbcel.h>
#include <mcel.h>
#include <xalloc.h>
#include <uchar.h>
@ -230,14 +230,6 @@ slurp (struct file_data *current)
}
}
/* Return true if CH1 and ERR1 stand for the same character or
encoding error as CH2 and ERR2. */
static bool
same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
{
return ! ((ch1 ^ ch2) | (err1 ^ err2));
}
/* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
one line from each input file) according to the command line options.
Line lengths include the trailing newline.
@ -435,35 +427,35 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
{
char const *lim1 = s1 + s1len;
char const *lim2 = s2 + s2len;
char32_t ch1prev = 0;
ucore_t c1prev = 0;
while (true)
{
mbcel_t g1 = mbcel_scan (t1, lim1);
mbcel_t g2 = mbcel_scan (t2, lim2);
mcel_t g1 = mcel_scan (t1, lim1);
mcel_t g2 = mcel_scan (t2, lim2);
t1 += g1.len;
t2 += g2.len;
char32_t ch1 = g1.ch;
char32_t ch2 = g2.ch;
ucore_t c1 = g1.c;
ucore_t c2 = g2.c;
/* Test for exact equality first, since it's a common case. */
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
if (ucore_cmp (c1, c2) != 0)
{
switch (ignore_white_space)
{
case IGNORE_ALL_SPACE:
/* For -w, just skip past any white space. */
while (ch1 != '\n' && c32isspace (ch1))
/* Skip white space in line 1, but never step past its newline. */
while (c1 != '\n' && ucore_is (c32isspace, c1))
{
g1 = mbcel_scan (t1, lim1);
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
ch1 = g1.ch;
c1 = g1.c;
}
while (ch2 != '\n' && c32isspace (ch2))
while (c2 != '\n' && ucore_is (c32isspace, c2))
{
g2 = mbcel_scan (t2, lim2);
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
ch2 = g2.ch;
c2 = g2.c;
}
break;
@ -471,46 +463,48 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
/* For -b, advance past any sequence of white space in
line 1 and consider it just one space, or nothing at
all if it is at the end of the line. */
if (c32isspace (ch1))
while (ch1 != '\n')
if (ucore_is (c32isspace, c1))
while (c1 != '\n')
{
g1 = mbcel_scan (t1, lim1);
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
ch1 = g1.ch;
if (! c32isspace (ch1))
c1 = g1.c;
if (! ucore_is (c32isspace, c1))
{
t1 -= g1.len;
ch1 = ' ';
c1 = ' ';
break;
}
}
/* Likewise for line 2. */
if (c32isspace (ch2))
while (ch2 != '\n')
if (ucore_is (c32isspace, c2))
while (c2 != '\n')
{
g2 = mbcel_scan (t2, lim2);
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
ch2 = g2.ch;
if (! c32isspace (ch2))
c2 = g2.c;
if (! ucore_is (c32isspace, c2))
{
t2 -= g2.len;
ch2 = ' ';
c2 = ' ';
break;
}
}
if (ch1 != ch2)
if (c1 != c2)
{
/* If we went too far when doing the simple test
for equality, go back to the first non-white-space
character in both sides and try again. */
if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
if (c2 == ' ' && c1 != '\n'
&& ucore_is (c32isspace, c1prev))
{
t1 -= g1.len;
continue;
}
if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
if (c1 == ' ' && c2 != '\n'
&& ucore_is (c32isspace, c1prev))
{
t2 -= g2.len;
continue;
@ -521,30 +515,32 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
case IGNORE_TRAILING_SPACE:
case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
if (c32isspace (ch1) && c32isspace (ch2))
if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2))
{
if (ch1 != '\n')
if (c1 != '\n')
{
mbcel_t g;
for (char const *p = t1; ; p += g.len)
char const *p = t1;
while (*p != '\n')
{
g = mbcel_scan (p, lim1);
if (g.ch == '\n' || ! c32isspace (g.ch))
mcel_t g = mcel_scan (p, lim1);
if (! ucore_is (c32isspace, g.c))
break;
p += g.len;
}
if (g.ch != '\n')
if (*p != '\n')
break;
}
if (ch2 != '\n')
if (c2 != '\n')
{
mbcel_t g;
for (char const *p = t2; ; p += g.len)
char const *p = t2;
while (*p != '\n')
{
g = mbcel_scan (p, lim2);
if (g.ch == '\n' || ! c32isspace (g.ch))
mcel_t g = mcel_scan (p, lim2);
if (! ucore_is (c32isspace, g.c))
break;
p += g.len;
}
if (g.ch != '\n')
if (*p != '\n')
break;
}
/* Both lines have nothing but whitespace left. */
@ -554,45 +550,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
break;
FALLTHROUGH;
case IGNORE_TAB_EXPANSION:
if ((ch1 == ' ' && ch2 == '\t')
|| (ch1 == '\t' && ch2 == ' '))
if ((c1 == ' ' && c2 == '\t')
|| (c1 == '\t' && c2 == ' '))
{
intmax_t tab2 = tab, column2 = column;
while (true)
{
if (ch1 == '\t'
|| (ch1 == ' ' && column == tabsize - 1))
if (c1 == '\t'
|| (c1 == ' ' && column == tabsize - 1))
{
tab++;
column = 0;
}
else if (ch1 == ' ')
else if (c1 == ' ')
column++;
else
break;
g1 = mbcel_scan (t1, lim1);
g1 = mcel_scan (t1, lim1);
t1 += g1.len;
ch1 = g1.ch;
c1 = g1.c;
}
while (true)
{
if (ch2 == '\t'
|| (ch2 == ' ' && column2 == tabsize - 1))
if (c2 == '\t'
|| (c2 == ' ' && column2 == tabsize - 1))
{
tab2++;
column2 = 0;
}
else if (ch2 == ' ')
else if (c2 == ' ')
column2++;
else
break;
g2 = mbcel_scan (t2, lim2);
g2 = mcel_scan (t2, lim2);
t2 += g2.len;
ch2 = g2.ch;
c2 = g2.c;
}
if (tab != tab2 || column != column2)
@ -606,15 +602,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
if (ignore_case)
{
ch1 = c32tolower (ch1);
ch2 = c32tolower (ch2);
c1 = ucore_to (c32tolower, c1);
c2 = ucore_to (c32tolower, c2);
}
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
if (ucore_cmp (c1, c2) != 0)
break;
}
switch (ch1)
switch (c1)
{
case '\n':
return false;
@ -638,7 +634,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
default:
/* Assume that downcasing does not change print width. */
column += g1.err ? 1 : c32width (ch1);
column += ucore_iserr (c1) ? 1 : c32width (c1);
if (column < tabsize)
break;
FALLTHROUGH;
@ -648,7 +644,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
break;
}
ch1prev = ch1;
c1prev = c1;
}
}
@ -699,11 +695,11 @@ find_and_hash_each_line (struct file_data *current)
h = hash (h, ig_case ? tolower (c) : c);
}
else
for (mbcel_t g; *p != '\n'; p += g.len)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mbcel_scan (p, suffix_begin);
if (! c32isspace (g.ch))
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
g = mcel_scan (p, suffix_begin);
if (! ucore_is (c32isspace, g.c))
h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c));
}
break;
@ -728,25 +724,25 @@ find_and_hash_each_line (struct file_data *current)
h = hash (h, ig_case ? tolower (c) : c);
}
else
for (mbcel_t g; *p != '\n'; p += g.len)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mbcel_scan (p, suffix_begin);
if (c32isspace (g.ch))
g = mcel_scan (p, suffix_begin);
if (ucore_is (c32isspace, g.c))
{
do
{
p += g.len;
if (*p == '\n')
goto hashing_done;
g = mbcel_scan (p, suffix_begin);
g = mcel_scan (p, suffix_begin);
}
while (c32isspace (g.ch));
while (ucore_is (c32isspace, g.c));
h = hash (h, ' ');
}
/* G is now the first non-space. */
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c);
}
break;
@ -817,39 +813,35 @@ find_and_hash_each_line (struct file_data *current)
while (--repetitions != 0);
}
else
for (mbcel_t g; *p != '\n'; p += g.len)
for (mcel_t g; *p != '\n'; p += g.len)
{
intmax_t repetitions = 1;
g = mbcel_scan (p, suffix_begin);
char32_t ch;
if (g.err)
{
ch = -g.err;
column++;
}
g = mcel_scan (p, suffix_begin);
ucore_t c = g.c;
if (ucore_iserr (c))
column++;
else
{
ch = g.ch;
if (ig_white_space & IGNORE_TRAILING_SPACE
&& c32isspace (ch))
&& ucore_is (c32isspace, c))
{
char const *p1 = p + g.len;
for (mbcel_t g1; ; p1 += g1.len)
for (mcel_t g1; ; p1 += g1.len)
{
if (*p1 == '\n')
{
p = p1;
goto hashing_done;
}
g1 = mbcel_scan (p1, suffix_begin);
if (! c32isspace (g1.ch))
g1 = mcel_scan (p1, suffix_begin);
if (! ucore_is (c32isspace, g1.c))
break;
}
}
if (ig_white_space & IGNORE_TAB_EXPANSION)
switch (ch)
switch (c)
{
case '\b':
if (0 < column)
@ -862,7 +854,7 @@ find_and_hash_each_line (struct file_data *current)
break;
case '\t':
ch = ' ';
c = ' ';
repetitions = tabsize - column % tabsize;
tab += column / tabsize + 1;
column = 0;
@ -876,16 +868,16 @@ find_and_hash_each_line (struct file_data *current)
break;
default:
column += c32width (ch);
column += c32width (c);
break;
}
if (ig_case)
ch = c32tolower (ch);
c = c32tolower (c);
}
do
h = hash (h, ch);
h = hash (h, c);
while (--repetitions != 0);
}
}
@ -904,16 +896,16 @@ find_and_hash_each_line (struct file_data *current)
else
{
if (ig_case)
for (mbcel_t g; *p != '\n'; p += g.len)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mbcel_scan (p, suffix_begin);
h = hash (h, c32tolower (g.ch) - g.err);
g = mcel_scan (p, suffix_begin);
h = hash (h, ucore_to (c32tolower, g.c));
}
else
for (mbcel_t g; *p != '\n'; p += g.len)
for (mcel_t g; *p != '\n'; p += g.len)
{
g = mbcel_scan (p, suffix_begin);
h = hash (h, g.ch - g.err);
g = mcel_scan (p, suffix_begin);
h = hash (h, g.c);
}
}
break;

View File

@ -22,7 +22,7 @@
#include "diff.h"
#include <mbcel.h>
#include <mcel.h>
static void print_sdiff_common_lines (lin, lin);
static void print_sdiff_hunk (struct change *);
@ -145,8 +145,8 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
/* A byte that might start a multibyte character.
Increase TEXT_POINTER, counting columns.
Assume encoding errors have print width 1. */
mbcel_t g = mbcel_scan (tp0, text_limit);
int width = g.err ? 1 : c32width (g.ch);
mcel_t g = mcel_scan (tp0, text_limit);
int width = ucore_iserr (g.c) ? 1 : c32width (g.c);
if (0 < width && ckd_add (&in_position, in_position, width))
return out_position;