diff --git a/lib/Makefile.am b/lib/Makefile.am index 245d57a..a5a880d 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -29,7 +29,7 @@ noinst_HEADERS = include gnulib.mk -noinst_HEADERS += cmpbuf.h diagnose.h mbcel.h -libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mbcel.c mbcel-strcasecmp.c +noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h +libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS) diff --git a/lib/mbcel.c b/lib/mbcel.c deleted file mode 100644 index 8640c97..0000000 --- a/lib/mbcel.c +++ /dev/null @@ -1,3 +0,0 @@ -#include -#define MBCEL_INLINE _GL_EXTERN_INLINE -#include "mbcel.h" diff --git a/lib/mbcel.h b/lib/mbcel.h deleted file mode 100644 index 001c1c0..0000000 --- a/lib/mbcel.h +++ /dev/null @@ -1,266 +0,0 @@ -/* Multi-byte characters, error encodings, and lengths - Copyright 2023 Free Software Foundation, Inc. - - This file is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 3 of the - License, or (at your option) any later version. - - This file is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see . */ - -/* Written by Paul Eggert. */ - -/* The mbcel_scan function lets code iterate through an array of bytes, - supporting character encodings in practical use - more simply than using plain mbrtoc32. 
- - Instead of this single-byte code: - - char *p = ..., *lim = ...; - for (; p < lim; p++) - process (*p); - - You can use this multi-byte code: - - char *p = ..., *lim = ...; - for (mbcel_t g; p < lim; p += g.len) - { - g = mbcel_scan (p, lim); - process (g); - } - - You can select from G using G.ch, G.err, and G.len. - - The mbcel_scanz function is similar except it works with a - string of unknown length that is terminated with '\0'. - Instead of this single-byte code: - - char *p = ...; - for (; *p; p++) - process (*p); - - You can use this multi-byte code: - - char *p = ...; - for (mbcel_t g; *p; p += g.len) - { - g = mbcel_scanz (p); - process (g); - } - - mbcel_scant (P, TERMINATOR) is like mbcel_scanz (P) except the - string is terminated by TERMINATOR. The TERMINATORs '\0', '\r', - '\n', '.', '/' are safe, as they cannot be a part (even a trailing - byte) of a multi-byte character. - - mbcel_cmp (G1, G2) and mbcel_casecmp (G1, G2) compare two mbcel_t - values lexicographically by character or by encoding byte value, - with encoding bytes sorting after characters. mbcel_casecmp - ignores case in characters. mbcel_strcasecmp compares two - null-terminated strings lexicographically. - - Although ISO C and POSIX allow encodings that have shift states or - that can produce multiple characters from an indivisible byte sequence, - POSIX does not require support for these encodings, - they are not in practical use on GNUish platforms, - and omitting support for them simplifies the API. */ - -#ifndef _MBCEL_H -#define _MBCEL_H 1 - -/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE, - _GL_ATTRIBUTE_MAY_ALIAS. */ -#if !_GL_CONFIG_H_INCLUDED - #error "Please include config.h first." -#endif - -#include -#include -#include - -/* The maximum multibyte character length supported on any platform. - This can be less than MB_LEN_MAX because many platforms have a - large MB_LEN_MAX to allow for stateful encodings, and mbcel does - not need to support these encodings. 
MBCEL_LEN_MAX is enough for - UTF-8, EUC, Shift-JIS, GB18030, etc. - 0 < MB_CUR_MAX <= MBCEL_LEN_MAX <= MB_LEN_MAX. */ -enum { MBCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 }; - -/* mbcel_t is a type representing a character CH or an encoding error byte ERR, - along with a count of the LEN bytes that represent CH or ERR. - If ERR is zero, CH is a valid character and 1 <= LEN <= MB_LEN_MAX; - otherwise ERR is an encoding error byte, 0x80 <= ERR <= UCHAR_MAX, - CH == 0, and LEN == 1. */ -typedef struct -{ - char32_t ch; - unsigned char err; - unsigned char len; -} mbcel_t; - -/* On all known platforms, every multi-byte character length fits in - mbcel_t's LEN. Check this. */ -static_assert (MB_LEN_MAX <= UCHAR_MAX); - -/* Pacify GCC re '*p <= 0x7f' below. */ -#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__) -# pragma GCC diagnostic ignored "-Wtype-limits" -#endif - -_GL_INLINE_HEADER_BEGIN -#ifndef MBCEL_INLINE -# define MBCEL_INLINE _GL_INLINE -#endif - -/* With mbcel there should be no need for the performance overhead of - replacing glibc mbrtoc32, as callers shouldn't care whether the - C locale treats a byte with the high bit set as an encoding error. */ -#ifdef __GLIBC__ -# undef mbrtoc32 -#endif - -/* Shifting an encoding error byte (which must be at least 2**7) - left by 14 yields at least 2**21 (0x200000), which is greater - than the maximum Unicode value 0x10FFFF. This suffices to sort - encoding errors after characters. */ -enum { MBCEL_ENCODING_ERROR_SHIFT = 14 }; - -/* In the typical case where unsigned char easily fits in int, - optimizations are possible. */ -enum { - MBCEL_UCHAR_FITS = UCHAR_MAX <= INT_MAX, - MBCEL_UCHAR_EASILY_FITS = UCHAR_MAX <= INT_MAX >> MBCEL_ENCODING_ERROR_SHIFT -}; - -#ifndef _GL_LIKELY -/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. 
*/ -# define _GL_LIKELY(cond) __builtin_expect ((cond), 1) -# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0) -#endif - -/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM. - Return either the valid character starting at P, - or the encoding error of length 1 at P. */ -MBCEL_INLINE mbcel_t -mbcel_scan (char const *p, char const *lim) -{ - /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32. - In supported encodings, the first byte of a multi-byte character - cannot be an ASCII byte. */ - if (_GL_LIKELY (0 <= *p && *p <= 0x7f)) - return (mbcel_t) { .ch = *p, .len = 1 }; - - /* An initial mbstate_t; initialization optimized for some platforms. - For details about these and other platforms, see wchar.in.h. */ -#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__) - /* Although only a trivial optimization, it's worth it for GNU. */ - mbstate_t mbs; mbs.__count = 0; -#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \ - || (defined __APPLE__ && defined __MACH__)) - /* These platforms have 128-byte mbstate_t. What were they thinking? - Initialize just for supported encodings (UTF-8, EUC, etc.). - Avoid memset because some compilers generate function call code. */ - struct mbhidden { char32_t ch; int utf8_want, euc_want; } - _GL_ATTRIBUTE_MAY_ALIAS; - union { mbstate_t m; struct mbhidden s; } u; - u.s.ch = u.s.utf8_want = u.s.euc_want = 0; -# define mbs u.m -#elif defined __NetBSD__ - /* Experiments on both 32- and 64-bit NetBSD platforms have - shown that it doesn't work to clear fewer than 24 bytes. */ - struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS; - union { mbstate_t m; struct mbhidden s; } u; - u.s.a = u.s.b = u.s.c = 0; -# define mbs u.m -#else - /* mbstate_t has unknown structure or is not worth optimizing. 
*/ - mbstate_t mbs = {0}; -#endif - - char32_t ch; - size_t len = mbrtoc32 (&ch, p, lim - p, &mbs); - - /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3 - is not supported and MB_LEN_MAX is small. */ - if (_GL_UNLIKELY ((size_t) -1 / 2 < len)) - return (mbcel_t) { .err = *p, .len = 1 }; - - /* Tell the compiler LEN is at most MB_LEN_MAX, - as this can help GCC generate better code. */ - if (! (len <= MB_LEN_MAX)) - unreachable (); - - /* A multi-byte character. LEN must be positive, - as *P != '\0' and shift sequences are not supported. */ - return (mbcel_t) { .ch = ch, .len = len }; -} - -/* Scan bytes from P, a byte sequence terminated by TERMINATOR. - If *P == TERMINATOR, scan just that byte; otherwise scan - bytes up to but not including a TERMINATOR byte. - TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'. - Return either the valid character starting at P, - or the encoding error of length 1 at P. */ -MBCEL_INLINE mbcel_t -mbcel_scant (char const *p, char terminator) -{ - /* Handle ASCII quickly for speed. */ - if (_GL_LIKELY (0 <= *p && *p <= 0x7f)) - return (mbcel_t) { .ch = *p, .len = 1 }; - - /* Defer to mbcel_scan for non-ASCII. Compute length with code that - is typically branch-free and faster than memchr or strnlen. */ - char const *lim = p + 1; - for (int i = 0; i < MBCEL_LEN_MAX - 1; i++) - lim += *lim != terminator; - return mbcel_scan (p, lim); -} - -/* Scan bytes from P, a byte sequence terminated by '\0'. - If *P == '\0', scan just that byte; otherwise scan - bytes up to but not including a '\0'. - Return either the valid character starting at P, - or the encoding error of length 1 at P. */ -MBCEL_INLINE mbcel_t -mbcel_scanz (char const *p) -{ - return mbcel_scant (p, '\0'); -} - -/* Compare G1 and G2, with encoding errors sorting after characters. - Return <0, 0, >0 for <, =, >. 
*/ -MBCEL_INLINE int -mbcel_cmp (mbcel_t g1, mbcel_t g2) -{ - int c1 = g1.ch, c2 = g2.ch, e1 = g1.err, e2 = g2.err, ccmp = c1 - c2, - ecmp = MBCEL_UCHAR_EASILY_FITS ? e1 - e2 : _GL_CMP (e1, e2); - return (ecmp << MBCEL_ENCODING_ERROR_SHIFT) + ccmp; -} - -/* Compare G1 and G2 ignoring case, with encoding errors sorting after - characters. Return <0, 0, >0 for <, =, >. */ -MBCEL_INLINE int -mbcel_casecmp (mbcel_t g1, mbcel_t g2) -{ - int cmp = mbcel_cmp (g1, g2); - if (_GL_LIKELY (g1.err | g2.err | !cmp)) - return cmp; - int c1 = c32tolower (g1.ch); - int c2 = c32tolower (g2.ch); - return c1 - c2; -} - -/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case. - Return <0, 0, >0 for <, =, >. Consider encoding errors to be - greater than characters and compare them byte by byte. */ -int mbcel_strcasecmp (char const *s1, char const *s2); - -_GL_INLINE_HEADER_END - -#endif /* _MBCEL_H */ diff --git a/lib/mbcel-strcasecmp.c b/lib/mcel-casecmp.c similarity index 70% rename from lib/mbcel-strcasecmp.c rename to lib/mcel-casecmp.c index 5d50491..24c7042 100644 --- a/lib/mbcel-strcasecmp.c +++ b/lib/mcel-casecmp.c @@ -19,17 +19,13 @@ #include /* Specification. */ -#include +#include #include #include -/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case. - Return <0, 0, >0 for <, =, >. Consider encoding errors to be - greater than characters and compare them byte by byte. */ - int -mbcel_strcasecmp (char const *s1, char const *s2) +mcel_casecmp (char const *s1, char const *s2) { char const *p1 = s1; char const *p2 = s2; @@ -39,14 +35,15 @@ mbcel_strcasecmp (char const *s1, char const *s2) if (MB_CUR_MAX == 1) while (true) { + static_assert (UCHAR_MAX <= INT_MAX); unsigned char c1 = *p1++; unsigned char c2 = *p2++; - int cmp = MBCEL_UCHAR_FITS ? c1 - c2 : _GL_CMP (c1, c2); + int cmp = c1 - c2; if (_GL_UNLIKELY (cmp)) { c1 = tolower (c1); c2 = tolower (c2); - cmp = MBCEL_UCHAR_FITS ? 
c1 - c2 : _GL_CMP (c1, c2); + cmp = c1 - c2; } if (_GL_UNLIKELY (cmp | !c1)) return cmp; @@ -54,10 +51,10 @@ mbcel_strcasecmp (char const *s1, char const *s2) else while (true) { - mbcel_t g1 = mbcel_scanz (p1); p1 += g1.len; - mbcel_t g2 = mbcel_scanz (p2); p2 += g2.len; - int cmp = mbcel_casecmp (g1, g2); - if (_GL_UNLIKELY (cmp | ! (g1.ch | g1.err))) + mcel_t g1 = mcel_scanz (p1); p1 += g1.len; + mcel_t g2 = mcel_scanz (p2); p2 += g2.len; + int cmp = ucore_tocmp (c32tolower, g1.c, g2.c); + if (_GL_UNLIKELY (cmp | !g1.c)) return cmp; } } diff --git a/lib/mcel.c b/lib/mcel.c new file mode 100644 index 0000000..3c2ae46 --- /dev/null +++ b/lib/mcel.c @@ -0,0 +1,3 @@ +#include +#define MCEL_INLINE _GL_EXTERN_INLINE +#include "mcel.h" diff --git a/lib/mcel.h b/lib/mcel.h new file mode 100644 index 0000000..47fa681 --- /dev/null +++ b/lib/mcel.h @@ -0,0 +1,236 @@ +/* Multi-byte characters, Error encodings, and Lengths (MCELs) + Copyright 2023 Free Software Foundation, Inc. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +/* Written by Paul Eggert. */ + +/* The mcel_scan function lets code iterate through an array of bytes, + supporting character encodings in practical use + more simply than using plain mbrtoc32. 
+ + Instead of this single-byte code: + + char *p = ..., *lim = ...; + for (; p < lim; p++) + process (*p); + + You can use this multi-byte code: + + char *p = ..., *lim = ...; + for (mcel_t g; p < lim; p += g.len) + { + g = mcel_scan (p, lim); + process (g); + } + + The mcel_scanz function is similar except it works with a + string of unknown length that is terminated with '\0'. + Instead of this single-byte code: + + char *p = ...; + for (; *p; p++) + process (*p); + + You can use this multi-byte code: + + char *p = ...; + for (mcel_t g; *p; p += g.len) + { + g = mcel_scanz (p); + process (g); + } + + mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the + string is terminated by TERMINATOR. The TERMINATORs '\0', '\r', + '\n', '.', '/' are safe, as they cannot be a part (even a trailing + byte) of a multi-byte character. + + You can select from G using G.c and G.len. + You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c), + ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c). + + mcel_casecmp compares two null-terminated multi-byte strings + lexicographically, ignoring case. + + Although ISO C and POSIX allow encodings that have shift states or + that can produce multiple characters from an indivisible byte sequence, + POSIX does not require support for these encodings, + they are not in practical use on GNUish platforms, + and omitting support for them simplifies the API. */ + +#ifndef _MCEL_H +#define _MCEL_H 1 + +/* This API is an extension of ucore.h. Programs that include this + file can assume ucore.h is included too. */ +#include + +/* The maximum multi-byte character length supported on any platform. + This can be less than MB_LEN_MAX because many platforms have a + large MB_LEN_MAX to allow for stateful encodings, and mcel does not + support these encodings. MCEL_LEN_MAX is enough for UTF-8, EUC, + Shift-JIS, GB18030, etc. In all multi-byte encodings supported by glibc, + 0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. 
*/ +enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 }; + +/* mcel_t is a type representing a character or encoding error C, + along with a count of the LEN bytes that represent C. + 1 <= LEN <= MB_LEN_MAX. */ +typedef struct +{ + ucore_t c; + unsigned char len; +} mcel_t; + +/* Every multi-byte character length fits in mcel_t's LEN. */ +static_assert (MB_LEN_MAX <= UCHAR_MAX); + +/* Bytes have 8 bits, as POSIX requires. */ +static_assert (CHAR_BIT == 8); + +/* Pacify GCC re 'c <= 0x7f' below. */ +#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__) +# pragma GCC diagnostic ignored "-Wtype-limits" +#endif + +_GL_INLINE_HEADER_BEGIN +#ifndef MCEL_INLINE +# define MCEL_INLINE _GL_INLINE +#endif + +/* With mcel there should be no need for the performance overhead of + replacing glibc mbrtoc32, as callers shouldn't care whether the + C locale treats a byte with the high bit set as an encoding error. */ +#ifdef __GLIBC__ +# undef mbrtoc32 +#endif + +/* Shifting an encoding error byte (at least 0x80) left by this value + yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1. + This suffices to sort encoding errors after characters. */ +enum { MCEL_ENCODING_ERROR_SHIFT = 14 }; +static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT); + +/* Whether C represents itself as a Unicode character + when it is the first byte of a single- or multi-byte character. + These days it is safe to assume ASCII, so do not support + obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS. */ +MCEL_INLINE bool +mcel_isbasic (char c) +{ + return 0 <= c && c <= 0x7f; +} + +/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM. + Return the character or encoding error starting at P. */ +MCEL_INLINE mcel_t +mcel_scan (char const *p, char const *lim) +{ + /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32. + In supported encodings, the first byte of a multi-byte character + cannot be an ASCII byte. 
*/ + if (_GL_LIKELY (mcel_isbasic (*p))) + return (mcel_t) { .c = *p, .len = 1 }; + + /* An initial mbstate_t; initialization optimized for some platforms. + For details about these and other platforms, see wchar.in.h. */ +#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__) + /* Although only a trivial optimization, it's worth it for GNU. */ + mbstate_t mbs; mbs.__count = 0; +#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \ + || (defined __APPLE__ && defined __MACH__)) + /* These platforms have 128-byte mbstate_t. What were they thinking? + Initialize just for supported encodings (UTF-8, EUC, etc.). + Avoid memset because some compilers generate function call code. */ + struct mbhidden { char32_t ch; int utf8_want, euc_want; } + _GL_ATTRIBUTE_MAY_ALIAS; + union { mbstate_t m; struct mbhidden s; } u; + u.s.ch = u.s.utf8_want = u.s.euc_want = 0; +# define mbs u.m +#elif defined __NetBSD__ + /* Experiments on both 32- and 64-bit NetBSD platforms have + shown that it doesn't work to clear fewer than 24 bytes. */ + struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS; + union { mbstate_t m; struct mbhidden s; } u; + u.s.a = u.s.b = u.s.c = 0; +# define mbs u.m +#else + /* mbstate_t has unknown structure or is not worth optimizing. */ + mbstate_t mbs = {0}; +#endif + + char32_t c; + size_t len = mbrtoc32 (&c, p, lim - p, &mbs); + + /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3 + is not supported and MB_LEN_MAX is small. */ + if (_GL_LIKELY (len <= (size_t) -1 / 2)) + { + /* A multi-byte character. LEN must be positive, + as *P != '\0' and shift sequences are not supported. */ + assume (0 < len); + assume (len <= MB_LEN_MAX); + assume (c <= UCORE_CHAR_MAX); + return (mcel_t) { .c = c, .len = len }; + } + else + { + /* An encoding error. 
*/ + unsigned char b = *p; + c = b << MCEL_ENCODING_ERROR_SHIFT; + assume (UCORE_ERR_MIN <= c); + assume (c <= UCORE_ERR_MAX); + return (mcel_t) { .c = c, .len = 1 }; + } +} + +/* Scan bytes from P, a byte sequence terminated by TERMINATOR. + If *P == TERMINATOR, scan just that byte; otherwise scan + bytes up to but not including TERMINATOR. + TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'. + Return the character or encoding error starting at P. */ +MCEL_INLINE mcel_t +mcel_scant (char const *p, char terminator) +{ + /* Handle ASCII quickly for speed. */ + if (_GL_LIKELY (mcel_isbasic (*p))) + return (mcel_t) { .c = *p, .len = 1 }; + + /* Defer to mcel_scan for non-ASCII. Compute length with code that + is typically branch-free and faster than memchr or strnlen. */ + char const *lim = p + 1; + for (int i = 0; i < MCEL_LEN_MAX - 1; i++) + lim += *lim != terminator; + return mcel_scan (p, lim); +} + +/* Scan bytes from P, a byte sequence terminated by '\0'. + If *P == '\0', scan just that byte; otherwise scan + bytes up to but not including '\0'. + Return the character or encoding error starting at P. */ +MCEL_INLINE mcel_t +mcel_scanz (char const *p) +{ + return mcel_scant (p, '\0'); +} + +/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case. + Return <0, 0, >0 for <, =, >. Consider encoding errors to be + greater than characters and compare them byte by byte. */ +int mcel_casecmp (char const *s1, char const *s2); + +_GL_INLINE_HEADER_END + +#endif /* _MCEL_H */ diff --git a/lib/ucore.c b/lib/ucore.c new file mode 100644 index 0000000..5831b1b --- /dev/null +++ b/lib/ucore.c @@ -0,0 +1,3 @@ +#include +#define UCORE_INLINE _GL_EXTERN_INLINE +#include "ucore.h" diff --git a/lib/ucore.h b/lib/ucore.h new file mode 100644 index 0000000..7e13db7 --- /dev/null +++ b/lib/ucore.h @@ -0,0 +1,132 @@ +/* Unicode Characters OR Encoding errors (UCOREs) + Copyright 2023 Free Software Foundation, Inc. 
+ + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation; either version 3 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +/* Written by Paul Eggert. */ + +/* This API's fundamental type ucore_t represents + a single Unicode character or an encoding error. + ucore_iserr (C) tests whether C is an encoding error. + ucore_is (P, C) etc. test whether char class P accepts C. + ucore_to (TO, C) etc. use TO to convert C. + ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2, + with encoding errors sorting after characters. */ + +#ifndef _UCORE_H +#define _UCORE_H 1 + +#if !_GL_CONFIG_H_INCLUDED + #error "Please include config.h first." +#endif + +#include + +#include +#include +#include + +/* ucore_t represents a Unicode Character OR Encoding error. + If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character. + If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error. + Other ucore_t values C are invalid. */ +typedef int ucore_t; + +enum { + UCORE_CHAR_MAX = 0x10FFFF, + UCORE_ERR_MIN = 0x200000, + UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1 +}; + +/* Information is not lost by encoding errors as integers. */ +static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN); + +/* On glibc platforms, predicates like c32isalnum and c32tolower + do the right thing for char32_t values that are not valid characters. + POSIX says the behavior is undefined, so play it safe elsewhere. + Do not rely on UCORE_C32_SAFE for c32width. 
*/ +#ifdef __GLIBC__ +enum { UCORE_C32_SAFE = true }; +#else +enum { UCORE_C32_SAFE = false }; +#endif + +#ifndef _GL_LIKELY +/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */ +# define _GL_LIKELY(cond) __builtin_expect ((cond), 1) +# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0) +#endif + +_GL_INLINE_HEADER_BEGIN +#ifndef UCORE_INLINE +# define UCORE_INLINE _GL_INLINE +#endif + +/* Return true if C represents an encoding error, false otherwise. */ +UCORE_INLINE bool +ucore_iserr (ucore_t c) +{ + /* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c) + with GCC 13 x86-64. */ + if (_GL_UNLIKELY (c & UCORE_ERR_MIN)) + { + assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX); + return true; + } + else + { + assume (0 <= c && c <= UCORE_CHAR_MAX); + return false; + } +} + +/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C). */ +UCORE_INLINE bool +ucore_is (int (*p) (wint_t), wint_t c) +{ + /* When C is out of range, predicates based on glibc return false. + Behavior is undefined on other platforms, so play it safe. */ + return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c); +} + +/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C). */ +UCORE_INLINE wint_t +ucore_to (wint_t (*to) (wint_t), ucore_t c) +{ + return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c; +} + +/* Compare C1 and C2, with encoding errors sorting after characters. + Return <0, 0, >0 for <, =, >. */ +UCORE_INLINE int +ucore_cmp (ucore_t c1, ucore_t c2) +{ + return c1 - c2; +} + +/* Apply the uchar translator TO to C1 and C2 and compare the results, + with encoding errors sorting after characters. + Return <0, 0, >0 for <, =, >. 
*/ +UCORE_INLINE int +ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2) +{ + if (c1 == c2) + return 0; + int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2); + return i1 - i2; +} + +_GL_INLINE_HEADER_END + +#endif /* _UCORE_H */ diff --git a/src/dir.c b/src/dir.c index f4eec99..897bddd 100644 --- a/src/dir.c +++ b/src/dir.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include #include @@ -189,7 +189,7 @@ compare_collated (char const *name1, char const *name2) { int r; if (ignore_file_name_case) - r = mbcel_strcasecmp (name1, name2); /* Best we can do. */ + r = mcel_casecmp (name1, name2); /* Best we can do. */ else { errno = 0; diff --git a/src/io.c b/src/io.c index c4139c0..8442adb 100644 --- a/src/io.c +++ b/src/io.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include @@ -230,14 +230,6 @@ slurp (struct file_data *current) } } -/* Return true if CH1 and ERR1 stand for the same character or - encoding error as CH2 and ERR2. */ -static bool -same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2) -{ - return ! ((ch1 ^ ch2) | (err1 ^ err2)); -} - /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically one line from each input file) according to the command line options. Line lengths include the trailing newline. @@ -435,35 +427,35 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) { char const *lim1 = s1 + s1len; char const *lim2 = s2 + s2len; - char32_t ch1prev = 0; + ucore_t c1prev = 0; while (true) { - mbcel_t g1 = mbcel_scan (t1, lim1); - mbcel_t g2 = mbcel_scan (t2, lim2); + mcel_t g1 = mcel_scan (t1, lim1); + mcel_t g2 = mcel_scan (t2, lim2); t1 += g1.len; t2 += g2.len; - char32_t ch1 = g1.ch; - char32_t ch2 = g2.ch; + ucore_t c1 = g1.c; + ucore_t c2 = g2.c; /* Test for exact equality first, since it's a common case. */ - if (! 
same_ch_err (ch1, g1.err, ch2, g2.err)) + if (ucore_cmp (c1, c2) != 0) { switch (ignore_white_space) { case IGNORE_ALL_SPACE: /* For -w, just skip past any white space. */ - while (ch1 != '\n' && c32isspace (ch1)) + while (c1 != '\n' && ucore_is (c32isspace, c1)) { - g1 = mbcel_scan (t1, lim1); + g1 = mcel_scan (t1, lim1); t1 += g1.len; - ch1 = g1.ch; + c1 = g1.c; } - while (ch2 != '\n' && c32isspace (ch2)) + while (c2 != '\n' && ucore_is (c32isspace, c2)) { - g2 = mbcel_scan (t2, lim2); + g2 = mcel_scan (t2, lim2); t2 += g2.len; - ch2 = g2.ch; + c2 = g2.c; } break; @@ -471,46 +463,48 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) /* For -b, advance past any sequence of white space in line 1 and consider it just one space, or nothing at all if it is at the end of the line. */ - if (c32isspace (ch1)) - while (ch1 != '\n') + if (ucore_is (c32isspace, c1)) + while (c1 != '\n') { - g1 = mbcel_scan (t1, lim1); + g1 = mcel_scan (t1, lim1); t1 += g1.len; - ch1 = g1.ch; - if (! c32isspace (ch1)) + c1 = g1.c; + if (! ucore_is (c32isspace, c1)) { t1 -= g1.len; - ch1 = ' '; + c1 = ' '; break; } } /* Likewise for line 2. */ - if (c32isspace (ch2)) - while (ch2 != '\n') + if (ucore_is (c32isspace, c2)) + while (c2 != '\n') { - g2 = mbcel_scan (t2, lim2); + g2 = mcel_scan (t2, lim2); t2 += g2.len; - ch2 = g2.ch; - if (! c32isspace (ch2)) + c2 = g2.c; + if (! ucore_is (c32isspace, c2)) { t2 -= g2.len; - ch2 = ' '; + c2 = ' '; break; } } - if (ch1 != ch2) + if (c1 != c2) { /* If we went too far when doing the simple test for equality, go back to the first non-white-space character in both sides and try again. 
*/ - if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev)) + if (c2 == ' ' && c1 != '\n' + && ucore_is (c32isspace, c1prev)) { t1 -= g1.len; continue; } - if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev)) + if (c1 == ' ' && c2 != '\n' + && ucore_is (c32isspace, c1prev)) { t2 -= g2.len; continue; @@ -521,30 +515,32 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) case IGNORE_TRAILING_SPACE: case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: - if (c32isspace (ch1) && c32isspace (ch2)) + if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2)) { - if (ch1 != '\n') + if (c1 != '\n') { - mbcel_t g; - for (char const *p = t1; ; p += g.len) + char const *p = t1; + while (*p != '\n') { - g = mbcel_scan (p, lim1); - if (g.ch == '\n' || ! c32isspace (g.ch)) + mcel_t g = mcel_scan (p, lim1); + if (! ucore_is (c32isspace, g.c)) break; + p += g.len; } - if (g.ch != '\n') + if (*p != '\n') break; } - if (ch2 != '\n') + if (c2 != '\n') { - mbcel_t g; - for (char const *p = t2; ; p += g.len) + char const *p = t2; + while (*p != '\n') { - g = mbcel_scan (p, lim2); - if (g.ch == '\n' || ! c32isspace (g.ch)) + mcel_t g = mcel_scan (p, lim2); + if (! ucore_is (c32isspace, g.c)) break; + p += g.len; } - if (g.ch != '\n') + if (*p != '\n') break; } /* Both lines have nothing but whitespace left. 
*/ @@ -554,45 +550,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) break; FALLTHROUGH; case IGNORE_TAB_EXPANSION: - if ((ch1 == ' ' && ch2 == '\t') - || (ch1 == '\t' && ch2 == ' ')) + if ((c1 == ' ' && c2 == '\t') + || (c1 == '\t' && c2 == ' ')) { intmax_t tab2 = tab, column2 = column; while (true) { - if (ch1 == '\t' - || (ch1 == ' ' && column == tabsize - 1)) + if (c1 == '\t' + || (c1 == ' ' && column == tabsize - 1)) { tab++; column = 0; } - else if (ch1 == ' ') + else if (c1 == ' ') column++; else break; - g1 = mbcel_scan (t1, lim1); + g1 = mcel_scan (t1, lim1); t1 += g1.len; - ch1 = g1.ch; + c1 = g1.c; } while (true) { - if (ch2 == '\t' - || (ch2 == ' ' && column2 == tabsize - 1)) + if (c2 == '\t' + || (c2 == ' ' && column2 == tabsize - 1)) { tab2++; column2 = 0; } - else if (ch2 == ' ') + else if (c2 == ' ') column2++; else break; - g2 = mbcel_scan (t2, lim2); + g2 = mcel_scan (t2, lim2); t2 += g2.len; - ch2 = g2.ch; + c2 = g2.c; } if (tab != tab2 || column != column2) @@ -606,15 +602,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) if (ignore_case) { - ch1 = c32tolower (ch1); - ch2 = c32tolower (ch2); + c1 = ucore_to (c32tolower, c1); + c2 = ucore_to (c32tolower, c2); } - if (! same_ch_err (ch1, g1.err, ch2, g2.err)) + if (ucore_cmp (c1, c2) != 0) break; } - switch (ch1) + switch (c1) { case '\n': return false; @@ -638,7 +634,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) default: /* Assume that downcasing does not change print width. */ - column += g1.err ? 1 : c32width (ch1); + column += ucore_iserr (c1) ? 1 : c32width (c1); if (column < tabsize) break; FALLTHROUGH; @@ -648,7 +644,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) break; } - ch1prev = ch1; + c1prev = c1; } } @@ -699,11 +695,11 @@ find_and_hash_each_line (struct file_data *current) h = hash (h, ig_case ? 
tolower (c) : c); } else - for (mbcel_t g; *p != '\n'; p += g.len) + for (mcel_t g; *p != '\n'; p += g.len) { - g = mbcel_scan (p, suffix_begin); - if (! c32isspace (g.ch)) - h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err); + g = mcel_scan (p, suffix_begin); + if (! ucore_is (c32isspace, g.c)) + h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c)); } break; @@ -728,25 +724,25 @@ find_and_hash_each_line (struct file_data *current) h = hash (h, ig_case ? tolower (c) : c); } else - for (mbcel_t g; *p != '\n'; p += g.len) + for (mcel_t g; *p != '\n'; p += g.len) { - g = mbcel_scan (p, suffix_begin); - if (c32isspace (g.ch)) + g = mcel_scan (p, suffix_begin); + if (ucore_is (c32isspace, g.c)) { do { p += g.len; if (*p == '\n') goto hashing_done; - g = mbcel_scan (p, suffix_begin); + g = mcel_scan (p, suffix_begin); } - while (c32isspace (g.ch)); + while (ucore_is (c32isspace, g.c)); h = hash (h, ' '); } /* G is now the first non-space. */ - h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err); + h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c); } break; @@ -817,39 +813,35 @@ find_and_hash_each_line (struct file_data *current) while (--repetitions != 0); } else - for (mbcel_t g; *p != '\n'; p += g.len) + for (mcel_t g; *p != '\n'; p += g.len) { intmax_t repetitions = 1; - g = mbcel_scan (p, suffix_begin); - char32_t ch; - if (g.err) - { - ch = -g.err; - column++; - } + g = mcel_scan (p, suffix_begin); + ucore_t c = g.c; + if (ucore_iserr (c)) + column++; else { - ch = g.ch; if (ig_white_space & IGNORE_TRAILING_SPACE - && c32isspace (ch)) + && ucore_is (c32isspace, c)) { char const *p1 = p + g.len; - for (mbcel_t g1; ; p1 += g1.len) + for (mcel_t g1; ; p1 += g1.len) { if (*p1 == '\n') { p = p1; goto hashing_done; } - g1 = mbcel_scan (p1, suffix_begin); - if (! c32isspace (g1.ch)) + g1 = mcel_scan (p1, suffix_begin); + if (! 
ucore_is (c32isspace, g1.c)) break; } } if (ig_white_space & IGNORE_TAB_EXPANSION) - switch (ch) + switch (c) { case '\b': if (0 < column) @@ -862,7 +854,7 @@ find_and_hash_each_line (struct file_data *current) break; case '\t': - ch = ' '; + c = ' '; repetitions = tabsize - column % tabsize; tab += column / tabsize + 1; column = 0; @@ -876,16 +868,16 @@ find_and_hash_each_line (struct file_data *current) break; default: - column += c32width (ch); + column += c32width (c); break; } if (ig_case) - ch = c32tolower (ch); + c = c32tolower (c); } do - h = hash (h, ch); + h = hash (h, c); while (--repetitions != 0); } } @@ -904,16 +896,16 @@ find_and_hash_each_line (struct file_data *current) else { if (ig_case) - for (mbcel_t g; *p != '\n'; p += g.len) + for (mcel_t g; *p != '\n'; p += g.len) { - g = mbcel_scan (p, suffix_begin); - h = hash (h, c32tolower (g.ch) - g.err); + g = mcel_scan (p, suffix_begin); + h = hash (h, ucore_to (c32tolower, g.c)); } else - for (mbcel_t g; *p != '\n'; p += g.len) + for (mcel_t g; *p != '\n'; p += g.len) { - g = mbcel_scan (p, suffix_begin); - h = hash (h, g.ch - g.err); + g = mcel_scan (p, suffix_begin); + h = hash (h, g.c); } } break; diff --git a/src/side.c b/src/side.c index a4b471e..0d1b197 100644 --- a/src/side.c +++ b/src/side.c @@ -22,7 +22,7 @@ #include "diff.h" -#include +#include static void print_sdiff_common_lines (lin, lin); static void print_sdiff_hunk (struct change *); @@ -145,8 +145,8 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound) /* A byte that might start a multibyte character. Increase TEXT_POINTER, counting columns. Assume encoding errors have print width 1. */ - mbcel_t g = mbcel_scan (tp0, text_limit); - int width = g.err ? 1 : c32width (g.ch); + mcel_t g = mcel_scan (tp0, text_limit); + int width = ucore_iserr (g.c) ? 1 : c32width (g.c); if (0 < width && ckd_add (&in_position, in_position, width)) return out_position;