From ae1cdc7239dd3424d03e5d363322c3d1e1314211 Mon Sep 17 00:00:00 2001 From: Paul Eggert Date: Mon, 21 Aug 2023 08:38:16 -0700 Subject: [PATCH] diff: modularize and tune mcel code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Go back to a single mcel module, instead of trying to break it up into ucore and mcel pieces, as breaking it up hurt performance. Use gnulib-tool’s --local-dir to create diffutils-specific modules for mcel; the idea is that this will eventually migrate into Gnulib. * bootstrap.conf (avoided_gnulib_modules): Add mbuiterf. (gnulib_modules): Add mbscasecmp, mcel-prefer. (gnulib_tool_option_extras): Add --local-dir=gl to pick up new files. * cfg.mk (exclude_file_name_regexp--sc_prohibit_doubled_word): Do not exclude now-removed files lib/ucore.c, lib/ucore.h. * lib/Makefile.am: Adjust to use of modules. (noinst_HEADERS): Remove mcel.h, ucore.h. (libdiffutils_a_SOURCES): Remove mcel.c, mcel-casecmp.c, ucore.c. * lib/mcel-casecmp.c, lib/ucore.c, lib/ucore.h: Remove. * lib/mcel.h: Switch to LGPLv2.1+. Do not include ucore.h. All uses of ucore_t changed back to using char32_t. Do what ucore.h used to do: include verify.h, limits.h, stddef.h, uchar.h; require config.h, define _GL_LIKELY, _GL_UNLIKELY. (MCEL_CHAR_MAX, MCEL_ERR_MIN, MCEL_ERR_MAX): New constants. (mcel_t): Switch from single ucore_t c to a char32_t ch and unsigned char err. This has significantly better performance on Fedora 38 x86-64. All uses changed. Check that unsigned char promotes to int. (mcel_ch, mcel_err, mcel_cmp, mcel_tocmp): New functions. (MCEL_ERR_SHIFT): Rename from MCEL_ENCODING_ERROR_SHIFT. All uses changed. (mcel_isbasic): Add a _GL_LIKELY to help compilers. All uses changed. (mcel_scan, mcel_scant): Simplify by using mcel_ch, mcel_err. (mcel_casecmp): Remove decl. Callers changed to use mbscasecmp. * gl/lib/mcel.c, gl/lib/mcel.h: Rename from lib/mcel.c, lib/mcel.h. * gl/lib/mbscasecmp.c: New file.
* gl/modules/mcel, gl/modules/mcel-prefer, gl/modules/mcel-tests: * gl/tests/test-mcel.c: New files. * src/io.c: Revert use of ucore API. Use plain c32isspace etc. instead of ucore_is. Use .err instead of ucore_iserr. (same_ch_err): Bring back, and use it instead of ucore_cmp. * src/side.c (print_half_line): Use .err instead of ucore_iserr. --- bootstrap.conf | 5 +- cfg.mk | 3 +- gl/lib/mbscasecmp.c | 112 ++++++++++++++++++++++++++ {lib => gl/lib}/mcel.c | 0 {lib => gl/lib}/mcel.h | 178 +++++++++++++++++++++++++++-------------- gl/modules/mcel | 34 ++++++++ gl/modules/mcel-prefer | 24 ++++++ gl/modules/mcel-tests | 12 +++ gl/tests/test-mcel.c | 138 ++++++++++++++++++++++++++++++++ lib/Makefile.am | 4 +- lib/mcel-casecmp.c | 60 -------------- lib/ucore.c | 3 - lib/ucore.h | 132 ------------------------------ src/io.c | 132 ++++++++++++++++-------------- src/side.c | 2 +- 15 files changed, 517 insertions(+), 322 deletions(-) create mode 100644 gl/lib/mbscasecmp.c rename {lib => gl/lib}/mcel.c (100%) rename {lib => gl/lib}/mcel.h (61%) create mode 100644 gl/modules/mcel create mode 100644 gl/modules/mcel-prefer create mode 100644 gl/modules/mcel-tests create mode 100644 gl/tests/test-mcel.c delete mode 100644 lib/mcel-casecmp.c delete mode 100644 lib/ucore.c delete mode 100644 lib/ucore.h diff --git a/bootstrap.conf b/bootstrap.conf index ec800c7..6664a73 100644 --- a/bootstrap.conf +++ b/bootstrap.conf @@ -18,6 +18,7 @@ avoided_gnulib_modules=' --avoid=localename --avoid=lock-tests + --avoid=mbuiterf --avoid=setlocale ' @@ -73,7 +74,8 @@ largefile lstat maintainer-makefile manywarnings -mbrtoc32 +mbscasecmp +mcel-prefer mempcpy minmax mkstemp @@ -140,6 +142,7 @@ XGETTEXT_OPTIONS=$XGETTEXT_OPTIONS'\\\ ' gnulib_tool_option_extras="--tests-base=gnulib-tests + --local-dir=gl --with-tests --symlink --makefile-name=gnulib.mk diff --git a/cfg.mk b/cfg.mk index 4c88c74..aced89e 100644 --- a/cfg.mk +++ b/cfg.mk @@ -74,8 +74,7 @@ config-save: cp lib/config.h config.status 
$(_cf_state_dir)/latest exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$ -exclude_file_name_regexp--sc_prohibit_doubled_word = \ - ^(tests/y2038-vs-32bit|lib/ucore\.h)$$ +exclude_file_name_regexp--sc_prohibit_doubled_word = ^tests/y2038-vs-32bit$$ # Tell gnulib's tight_scope rule that we mark externs with XTERN export _gl_TS_extern = extern|XTERN|DIFF_INLINE|SYSTEM_INLINE|SYSTEM_EXTERN diff --git a/gl/lib/mbscasecmp.c b/gl/lib/mbscasecmp.c new file mode 100644 index 0000000..8d7a7fc --- /dev/null +++ b/gl/lib/mbscasecmp.c @@ -0,0 +1,112 @@ +/* Case-insensitive string comparison function. + Copyright (C) 1998-1999, 2005-2023 Free Software Foundation, Inc. + Written by Bruno Haible , 2005, + based on earlier glibc code. + + This file is free software: you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + This file is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with this program. If not, see . */ + +#include + +/* Specification. */ +#include + +#include +#include +#include + +#if GNULIB_MCEL_PREFER +# include "mcel.h" +#else +# include "mbuiterf.h" +#endif + +/* Compare the character strings S1 and S2, ignoring case, returning less than, + equal to or greater than zero if S1 is lexicographically less than, equal to + or greater than S2. + Note: This function may, in multibyte locales, return 0 for strings of + different lengths! 
*/ +int +mbscasecmp (const char *s1, const char *s2) +{ + if (s1 == s2) + return 0; + + char const *p1 = s1; + char const *p2 = s2; + + /* Be careful not to look at the entire extent of s1 or s2 until needed. + This is useful because when two strings differ, the difference is + most often already in the very few first characters. */ + if (MB_CUR_MAX > 1) + { +#if GNULIB_MCEL_PREFER + while (true) + { + mcel_t g1 = mcel_scanz (p1); p1 += g1.len; + mcel_t g2 = mcel_scanz (p2); p2 += g2.len; + int cmp = mcel_tocmp (c32tolower, g1, g2); + if (cmp | !(g1.ch | g1.err)) + return cmp; + } +#else + mbuif_state_t state1; + const char *iter1; + mbuif_init (state1); + iter1 = s1; + + mbuif_state_t state2; + const char *iter2; + mbuif_init (state2); + iter2 = s2; + + while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2)) + { + mbchar_t cur1 = mbuif_next (state1, iter1); + mbchar_t cur2 = mbuif_next (state2, iter2); + int cmp = mb_casecmp (cur1, cur2); + + if (cmp != 0) + return cmp; + + iter1 += mb_len (cur1); + iter2 += mb_len (cur2); + } + if (mbuif_avail (state1, iter1)) + /* s2 terminated before s1. */ + return 1; + if (mbuif_avail (state2, iter2)) + /* s1 terminated before s2. */ + return -1; + return 0; +#endif + } + else + while (true) + { + unsigned char c1 = *p1++; + unsigned char c2 = *p2++; + /* On machines where 'char' and 'int' are types of the same size, the + difference of two 'unsigned char' values - including the sign bit - + doesn't fit in an 'int'. */ + int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2); + if (cmp) + { + c1 = tolower (c1); + c2 = tolower (c2); + cmp = UCHAR_MAX <= INT_MAX ?
c1 - c2 : _GL_CMP (c1, c2); + } + if (cmp | !c1) + return cmp; + } +} diff --git a/lib/mcel.c b/gl/lib/mcel.c similarity index 100% rename from lib/mcel.c rename to gl/lib/mcel.c diff --git a/lib/mcel.h b/gl/lib/mcel.h similarity index 61% rename from lib/mcel.h rename to gl/lib/mcel.h index 47fa681..867d925 100644 --- a/lib/mcel.h +++ b/gl/lib/mcel.h @@ -3,7 +3,7 @@ This file is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 3 of the + published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This file is distributed in the hope that it will be useful, @@ -16,7 +16,14 @@ /* Written by Paul Eggert. */ -/* The mcel_scan function lets code iterate through an array of bytes, +/* The macros in this file implement multi-byte character representation + and forward iteration through a multi-byte string. + They are simpler and typically faster than the mbiter family. + However, they do not support obsolescent encodings like CP864, + EBCDIC, Johab, and Shift JIS that glibc also does not support, + and it is up to the caller to coalesce encoding-error bytes if desired. + + The mcel_scan function lets code iterate through an array of bytes, supporting character encodings in practical use more simply than using plain mbrtoc32. @@ -35,8 +42,11 @@ process (g); } + You can select from G using G.ch, G.err, and G.len. + G is an encoding error if G.err is nonzero, a character otherwise. + The mcel_scanz function is similar except it works with a - string of unknown length that is terminated with '\0'. + string of unknown but positive length that is terminated with '\0'. Instead of this single-byte code: char *p = ...; @@ -57,12 +67,16 @@ '\n', '.', '/' are safe, as they cannot be a part (even a trailing byte) of a multi-byte character. - You can select from G using G.c and G.len. 
- You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c), - ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c). + mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values. - mcel_strcasecmp compares two null-terminated multi-byte strings - lexicographically, ignoring case. + mcel_cmp (G1, G2) compares two mcel_t values lexicographically by + character or by encoding byte value, with encoding bytes sorting + after characters. + + Calls like c32isalpha (G.ch) test G; they return false for encoding + errors since calls like c32isalpha (0) return false. Calls like + mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2), + but transliterate first. Although ISO C and POSIX allow encodings that have shift states or that can produce multiple characters from an indivisible byte sequence, @@ -73,9 +87,20 @@ #ifndef _MCEL_H #define _MCEL_H 1 -/* This API is an extension of ucore.h. Programs that include this - file can assume ucore.h is included too. */ -#include +#if !_GL_CONFIG_H_INCLUDED + #error "Please include config.h first." +#endif + +#include + +#include +#include +#include + +/* Pacify GCC re type limits. */ +#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__) +# pragma GCC diagnostic ignored "-Wtype-limits" +#endif /* The maximum multi-byte character length supported on any platform. This can be less than MB_LEN_MAX because many platforms have a @@ -85,24 +110,41 @@ 0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. */ enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 }; -/* mcel_t is a type representing a character or encoding error C, - along with a count of the LEN bytes that represent C. - 1 <= LEN <= MB_LEN_MAX. */ +/* Bounds for mcel_t members. */ +enum { MCEL_CHAR_MAX = 0x10FFFF }; +enum { MCEL_ERR_MIN = 0x80 }; +enum { MCEL_ERR_MAX = UCHAR_MAX }; + +/* mcel_t is a type representing a character CH or an encoding error byte ERR, + along with a count of the LEN bytes that represent CH or ERR. 
+ If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX; + otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR <= MCEL_ERR_MAX, + CH == 0, and LEN == 1. */ typedef struct { - ucore_t c; + char32_t ch; + unsigned char err; unsigned char len; } mcel_t; /* Every multi-byte character length fits in mcel_t's LEN. */ static_assert (MB_LEN_MAX <= UCHAR_MAX); +/* Shifting an encoding error byte left by this value + suffices to sort encoding errors after characters. */ +enum { MCEL_ERR_SHIFT = 14 }; +static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT); + +/* Unsigned char promotes to int. */ +static_assert (UCHAR_MAX <= INT_MAX); + /* Bytes have 8 bits, as POSIX requires. */ static_assert (CHAR_BIT == 8); -/* Pacify GCC re 'c <= 0x7f' below. */ -#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__) -# pragma GCC diagnostic ignored "-Wtype-limits" +#ifndef _GL_LIKELY +/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */ +# define _GL_LIKELY(cond) __builtin_expect ((cond), 1) +# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0) #endif _GL_INLINE_HEADER_BEGIN @@ -110,18 +152,44 @@ _GL_INLINE_HEADER_BEGIN # define MCEL_INLINE _GL_INLINE #endif -/* With mcel there should be no need for the performance overhead of - replacing glibc mbrtoc32, as callers shouldn't care whether the - C locale treats a byte with the high bit set as an encoding error. */ -#ifdef __GLIBC__ -# undef mbrtoc32 -#endif +/* mcel_t constructors. */ +MCEL_INLINE mcel_t +mcel_ch (char32_t ch, size_t len) +{ + assume (0 < len); + assume (len <= MCEL_LEN_MAX); + assume (ch <= MCEL_CHAR_MAX); + return (mcel_t) {.ch = ch, .len = len}; +} +MCEL_INLINE mcel_t +mcel_err (unsigned char err) +{ + assume (MCEL_ERR_MIN <= err); + assume (err <= MCEL_ERR_MAX); + return (mcel_t) {.err = err, .len = 1}; +} -/* Shifting an encoding error byte (at least 0x80) left by this value - yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
- This suffices to sort encoding errors after characters. */ -enum { MCEL_ENCODING_ERROR_SHIFT = 14 }; -static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT); +/* Compare C1 and C2, with encoding errors sorting after characters. + Return <0, 0, >0 for <, =, >. */ +MCEL_INLINE int +mcel_cmp (mcel_t c1, mcel_t c2) +{ + int ch1 = c1.ch, ch2 = c2.ch; + return ((c1.err - c2.err) * (1 << MCEL_ERR_SHIFT)) + (ch1 - ch2); +} + +/* Apply the uchar translator TO to C1 and C2 and compare the results, + with encoding errors sorting after characters, + Return <0, 0, >0 for <, =, >. */ +MCEL_INLINE int +mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2) +{ + int cmp = mcel_cmp (c1, c2); + if (_GL_LIKELY ((c1.err - c2.err) | !cmp)) + return cmp; + int ch1 = to (c1.ch), ch2 = to (c2.ch); + return ch1 - ch2; +} /* Whether C represents itself as a Unicode character when it is the first byte of a single- or multi-byte character. @@ -130,9 +198,16 @@ static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT); MCEL_INLINE bool mcel_isbasic (char c) { - return 0 <= c && c <= 0x7f; + return _GL_LIKELY (0 <= c && c <= 0x7f); } +/* With mcel there should be no need for the performance overhead of + replacing glibc mbrtoc32, as callers shouldn't care whether the + C locale treats a byte with the high bit set as an encoding error. */ +#ifdef __GLIBC__ +# undef mbrtoc32 +#endif + /* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM. Return the character or encoding error starting at P. */ MCEL_INLINE mcel_t @@ -141,8 +216,8 @@ mcel_scan (char const *p, char const *lim) /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32. In supported encodings, the first byte of a multi-byte character cannot be an ASCII byte. */ - if (_GL_LIKELY (mcel_isbasic (*p))) - return (mcel_t) { .c = *p, .len = 1 }; + if (mcel_isbasic (*p)) + return mcel_ch (*p, 1); /* An initial mbstate_t; initialization optimized for some platforms. 
For details about these and other platforms, see wchar.in.h. */ @@ -171,29 +246,17 @@ mcel_scan (char const *p, char const *lim) mbstate_t mbs = {0}; #endif - char32_t c; - size_t len = mbrtoc32 (&c, p, lim - p, &mbs); + char32_t ch; + size_t len = mbrtoc32 (&ch, p, lim - p, &mbs); /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3 is not supported and MB_LEN_MAX is small. */ - if (_GL_LIKELY (len <= (size_t) -1 / 2)) - { - /* A multi-byte character. LEN must be positive, - as *P != '\0' and shift sequences are not supported. */ - assume (0 < len); - assume (len <= MB_LEN_MAX); - assume (c <= UCORE_CHAR_MAX); - return (mcel_t) { .c = c, .len = len }; - } - else - { - /* An encoding error. */ - unsigned char b = *p; - c = b << MCEL_ENCODING_ERROR_SHIFT; - assume (UCORE_ERR_MIN <= c); - assume (c <= UCORE_ERR_MAX); - return (mcel_t) { .c = c, .len = 1 }; - } + if (_GL_UNLIKELY ((size_t) -1 / 2 < len)) + return mcel_err (*p); + + /* A multi-byte character. LEN must be positive, + as *P != '\0' and shift sequences are not supported. */ + return mcel_ch (ch, len); } /* Scan bytes from P, a byte sequence terminated by TERMINATOR. @@ -205,11 +268,11 @@ MCEL_INLINE mcel_t mcel_scant (char const *p, char terminator) { /* Handle ASCII quickly for speed. */ - if (_GL_LIKELY (mcel_isbasic (*p))) - return (mcel_t) { .c = *p, .len = 1 }; + if (mcel_isbasic (*p)) + return mcel_ch (*p, 1); /* Defer to mcel_scan for non-ASCII. Compute length with code that - is typically branch-free and faster than memchr or strnlen. */ + is typically faster than strnlen. */ char const *lim = p + 1; for (int i = 0; i < MCEL_LEN_MAX - 1; i++) lim += *lim != terminator; @@ -226,11 +289,6 @@ mcel_scanz (char const *p) return mcel_scant (p, '\0'); } -/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case. - Return <0, 0, >0 for <, =, >. Consider encoding errors to be - greater than characters and compare them byte by byte. 
*/ -int mcel_casecmp (char const *s1, char const *s2); - _GL_INLINE_HEADER_END #endif /* _MCEL_H */ diff --git a/gl/modules/mcel b/gl/modules/mcel new file mode 100644 index 0000000..59ca633 --- /dev/null +++ b/gl/modules/mcel @@ -0,0 +1,34 @@ +Description: +Multibyte Characters, Encoding errors, and Lengths + +Files: +lib/mcel.c +lib/mcel.h + +Depends-on: +assert-h +extern-inline +limits-h +mbrtoc32 +stdbool +uchar +verify + +configure.ac: + +Makefile.am: +lib_SOURCES += mcel.c mcel.h + +Include: +"mcel.h" + +Link: +$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise +$(MBRTOWC_LIB) +$(LTLIBC32CONV) when linking with libtool, $(LIBC32CONV) otherwise + +License: +LGPLv2+ + +Maintainer: +all diff --git a/gl/modules/mcel-prefer b/gl/modules/mcel-prefer new file mode 100644 index 0000000..49ecfd8 --- /dev/null +++ b/gl/modules/mcel-prefer @@ -0,0 +1,24 @@ +Description: +mcel is preferred to the mbiter family when either will do. +mcel is simpler and faster. However, it does not support some +obsolete encodings that are also not supported by glibc locales, +and the caller is responsible for coalescing sequences of +encoding-error bytes if that is desired. + +Files: + +Depends-on: +mcel + +configure.ac: +gl_MODULE_INDICATOR([mcel-prefer]) + +Makefile.am: + +Include: + +License: +LGPLv2+ + +Maintainer: +Paul Eggert diff --git a/gl/modules/mcel-tests b/gl/modules/mcel-tests new file mode 100644 index 0000000..4b9ba0e --- /dev/null +++ b/gl/modules/mcel-tests @@ -0,0 +1,12 @@ +Files: +tests/test-mcel.c + +Depends-on: +assert-h +setlocale + +configure.ac: + +Makefile.am: +TESTS += test-mcel +check_PROGRAMS += test-mcel diff --git a/gl/tests/test-mcel.c b/gl/tests/test-mcel.c new file mode 100644 index 0000000..b5323f2 --- /dev/null +++ b/gl/tests/test-mcel.c @@ -0,0 +1,138 @@ +/* Test + Copyright 2023 Free Software Foundation, Inc.
+ + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +#include + +#include + +#include + +#include "macros.h" + +static wint_t +to_ascii (wint_t c) +{ + return c & 0x7f; +} + +static int +sgn (int i) +{ + return (i > 0) - (i < 0); +} + +static void +test_mcel_vs_mbrtoc32 (unsigned char uc, mcel_t c, size_t n, char32_t ch) +{ + ASSERT (!c.err == (n <= MB_LEN_MAX)); + ASSERT (c.err + ? c.err == uc && c.ch == 0 && c.len == 1 + : c.ch == ch && c.len == (n ? n : 1)); +} + +int +main (void) +{ + /* configure should already have checked that the locale is supported. 
*/ + if (setlocale (LC_ALL, "") == NULL) + return 1; + + mcel_t prev; + for (int ch = 0; ch < 0x80; ch++) + { + mcel_t c = mcel_ch (ch, 1); + ASSERT (c.ch == ch); + ASSERT (c.len == 1); + ASSERT (!c.err); + ASSERT (mcel_cmp (c, c) == 0); + ASSERT (mcel_tocmp (to_ascii, c, c) == 0); + if (ch) + { + ASSERT (mcel_cmp (prev, c) < 0); + ASSERT (mcel_cmp (c, prev) > 0); + ASSERT (mcel_tocmp (to_ascii, prev, c) < 0); + ASSERT (mcel_tocmp (to_ascii, c, prev) > 0); + } + ASSERT (mcel_isbasic (ch)); + prev = c; + } + for (char ch = CHAR_MIN; ; ch++) + { + ASSERT (mcel_isbasic (ch) == (0 <= ch && ch <= 0x7f)); + if (ch == CHAR_MAX) + break; + } + for (int ch = 0x80; ch < 0x200; ch++) + { + mcel_t c = mcel_ch (ch, 2); + ASSERT (c.ch == ch); + ASSERT (c.len == 2); + ASSERT (!c.err); + ASSERT (mcel_cmp (c, c) == 0); + ASSERT (mcel_tocmp (to_ascii, c, c) == 0); + ASSERT (mcel_cmp (prev, c) < 0); + ASSERT (mcel_cmp (c, prev) > 0); + ASSERT (mcel_tocmp (to_ascii, c, c) == 0); + int cmp = to_ascii (c.ch) ? 
-1 : 1; + ASSERT (sgn (mcel_tocmp (to_ascii, prev, c)) == cmp); + ASSERT (sgn (mcel_tocmp (to_ascii, c, prev)) == -cmp); + prev = c; + } + for (unsigned char err = 0x80; ; err++) + { + mcel_t c = mcel_err (err); + ASSERT (!c.ch); + ASSERT (c.len == 1); + ASSERT (c.err == err); + ASSERT (mcel_cmp (c, c) == 0); + ASSERT (mcel_cmp (prev, c) < 0); + ASSERT (mcel_cmp (c, prev) > 0); + ASSERT (mcel_tocmp (to_ascii, c, c) == 0); + ASSERT (mcel_tocmp (to_ascii, prev, c) < 0); + ASSERT (mcel_tocmp (to_ascii, c, prev) > 0); + prev = c; + if (err == (unsigned char) -1) + break; + } + + for (int i = CHAR_MIN; i <= CHAR_MAX; i++) + for (int j = CHAR_MIN; j <= CHAR_MAX; j++) + for (int k = CHAR_MIN; k <= CHAR_MAX; k++) + { + char const ijk[] = {i, j, k}; + mbstate_t mbs = {0}; + char32_t ch; + size_t n = mbrtoc32 (&ch, ijk, sizeof ijk, &mbs); + mcel_t c = mcel_scan (ijk, ijk + sizeof ijk); + test_mcel_vs_mbrtoc32 (i, c, n, ch); + + static char const terminator[] = "\r\n./"; + for (int ti = 0; ti < sizeof terminator; ti++) + { + char t = terminator[ti]; + if (i == t) + continue; + char const ijkt[] = {i, j, k, t}; + mcel_t d = mcel_scant (ijkt, t); + ASSERT (c.ch == d.ch && c.err == d.err && c.len == d.len); + if (!t) + { + mcel_t z = mcel_scanz (ijkt); + ASSERT (d.ch == z.ch && d.err == z.err && d.len == z.len); + } + } + } +} diff --git a/lib/Makefile.am b/lib/Makefile.am index a5a880d..1d371dc 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -29,7 +29,7 @@ noinst_HEADERS = include gnulib.mk -noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h -libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c +noinst_HEADERS += cmpbuf.h diagnose.h +libdiffutils_a_SOURCES += cmpbuf.c diagnose.c AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS) diff --git a/lib/mcel-casecmp.c b/lib/mcel-casecmp.c deleted file mode 100644 index 24c7042..0000000 --- a/lib/mcel-casecmp.c +++ /dev/null @@ -1,60 +0,0 @@ -/* Case-insensitive string comparison function.
- Copyright 2023 Free Software Foundation, Inc. - - This file is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation, either version 3 of the - License, or (at your option) any later version. - - This file is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see . */ - -/* Written by Paul Eggert. */ - -#include - -/* Specification. */ -#include - -#include -#include - -int -mcel_casecmp (char const *s1, char const *s2) -{ - char const *p1 = s1; - char const *p2 = s2; - - /* Do not look at the entire extent of S1 or S2 until needed: - when two strings differ, the difference is typically early. */ - if (MB_CUR_MAX == 1) - while (true) - { - static_assert (UCHAR_MAX <= INT_MAX); - unsigned char c1 = *p1++; - unsigned char c2 = *p2++; - int cmp = c1 - c2; - if (_GL_UNLIKELY (cmp)) - { - c1 = tolower (c1); - c2 = tolower (c2); - cmp = c1 - c2; - } - if (_GL_UNLIKELY (cmp | !c1)) - return cmp; - } - else - while (true) - { - mcel_t g1 = mcel_scanz (p1); p1 += g1.len; - mcel_t g2 = mcel_scanz (p2); p2 += g2.len; - int cmp = ucore_tocmp (c32tolower, g1.c, g2.c); - if (_GL_UNLIKELY (cmp | !g1.c)) - return cmp; - } -} diff --git a/lib/ucore.c b/lib/ucore.c deleted file mode 100644 index 5831b1b..0000000 --- a/lib/ucore.c +++ /dev/null @@ -1,3 +0,0 @@ -#include -#define UCORE_INLINE _GL_EXTERN_INLINE -#include "ucore.h" diff --git a/lib/ucore.h b/lib/ucore.h deleted file mode 100644 index 7e13db7..0000000 --- a/lib/ucore.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Unicode Characters OR Encoding errors (UCOREs) - Copyright 2023 Free Software Foundation, Inc. 
- - This file is free software: you can redistribute it and/or modify - it under the terms of the GNU Lesser General Public License as - published by the Free Software Foundation; either version 3 of the - License, or (at your option) any later version. - - This file is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public License - along with this program. If not, see . */ - -/* Written by Paul Eggert. */ - -/* This API's fundamental type ucore_t represents - a single Unicode character or an encoding error. - ucore_iserr (C) tests whether C is an encoding error. - ucore_is (P, C) etc. test whether char class P accepts C. - ucore_to (TO, C) etc. use TO to convert C. - ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2, - with encoding errors sorting after characters. */ - -#ifndef _UCORE_H -#define _UCORE_H 1 - -#if !_GL_CONFIG_H_INCLUDED - #error "Please include config.h first." -#endif - -#include - -#include -#include -#include - -/* ucore_t represents a Unicode Character OR Encoding error. - If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character. - If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error. - Other ucore_t values C are invalid. */ -typedef int ucore_t; - -enum { - UCORE_CHAR_MAX = 0x10FFFF, - UCORE_ERR_MIN = 0x200000, - UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1 -}; - -/* Information is not lost by encoding errors as integers. */ -static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN); - -/* On glibc platforms, predicates like c32isalnum and c32tolower - do the right thing for char32_t values that are not valid characters. - POSIX says the behavior is undefined, so play it safe elsewhere. - Do not rely on UCORE_C32_SAFE for c32width. 
*/ -#ifdef __GLIBC__ -enum { UCORE_C32_SAFE = true }; -#else -enum { UCORE_C32_SAFE = false }; -#endif - -#ifndef _GL_LIKELY -/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */ -# define _GL_LIKELY(cond) __builtin_expect ((cond), 1) -# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0) -#endif - -_GL_INLINE_HEADER_BEGIN -#ifndef UCORE_INLINE -# define UCORE_INLINE _GL_INLINE -#endif - -/* Return true if C represents an encoding error, false otherwise. */ -UCORE_INLINE bool -ucore_iserr (ucore_t c) -{ - /* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c) - with GCC 13 x86-64. */ - if (_GL_UNLIKELY (c & UCORE_ERR_MIN)) - { - assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX); - return true; - } - else - { - assume (0 <= c && c <= UCORE_CHAR_MAX); - return false; - } -} - -/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C). */ -UCORE_INLINE bool -ucore_is (int (*p) (wint_t), wint_t c) -{ - /* When C is out of range, predicates based on glibc return false. - Behavior is undefined on other platforms, so play it safe. */ - return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c); -} - -/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C). */ -UCORE_INLINE wint_t -ucore_to (wint_t (*to) (wint_t), ucore_t c) -{ - return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c; -} - -/* Compare C1 and C2, with encoding errors sorting after characters. - Return <0, 0, >0 for <, =, >. */ -UCORE_INLINE int -ucore_cmp (ucore_t c1, ucore_t c2) -{ - return c1 - c2; -} - -/* Apply the uchar translater TO to C1 and C2 and compare the results, - with encoding errors sorting after characters, - Return <0, 0, >0 for <, =, >. 
*/ -UCORE_INLINE int -ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2) -{ - if (c1 == c2) - return 0; - int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2); - return i1 - i2; -} - -_GL_INLINE_HEADER_END - -#endif /* _MCEL_H */ diff --git a/src/io.c b/src/io.c index 8442adb..db61fff 100644 --- a/src/io.c +++ b/src/io.c @@ -230,6 +230,14 @@ slurp (struct file_data *current) } } +/* Return true if CH1 and ERR1 stand for the same character or + encoding error as CH2 and ERR2. */ +static bool +same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2) +{ + return ! ((ch1 ^ ch2) | (err1 ^ err2)); +} + /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically one line from each input file) according to the command line options. Line lengths include the trailing newline. @@ -427,7 +435,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) { char const *lim1 = s1 + s1len; char const *lim2 = s2 + s2len; - ucore_t c1prev = 0; + char32_t ch1prev = 0; while (true) { @@ -435,27 +443,27 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) mcel_t g2 = mcel_scan (t2, lim2); t1 += g1.len; t2 += g2.len; - ucore_t c1 = g1.c; - ucore_t c2 = g2.c; + char32_t ch1 = g1.ch; + char32_t ch2 = g2.ch; /* Test for exact equality first, since it's a common case. */ - if (ucore_cmp (c1, c2) != 0) + if (! same_ch_err (ch1, g1.err, ch2, g2.err)) { switch (ignore_white_space) { case IGNORE_ALL_SPACE: /* For -w, just skip past any white space. */ - while (c1 != '\n' && ! 
ucore_is (c32isspace, c1)) + while (ch1 != '\n' && c32isspace (ch1)) { g1 = mcel_scan (t1, lim1); t1 += g1.len; - c1 = g1.c; + ch1 = g1.ch; } - while (c2 != '\n' && ucore_is (c32isspace, c2)) + while (ch2 != '\n' && c32isspace (ch2)) { g2 = mcel_scan (t2, lim2); t2 += g2.len; - c2 = g2.c; + ch2 = g2.ch; } break; @@ -463,48 +471,46 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) /* For -b, advance past any sequence of white space in line 1 and consider it just one space, or nothing at all if it is at the end of the line. */ - if (ucore_is (c32isspace, c1)) - while (c1 != '\n') + if (c32isspace (ch1)) + while (ch1 != '\n') { g1 = mcel_scan (t1, lim1); t1 += g1.len; - c1 = g1.c; - if (! ucore_is (c32isspace, c1)) + ch1 = g1.ch; + if (! c32isspace (ch1)) { t1 -= g1.len; - c1 = ' '; + ch1 = ' '; break; } } /* Likewise for line 2. */ - if (ucore_is (c32isspace, c2)) - while (c2 != '\n') + if (c32isspace (ch2)) + while (ch2 != '\n') { g2 = mcel_scan (t2, lim2); t2 += g2.len; - c2 = g2.c; - if (! ucore_is (c32isspace, c2)) + ch2 = g2.ch; + if (! c32isspace (ch2)) { t2 -= g2.len; - c2 = ' '; + ch2 = ' '; break; } } - if (c1 != c2) + if (ch1 != ch2) { /* If we went too far when doing the simple test for equality, go back to the first non-white-space character in both sides and try again. 
*/ - if (c2 == ' ' && c1 != '\n' - && ucore_is (c32isspace, c1prev)) + if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev)) { t1 -= g1.len; continue; } - if (c1 == ' ' && c2 != '\n' - && ucore_is (c32isspace, c1prev)) + if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev)) { t2 -= g2.len; continue; @@ -515,28 +521,28 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) case IGNORE_TRAILING_SPACE: case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE: - if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2)) + if (c32isspace (ch1) && c32isspace (ch2)) { - if (c1 != '\n') + if (ch1 != '\n') { char const *p = t1; while (*p != '\n') { mcel_t g = mcel_scan (p, lim1); - if (! ucore_is (c32isspace, g.c)) + if (! c32isspace (g.ch)) break; p += g.len; } if (*p != '\n') break; } - if (c2 != '\n') + if (ch2 != '\n') { char const *p = t2; while (*p != '\n') { mcel_t g = mcel_scan (p, lim2); - if (! ucore_is (c32isspace, g.c)) + if (! c32isspace (g.ch)) break; p += g.len; } @@ -550,45 +556,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) break; FALLTHROUGH; case IGNORE_TAB_EXPANSION: - if ((c1 == ' ' && c2 == '\t') - || (c1 == '\t' && c2 == ' ')) + if ((ch1 == ' ' && ch2 == '\t') + || (ch1 == '\t' && ch2 == ' ')) { intmax_t tab2 = tab, column2 = column; while (true) { - if (c1 == '\t' - || (c1 == ' ' && column == tabsize - 1)) + if (ch1 == '\t' + || (ch1 == ' ' && column == tabsize - 1)) { tab++; column = 0; } - else if (c1 == ' ') + else if (ch1 == ' ') column++; else break; g1 = mcel_scan (t1, lim1); t1 += g1.len; - c1 = g1.c; + ch1 = g1.ch; } while (true) { - if (c2 == '\t' - || (c2 == ' ' && column2 == tabsize - 1)) + if (ch2 == '\t' + || (ch2 == ' ' && column2 == tabsize - 1)) { tab2++; column2 = 0; } - else if (c2 == ' ') + else if (ch2 == ' ') column2++; else break; g2 = mcel_scan (t2, lim2); t2 += g2.len; - c2 = g2.c; + ch2 = g2.ch; } if (tab != tab2 || column != column2) @@ -602,15 +608,15 @@ lines_differ (char const *s1,
idx_t s1len, char const *s2, idx_t s2len) if (ignore_case) { - c1 = ucore_to (c32tolower, c1); - c2 = ucore_to (c32tolower, c2); + ch1 = c32tolower (ch1); + ch2 = c32tolower (ch2); } - if (ucore_cmp (c1, c2) != 0) + if (! same_ch_err (ch1, g1.err, ch2, g2.err)) break; } - switch (c1) + switch (ch1) { case '\n': return false; @@ -634,7 +640,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) default: /* Assume that downcasing does not change print width. */ - column += ucore_iserr (c1) ? 1 : c32width (c1); + column += g1.err ? 1 : c32width (ch1); if (column < tabsize) break; FALLTHROUGH; @@ -644,7 +650,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len) break; } - c1prev = c1; + ch1prev = ch1; } } @@ -698,8 +704,8 @@ find_and_hash_each_line (struct file_data *current) for (mcel_t g; *p != '\n'; p += g.len) { g = mcel_scan (p, suffix_begin); - if (! ucore_is (c32isspace, g.c)) - h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c)); + if (! c32isspace (g.ch)) + h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err); } break; @@ -727,7 +733,7 @@ find_and_hash_each_line (struct file_data *current) for (mcel_t g; *p != '\n'; p += g.len) { g = mcel_scan (p, suffix_begin); - if (ucore_is (c32isspace, g.c)) + if (c32isspace (g.ch)) { do { @@ -736,13 +742,13 @@ find_and_hash_each_line (struct file_data *current) goto hashing_done; g = mcel_scan (p, suffix_begin); } - while (ucore_is (c32isspace, g.c)); + while (c32isspace (g.ch)); h = hash (h, ' '); } /* G is now the first non-space. */ - h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c); + h = hash (h, (ig_case ? 
c32tolower (g.ch) : g.ch) - g.err); } break; @@ -818,13 +824,17 @@ find_and_hash_each_line (struct file_data *current) intmax_t repetitions = 1; g = mcel_scan (p, suffix_begin); - ucore_t c = g.c; - if (ucore_iserr (c)) - column++; + char32_t ch; + if (g.err) + { + ch = -g.err; + column++; + } else { + ch = g.ch; if (ig_white_space & IGNORE_TRAILING_SPACE - && ucore_is (c32isspace, c)) + && c32isspace (ch)) { char const *p1 = p + g.len; for (mcel_t g1; ; p1 += g1.len) @@ -835,13 +845,13 @@ find_and_hash_each_line (struct file_data *current) goto hashing_done; } g1 = mcel_scan (p1, suffix_begin); - if (! ucore_is (c32isspace, g1.c)) + if (! c32isspace (g1.ch)) break; } } if (ig_white_space & IGNORE_TAB_EXPANSION) - switch (c) + switch (ch) { case '\b': if (0 < column) @@ -854,7 +864,7 @@ find_and_hash_each_line (struct file_data *current) break; case '\t': - c = ' '; + ch = ' '; repetitions = tabsize - column % tabsize; tab += column / tabsize + 1; column = 0; @@ -868,16 +878,16 @@ find_and_hash_each_line (struct file_data *current) break; default: - column += c32width (c); + column += c32width (ch); break; } if (ig_case) - c = c32tolower (c); + ch = c32tolower (ch); } do - h = hash (h, c); + h = hash (h, ch); while (--repetitions != 0); } } @@ -899,13 +909,13 @@ find_and_hash_each_line (struct file_data *current) for (mcel_t g; *p != '\n'; p += g.len) { g = mcel_scan (p, suffix_begin); - h = hash (h, ucore_to (c32tolower, g.c)); + h = hash (h, c32tolower (g.ch) - g.err); } else for (mcel_t g; *p != '\n'; p += g.len) { g = mcel_scan (p, suffix_begin); - h = hash (h, g.c); + h = hash (h, g.ch - g.err); } } break; diff --git a/src/side.c b/src/side.c index 0d1b197..7d8c4bd 100644 --- a/src/side.c +++ b/src/side.c @@ -146,7 +146,7 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound) Increase TEXT_POINTER, counting columns. Assume encoding errors have print width 1. */ mcel_t g = mcel_scan (tp0, text_limit); - int width = ucore_iserr (g.c) ? 
1 : c32width (g.c); + int width = g.err ? 1 : c32width (g.ch); if (0 < width && ckd_add (&in_position, in_position, width)) return out_position;