diff: modularize and tune mcel code

Go back to a single mcel module, instead of trying to break it up into ucore and mcel pieces, as breaking it up hurt performance. Use gnulib-tool’s --local-dir to create diffutils-specific modules for mcel; the idea is that this will eventually migrate into Gnulib. * bootstrap.conf (avoided_gnulib_modules): Add mbuiterf. (gnulib_modules): Add mbscasecmp, mcel-prefer. (gnulib_tool_option_extras): Add --local-dir=gl to pick up new files. * cfg.mk (exclude_file_name_regexp--sc_prohibit_doubled_word): Do not exclude now-removed files lib/ucore.c, lib/ucore.h. * lib/Makefile.am: Adjust to use of modules. (noinst_HEADERS): Remove mcel.h, ucore.h. (libdiffutils_a_SOURCES): Remove mcel.c, mcel-casecmp.c, ucore.c. * lib/mcel-casecmp.c, lib/ucore.c, lib/ucore.h: Remove. * lib/mcel.h: Switch to LGPLv2.1+. Do not include ucore.h. All uses of ucore_t changed back to using char32_t. Do what ucore.h used to do: include verify.h, limits.h, stddef.h, uchar.h; require config.h, define _GL_LIKELY, _GL_UNLIKELY. (MCEL_CHAR_MAX, MCEL_ERR_MIN, MCEL_ERR_MAX): New constants. (mcel_t): Switch from single ucore_t c to a char32_t ch and unsigned char err. This has significantly better performance on Fedora 38 x86-64. All uses changed. Check that unsigned char promotes to int. (mcel_ch, mcel_err, mcel_cmp, mcel_tocmp): New functions. (MCEL_ERR_SHIFT): Rename from MCEL_ENCODING_ERROR_SHIFT. All uses changed. (mcel_isbasic): Add a _GL_LIKELY to help compilers. All uses changed. (mcel_scan, mcel_scant): Simplify by using mcel_ch, mcel_err. (mcel_casecmp): Remove decl. Callers changed to use mbscasecmp. * gl/lib/mcel.c, gl/lib/mcel.h: Rename from lib/mcel.c, lib/mcel.h. * gl/lib/mbscasecmp.c: New file. * gl/modules/mcel, gl/modules/mcel-prefer, gl/modules/mcel-tests: * gl/tests/test-mcel.c: New files. * src/io.c: Revert use of ucore API. Use plain c32isspace etc. instead of ucore_is. Use .err instead of ucore_iserr. (same_ch_err): Bring back, and use it instead of ucore_cmp. 
* src/side.c (print_half_line): Use .err instead of ucore_iserr.
2026-01-27 01:44:20 +00:00 · 2023-08-21 08:38:16 -07:00 · 2023-08-21 08:38:16 -07:00 · ae1cdc7239
commit ae1cdc7239
parent 574e81bff2
15 changed files with 517 additions and 322 deletions
--- a/bootstrap.conf
+++ b/bootstrap.conf
@ -18,6 +18,7 @@
 avoided_gnulib_modules='
  --avoid=localename
  --avoid=lock-tests
+  --avoid=mbuiterf
  --avoid=setlocale
 '

@ -73,7 +74,8 @@ largefile
 lstat
 maintainer-makefile
 manywarnings
-mbrtoc32
+mbscasecmp
+mcel-prefer
 mempcpy
 minmax
 mkstemp
@ -140,6 +142,7 @@ XGETTEXT_OPTIONS=$XGETTEXT_OPTIONS'\\\
 '

 gnulib_tool_option_extras="--tests-base=gnulib-tests
+ --local-dir=gl
 --with-tests
 --symlink
 --makefile-name=gnulib.mk
--- a/cfg.mk
+++ b/cfg.mk
@ -74,8 +74,7 @@ config-save:
 	cp lib/config.h config.status $(_cf_state_dir)/latest

 exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$
-exclude_file_name_regexp--sc_prohibit_doubled_word = \
-  ^(tests/y2038-vs-32bit|lib/ucore\.h)$$
+exclude_file_name_regexp--sc_prohibit_doubled_word = ^tests/y2038-vs-32bit$$

 # Tell gnulib's tight_scope rule that we mark externs with XTERN
 export _gl_TS_extern = extern|XTERN|DIFF_INLINE|SYSTEM_INLINE|SYSTEM_EXTERN
--- a/gl/lib/mbscasecmp.c
+++ b/gl/lib/mbscasecmp.c
@ -0,0 +1,112 @@
+/* Case-insensitive string comparison function.
+   Copyright (C) 1998-1999, 2005-2023 Free Software Foundation, Inc.
+   Written by Bruno Haible <bruno@clisp.org>, 2005,
+   based on earlier glibc code.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation, either version 3 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+/* Specification.  */
+#include <string.h>
+
+#include <ctype.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#if GNULIB_MCEL_PREFER
+# include "mcel.h"
+#else
+# include "mbuiterf.h"
+#endif
+
+/* Compare the character strings S1 and S2, ignoring case, returning less than,
+   equal to or greater than zero if S1 is lexicographically less than, equal to
+   or greater than S2.
+   Note: This function may, in multibyte locales, return 0 for strings of
+   different lengths!  */
+int
+mbscasecmp (const char *s1, const char *s2)
+{
+  if (s1 == s2)
+    return 0;
+
+  char const *p1 = s1;
+  char const *p2 = s2;
+
+  /* Be careful not to look at the entire extent of s1 or s2 until needed.
+     This is useful because when two strings differ, the difference is
+     most often already in the very few first characters.  */
+  if (MB_CUR_MAX > 1)
+    {
+#if GNULIB_MCEL_PREFER
+      while (true)
+        {
+          mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
+          mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
+          int cmp = mcel_tocmp (c32tolower, g1, g2);
+          /* End only at a difference or NUL; an error byte has ch == 0 too.  */
+          if (cmp | !(g1.ch | g1.err))
+            return cmp;
+        }
+#else
+      mbuif_state_t state1;
+      const char *iter1;
+      mbuif_init (state1);
+      iter1 = s1;
+
+      mbuif_state_t state2;
+      const char *iter2;
+      mbuif_init (state2);
+      iter2 = s2;
+
+      while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
+        {
+          mbchar_t cur1 = mbuif_next (state1, iter1);
+          mbchar_t cur2 = mbuif_next (state2, iter2);
+          int cmp = mb_casecmp (cur1, cur2);
+
+          if (cmp != 0)
+            return cmp;
+
+          iter1 += mb_len (cur1);
+          iter2 += mb_len (cur2);
+        }
+      if (mbuif_avail (state1, iter1))
+        /* s2 terminated before s1.  */
+        return 1;
+      if (mbuif_avail (state2, iter2))
+        /* s1 terminated before s2.  */
+        return -1;
+      return 0;
+#endif
+    }
+  else
+    while (true)
+      {
+        unsigned char c1 = *p1++;
+        unsigned char c2 = *p2++;
+        /* On machines where 'char' and 'int' are types of the same size, the
+           difference of two 'unsigned char' values - including the sign bit -
+           doesn't fit in an 'int'.  */
+        int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+        if (cmp)
+          {
+            /* The raw bytes differ; they may still match once case-folded.  */
+            c1 = tolower (c1);
+            c2 = tolower (c2);
+            cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
+          }
+        if (cmp | !c1)
+          return cmp;
+      }
+}
--- a/gl/lib/mcel.c
+++ b/gl/lib/mcel.c
--- a/gl/lib/mcel.h
+++ b/gl/lib/mcel.h
@ -3,7 +3,7 @@

   This file is free software: you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 3 of the
+   published by the Free Software Foundation; either version 2.1 of the
   License, or (at your option) any later version.

   This file is distributed in the hope that it will be useful,
@ -16,7 +16,14 @@

 /* Written by Paul Eggert.  */

-/* The mcel_scan function lets code iterate through an array of bytes,
+/* The macros in this file implement multi-byte character representation
+   and forward iteration through a multi-byte string.
+   They are simpler and typically faster than the mbiter family.
+   However, they do not support obsolescent encodings like CP864,
+   EBCDIC, Johab, and Shift JIS that glibc also does not support,
+   and it is up to the caller to coalesce encoding-error bytes if desired.
+
+   The mcel_scan function lets code iterate through an array of bytes,
   supporting character encodings in practical use
   more simply than using plain mbrtoc32.

@ -35,8 +42,11 @@
 	  process (g);
 	}

+   You can select from G using G.ch, G.err, and G.len.
+   G is an encoding error if G.err is nonzero, a character otherwise.
+
   The mcel_scanz function is similar except it works with a
-   string of unknown length that is terminated with '\0'.
+   string of unknown but positive length that is terminated with '\0'.
   Instead of this single-byte code:

      char *p = ...;
@ -57,12 +67,16 @@
   '\n', '.', '/' are safe, as they cannot be a part (even a trailing
   byte) of a multi-byte character.

-   You can select from G using G.c and G.len.
-   You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c),
-   ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c).
+   mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.

-   mcel_strcasecmp compares two null-terminated multi-byte strings
-   lexicographically, ignoring case.
+   mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
+   character or by encoding byte value, with encoding bytes sorting
+   after characters.
+
+   Calls like c32isalpha (G.ch) test G; they return false for encoding
+   errors since calls like c32isalpha (0) return false.  Calls like
+   mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
+   but transliterate first.

   Although ISO C and POSIX allow encodings that have shift states or
   that can produce multiple characters from an indivisible byte sequence,
@ -73,9 +87,20 @@
 #ifndef _MCEL_H
 #define _MCEL_H 1

-/* This API is an extension of ucore.h.  Programs that include this
-   file can assume ucore.h is included too.  */
-#include <ucore.h>
+#if !_GL_CONFIG_H_INCLUDED
+ #error "Please include config.h first."
+#endif
+
+#include <verify.h>
+
+#include <limits.h>
+#include <stddef.h>
+#include <uchar.h>
+
+/* Pacify GCC re type limits.  */
+#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
+# pragma GCC diagnostic ignored "-Wtype-limits"
+#endif

 /* The maximum multi-byte character length supported on any platform.
   This can be less than MB_LEN_MAX because many platforms have a
@ -85,24 +110,41 @@
   0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
 enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };

-/* mcel_t is a type representing a character or encoding error C,
-   along with a count of the LEN bytes that represent C.
-   1 <= LEN <= MB_LEN_MAX.  */
+/* Bounds for mcel_t members.  */
+enum { MCEL_CHAR_MAX = 0x10FFFF };
+enum { MCEL_ERR_MIN = 0x80 };
+enum { MCEL_ERR_MAX = UCHAR_MAX };
+
+/* mcel_t is a type representing a character CH or an encoding error byte ERR,
+   along with a count of the LEN bytes that represent CH or ERR.
+   If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
+   otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR <= MCEL_ERR_MAX,
+   CH == 0, and LEN == 1.  */
 typedef struct
 {
-  ucore_t c;
+  char32_t ch;
+  unsigned char err;
  unsigned char len;
 } mcel_t;

 /* Every multi-byte character length fits in mcel_t's LEN.  */
 static_assert (MB_LEN_MAX <= UCHAR_MAX);

+/* Shifting an encoding error byte left by this value
+   suffices to sort encoding errors after characters.  */
+enum { MCEL_ERR_SHIFT = 14 };
+static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
+
+/* Unsigned char promotes to int.  */
+static_assert (UCHAR_MAX <= INT_MAX);
+
 /* Bytes have 8 bits, as POSIX requires.  */
 static_assert (CHAR_BIT == 8);

-/* Pacify GCC re 'c <= 0x7f' below.  */
-#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
-# pragma GCC diagnostic ignored "-Wtype-limits"
+#ifndef _GL_LIKELY
+/* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
+# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
+# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
 #endif

 _GL_INLINE_HEADER_BEGIN
@ -110,18 +152,44 @@ _GL_INLINE_HEADER_BEGIN
 # define MCEL_INLINE _GL_INLINE
 #endif

-/* With mcel there should be no need for the performance overhead of
-   replacing glibc mbrtoc32, as callers shouldn't care whether the
-   C locale treats a byte with the high bit set as an encoding error.  */
-#ifdef __GLIBC__
-# undef mbrtoc32
-#endif
+/* mcel_t constructors.  */
+
+/* Return the mcel_t for the valid character CH, which occupies LEN bytes.
+   The caller guarantees 0 < LEN <= MCEL_LEN_MAX and CH <= MCEL_CHAR_MAX.  */
+MCEL_INLINE mcel_t
+mcel_ch (char32_t ch, size_t len)
+{
+  assume (0 < len);
+  assume (len <= MCEL_LEN_MAX);
+  assume (ch <= MCEL_CHAR_MAX);
+  /* The ERR member is not named, so it is zero: this is not an error.  */
+  return (mcel_t) {.ch = ch, .len = len};
+}
+/* Return the mcel_t for the encoding-error byte ERR.
+   The caller guarantees MCEL_ERR_MIN <= ERR <= MCEL_ERR_MAX.  */
+MCEL_INLINE mcel_t
+mcel_err (unsigned char err)
+{
+  assume (MCEL_ERR_MIN <= err);
+  assume (err <= MCEL_ERR_MAX);
+  /* The CH member is not named, so it is zero; an error is one byte long.  */
+  return (mcel_t) {.err = err, .len = 1};
+}

-/* Shifting an encoding error byte (at least 0x80) left by this value
-   yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
-   This suffices to sort encoding errors after characters.  */
-enum { MCEL_ENCODING_ERROR_SHIFT = 14 };
-static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
+/* Order C1 relative to C2, with every encoding error sorting after
+   every character.  Return <0, 0, >0 for <, =, >.  */
+MCEL_INLINE int
+mcel_cmp (mcel_t c1, mcel_t c2)
+{
+  /* When exactly one operand is an error, the ERR difference scaled by
+     2**MCEL_ERR_SHIFT exceeds MCEL_CHAR_MAX, so it decides the sign.  */
+  int err_delta = c1.err - c2.err;
+  int ch_delta = (int) c1.ch - (int) c2.ch;
+  return err_delta * (1 << MCEL_ERR_SHIFT) + ch_delta;
+}
+
+/* Compare C1 and C2 after mapping each through the uchar translator TO,
+   with encoding errors sorting after characters.
+   Return <0, 0, >0 for <, =, >.  */
+MCEL_INLINE int
+mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
+{
+  int untranslated = mcel_cmp (c1, c2);
+
+  /* TO is not consulted when the operands already compare equal
+     or when their ERR members differ.  */
+  if (_GL_LIKELY (c1.err != c2.err || untranslated == 0))
+    return untranslated;
+
+  int lhs = to (c1.ch);
+  int rhs = to (c2.ch);
+  return lhs - rhs;
+}

 /* Whether C represents itself as a Unicode character
   when it is the first byte of a single- or multi-byte character.
@ -130,9 +198,16 @@ static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
 MCEL_INLINE bool
 mcel_isbasic (char c)
 {
-  return 0 <= c && c <= 0x7f;
+  return _GL_LIKELY (0 <= c && c <= 0x7f);
 }

+/* With mcel there should be no need for the performance overhead of
+   replacing glibc mbrtoc32, as callers shouldn't care whether the
+   C locale treats a byte with the high bit set as an encoding error.  */
+#ifdef __GLIBC__
+# undef mbrtoc32
+#endif
+
 /* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
   Return the character or encoding error starting at P.  */
 MCEL_INLINE mcel_t
@ -141,8 +216,8 @@ mcel_scan (char const *p, char const *lim)
  /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
     In supported encodings, the first byte of a multi-byte character
     cannot be an ASCII byte.  */
-  if (_GL_LIKELY (mcel_isbasic (*p)))
-    return (mcel_t) { .c = *p, .len = 1 };
+  if (mcel_isbasic (*p))
+    return mcel_ch (*p, 1);

  /* An initial mbstate_t; initialization optimized for some platforms.
     For details about these and other platforms, see wchar.in.h.  */
@ -171,29 +246,17 @@ mcel_scan (char const *p, char const *lim)
  mbstate_t mbs = {0};
 #endif

-  char32_t c;
-  size_t len = mbrtoc32 (&c, p, lim - p, &mbs);
+  char32_t ch;
+  size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);

  /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
     is not supported and MB_LEN_MAX is small.  */
-  if (_GL_LIKELY (len <= (size_t) -1 / 2))
-    {
-      /* A multi-byte character.  LEN must be positive,
-	 as *P != '\0' and shift sequences are not supported.  */
-      assume (0 < len);
-      assume (len <= MB_LEN_MAX);
-      assume (c <= UCORE_CHAR_MAX);
-      return (mcel_t) { .c = c, .len = len };
-    }
-  else
-    {
-      /* An encoding error.  */
-      unsigned char b = *p;
-      c = b << MCEL_ENCODING_ERROR_SHIFT;
-      assume (UCORE_ERR_MIN <= c);
-      assume (c <= UCORE_ERR_MAX);
-      return (mcel_t) { .c = c, .len = 1 };
-    }
+  if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
+    return mcel_err (*p);
+
+  /* A multi-byte character.  LEN must be positive,
+     as *P != '\0' and shift sequences are not supported.  */
+  return mcel_ch (ch, len);
 }

 /* Scan bytes from P, a byte sequence terminated by TERMINATOR.
@ -205,11 +268,11 @@ MCEL_INLINE mcel_t
 mcel_scant (char const *p, char terminator)
 {
  /* Handle ASCII quickly for speed.  */
-  if (_GL_LIKELY (mcel_isbasic (*p)))
-    return (mcel_t) { .c = *p, .len = 1 };
+  if (mcel_isbasic (*p))
+    return mcel_ch (*p, 1);

  /* Defer to mcel_scan for non-ASCII.  Compute length with code that
-     is typically branch-free and faster than memchr or strnlen.  */
+     is typically faster than strnlen.  */
  char const *lim = p + 1;
  for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
    lim += *lim != terminator;
@ -226,11 +289,6 @@ mcel_scanz (char const *p)
  return mcel_scant (p, '\0');
 }

-/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
-   Return <0, 0, >0 for <, =, >.  Consider encoding errors to be
-   greater than characters and compare them byte by byte.  */
-int mcel_casecmp (char const *s1, char const *s2);
-
 _GL_INLINE_HEADER_END

 #endif /* _MCEL_H */
--- a/gl/modules/mcel
+++ b/gl/modules/mcel
@ -0,0 +1,34 @@
+Description:
+Multibyte Characters, Encoding errors, and Lengths
+
+Files:
+lib/mcel.c
+lib/mcel.h
+
+Depends-on:
+assert-h
+extern-inline
+limits-h
+mbrtoc32
+stdbool
+uchar
+verify
+
+configure.ac:
+
+Makefile.am:
+lib_SOURCES += mcel.c mcel.h
+
+Include:
+"mcel.h"
+
+Link:
+$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise
+$(MBRTOWC_LIB)
+$(LTLIBC32CONV) when linking with libtool, $(LIBC32CONV) otherwise
+
+License:
+LGPLv2+
+
+Maintainer:
+all
--- a/gl/modules/mcel-prefer
+++ b/gl/modules/mcel-prefer
@ -0,0 +1,24 @@
+Description:
+mcel is preferred to the mbiter family when either will do.
+mcel is simpler and faster.  However, it does not support some
+obsolete encodings that are also not supported by glibc locales,
+and the caller is responsible for coalescing sequences of
+error-encoding bytes if that is desired.
+
+Files:
+
+Depends-on:
+mcel
+
+configure.ac:
+gl_MODULE_INDICATOR([mcel-prefer])
+
+Makefile.am:
+
+Include:
+
+License:
+LGPLv2+
+
+Maintainer:
+Paul Eggert
--- a/gl/modules/mcel-tests
+++ b/gl/modules/mcel-tests
@ -0,0 +1,12 @@
+Files:
+tests/test-mcel.c
+
+Depends-on:
+assert-h
+setlocale
+
+configure.ac:
+
+Makefile.am:
+TESTS += test-mcel
+check_PROGRAMS += test-mcel
--- a/gl/tests/test-mcel.c
+++ b/gl/tests/test-mcel.c
@ -0,0 +1,138 @@
+/* Test <mcel.h>
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This program is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+#include <config.h>
+
+#include <mcel.h>
+
+#include <locale.h>
+
+#include "macros.h"
+
+/* A toy translator for mcel_tocmp: keep only the low-order 7 bits of C.  */
+static wint_t
+to_ascii (wint_t c)
+{
+  wint_t const low7 = 0x7f;
+  return c & low7;
+}
+
+/* Return the sign of I: -1, 0, or 1.  */
+static int
+sgn (int i)
+{
+  if (i < 0)
+    return -1;
+  return 0 < i;
+}
+
+/* Check that C, the result of scanning a byte sequence whose first byte
+   is UC, agrees with what mbrtoc32 reported for the same bytes: its
+   return value N and the character CH that it stored.
+   N greater than MB_LEN_MAX is taken to mean that mbrtoc32 failed.  */
+static void
+test_mcel_vs_mbrtoc32 (unsigned char uc, mcel_t c, size_t n, char32_t ch)
+{
+  /* C is an encoding error exactly when mbrtoc32 failed.  */
+  ASSERT (!c.err == (n <= MB_LEN_MAX));
+  /* An error records the byte UC with CH 0 and LEN 1.  A character
+     records CH with length N, or length 1 when N is 0.  */
+  ASSERT (c.err
+          ? c.err == uc && c.ch == 0 && c.len == 1
+          : c.ch == ch && c.len == (n ? n : 1));
+}
+
+/* Exercise the mcel_t constructors, the comparison functions and the
+   scanners.  Return 1 if the locale cannot be set.  */
+int
+main (void)
+{
+  /* configure should already have checked that the locale is supported.  */
+  if (setlocale (LC_ALL, "") == NULL)
+    return 1;
+
+  /* ASCII characters: single-byte, basic, and ordered by code point.  */
+  mcel_t prev;
+  for (int ch = 0; ch < 0x80; ch++)
+    {
+      mcel_t c = mcel_ch (ch, 1);
+      ASSERT (c.ch == ch);
+      ASSERT (c.len == 1);
+      ASSERT (!c.err);
+      ASSERT (mcel_cmp (c, c) == 0);
+      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
+      if (ch)
+        {
+          ASSERT (mcel_cmp (prev, c) < 0);
+          ASSERT (mcel_cmp (c, prev) > 0);
+          ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
+          ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
+        }
+      ASSERT (mcel_isbasic (ch));
+      prev = c;
+    }
+  /* mcel_isbasic over every char value, whether char is signed or not.  */
+  for (char ch = CHAR_MIN; ; ch++)
+    {
+      ASSERT (mcel_isbasic (ch) == (0 <= ch && ch <= 0x7f));
+      if (ch == CHAR_MAX)
+        break;
+    }
+  /* Characters beyond ASCII, here constructed with length 2.  */
+  for (int ch = 0x80; ch < 0x200; ch++)
+    {
+      mcel_t c = mcel_ch (ch, 2);
+      ASSERT (c.ch == ch);
+      ASSERT (c.len == 2);
+      ASSERT (!c.err);
+      ASSERT (mcel_cmp (c, c) == 0);
+      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
+      ASSERT (mcel_cmp (prev, c) < 0);
+      ASSERT (mcel_cmp (c, prev) > 0);
+      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
+      int cmp = to_ascii (c.ch) ? -1 : 1;
+      ASSERT (sgn (mcel_tocmp (to_ascii, prev, c)) == cmp);
+      ASSERT (sgn (mcel_tocmp (to_ascii, c, prev)) == -cmp);
+      prev = c;
+    }
+  /* Encoding errors: each sorts after all characters and after
+     every smaller error byte.  */
+  for (unsigned char err = 0x80; ; err++)
+    {
+      mcel_t c = mcel_err (err);
+      ASSERT (!c.ch);
+      ASSERT (c.len == 1);
+      ASSERT (c.err == err);
+      ASSERT (mcel_cmp (c, c) == 0);
+      ASSERT (mcel_cmp (prev, c) < 0);
+      ASSERT (mcel_cmp (c, prev) > 0);
+      ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
+      ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
+      ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
+      prev = c;
+      if (err == (unsigned char) -1)
+        break;
+    }
+
+  /* Check the scanners against mbrtoc32 for every three-byte sequence.  */
+  for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
+    for (int j = CHAR_MIN; j <= CHAR_MAX; j++)
+      for (int k = CHAR_MIN; k <= CHAR_MAX; k++)
+        {
+          char const ijk[] = {i, j, k};
+          mbstate_t mbs = {0};
+          char32_t ch;
+          size_t n = mbrtoc32 (&ch, ijk, sizeof ijk, &mbs);
+          mcel_t c = mcel_scan (ijk, ijk + sizeof ijk);
+          test_mcel_vs_mbrtoc32 (i, c, n, ch);
+
+          static char const terminator[] = "\r\n./";
+          for (int ti = 0; ti < sizeof terminator; ti++)
+            {
+              char t = terminator[ti];
+              if (i == t)
+                continue;
+              /* Scan the terminated copy: mcel_scant looks ahead for the
+                 terminator, which the three-byte IJK does not contain.  */
+              char const ijkt[] = {i, j, k, t};
+              mcel_t d = mcel_scant (ijkt, t);
+              ASSERT (c.ch == d.ch && c.err == d.err && c.len == d.len);
+              if (!t)
+                {
+                  mcel_t z = mcel_scanz (ijkt);
+                  ASSERT (d.ch == z.ch && d.err == z.err && d.len == z.len);
+                }
+            }
+        }
+}
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -29,7 +29,7 @@ noinst_HEADERS =

 include gnulib.mk

-noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h
-libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c
+noinst_HEADERS += cmpbuf.h diagnose.h
+libdiffutils_a_SOURCES += cmpbuf.c diagnose.c

 AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)
--- a/lib/mcel-casecmp.c
+++ b/lib/mcel-casecmp.c
@ -1,60 +0,0 @@
-/* Case-insensitive string comparison function.
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This file is free software: you can redistribute it and/or modify
-   it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation, either version 3 of the
-   License, or (at your option) any later version.
-
-   This file is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert.  */
-
-#include <config.h>
-
-/* Specification.  */
-#include <mcel.h>
-
-#include <ctype.h>
-#include <stdlib.h>
-
-int
-mcel_casecmp (char const *s1, char const *s2)
-{
-  char const *p1 = s1;
-  char const *p2 = s2;
-
-  /* Do not look at the entire extent of S1 or S2 until needed:
-     when two strings differ, the difference is typically early.  */
-  if (MB_CUR_MAX == 1)
-    while (true)
-      {
-	static_assert (UCHAR_MAX <= INT_MAX);
-	unsigned char c1 = *p1++;
-	unsigned char c2 = *p2++;
-	int cmp = c1 - c2;
-	if (_GL_UNLIKELY (cmp))
-	  {
-	    c1 = tolower (c1);
-	    c2 = tolower (c2);
-	    cmp = c1 - c2;
-	  }
-	if (_GL_UNLIKELY (cmp | !c1))
-	  return cmp;
-      }
-  else
-    while (true)
-      {
-	mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
-	mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
-	int cmp = ucore_tocmp (c32tolower, g1.c, g2.c);
-	if (_GL_UNLIKELY (cmp | !g1.c))
-	  return cmp;
-      }
-}
--- a/lib/ucore.c
+++ b/lib/ucore.c
@ -1,3 +0,0 @@
-#include <config.h>
-#define UCORE_INLINE _GL_EXTERN_INLINE
-#include "ucore.h"
--- a/lib/ucore.h
+++ b/lib/ucore.h
@ -1,132 +0,0 @@
-/* Unicode Characters OR Encoding errors (UCOREs)
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This file is free software: you can redistribute it and/or modify
-   it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 3 of the
-   License, or (at your option) any later version.
-
-   This file is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert.  */
-
-/* This API's fundamental type ucore_t represents
-   a single Unicode character or an encoding error.
-   ucore_iserr (C) tests whether C is an encoding error.
-   ucore_is (P, C) etc. test whether char class P accepts C.
-   ucore_to (TO, C) etc. use TO to convert C.
-   ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2,
-   with encoding errors sorting after characters.  */
-
-#ifndef _UCORE_H
-#define _UCORE_H 1
-
-#if !_GL_CONFIG_H_INCLUDED
- #error "Please include config.h first."
-#endif
-
-#include <verify.h>
-
-#include <limits.h>
-#include <stddef.h>
-#include <uchar.h>
-
-/* ucore_t represents a Unicode Character OR Encoding error.
-   If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character.
-   If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error.
-   Other ucore_t values C are invalid.  */
-typedef int ucore_t;
-
-enum {
-  UCORE_CHAR_MAX = 0x10FFFF,
-  UCORE_ERR_MIN = 0x200000,
-  UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1
-};
-
-/* Information is not lost by encoding errors as integers.  */
-static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN);
-
-/* On glibc platforms, predicates like c32isalnum and c32tolower
-   do the right thing for char32_t values that are not valid characters.
-   POSIX says the behavior is undefined, so play it safe elsewhere.
-   Do not rely on UCORE_C32_SAFE for c32width.  */
-#ifdef __GLIBC__
-enum { UCORE_C32_SAFE = true };
-#else
-enum { UCORE_C32_SAFE = false };
-#endif
-
-#ifndef _GL_LIKELY
-/* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
-# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
-# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
-#endif
-
-_GL_INLINE_HEADER_BEGIN
-#ifndef UCORE_INLINE
-# define UCORE_INLINE _GL_INLINE
-#endif
-
-/* Return true if C represents an encoding error, false otherwise.  */
-UCORE_INLINE bool
-ucore_iserr (ucore_t c)
-{
-  /* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c)
-     with GCC 13 x86-64.  */
-  if (_GL_UNLIKELY (c & UCORE_ERR_MIN))
-    {
-      assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX);
-      return true;
-    }
-  else
-    {
-      assume (0 <= c && c <= UCORE_CHAR_MAX);
-      return false;
-    }
-}
-
-/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C).  */
-UCORE_INLINE bool
-ucore_is (int (*p) (wint_t), wint_t c)
-{
-  /* When C is out of range, predicates based on glibc return false.
-     Behavior is undefined on other platforms, so play it safe.  */
-  return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c);
-}
-
-/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C).  */
-UCORE_INLINE wint_t
-ucore_to (wint_t (*to) (wint_t), ucore_t c)
-{
-  return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c;
-}
-
-/* Compare C1 and C2, with encoding errors sorting after characters.
-   Return <0, 0, >0 for <, =, >.  */
-UCORE_INLINE int
-ucore_cmp (ucore_t c1, ucore_t c2)
-{
-  return c1 - c2;
-}
-
-/* Apply the uchar translater TO to C1 and C2 and compare the results,
-   with encoding errors sorting after characters,
-   Return <0, 0, >0 for <, =, >.  */
-UCORE_INLINE int
-ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2)
-{
-  if (c1 == c2)
-    return 0;
-  int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2);
-  return i1 - i2;
-}
-
-_GL_INLINE_HEADER_END
-
-#endif /* _MCEL_H */
--- a/src/io.c
+++ b/src/io.c
@ -230,6 +230,14 @@ slurp (struct file_data *current)
    }
 }

+/* Whether the pair CH1, ERR1 denotes the same character or the same
+   encoding error as the pair CH2, ERR2, i.e., both members match.  */
+static bool
+same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
+{
+  bool same_ch = ch1 == ch2;
+  bool same_err = err1 == err2;
+  return same_ch & same_err;
+}
+
 /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
   one line from each input file) according to the command line options.
   Line lengths include the trailing newline.
@ -427,7 +435,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
    {
      char const *lim1 = s1 + s1len;
      char const *lim2 = s2 + s2len;
-      ucore_t c1prev = 0;
+      char32_t ch1prev = 0;

      while (true)
 	{
@ -435,27 +443,27 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 	  mcel_t g2 = mcel_scan (t2, lim2);
 	  t1 += g1.len;
 	  t2 += g2.len;
-	  ucore_t c1 = g1.c;
-	  ucore_t c2 = g2.c;
+	  char32_t ch1 = g1.ch;
+	  char32_t ch2 = g2.ch;

 	  /* Test for exact equality first, since it's a common case.  */
-	  if (ucore_cmp (c1, c2) != 0)
+	  if (! same_ch_err (ch1, g1.err, ch2, g2.err))
 	    {
 	      switch (ignore_white_space)
 		{
 		case IGNORE_ALL_SPACE:
 		  /* For -w, just skip past any white space.  */
-		  while (c1 != '\n' && ucore_is (c32isspace, c1))
+		  while (ch1 != '\n' && c32isspace (ch1))
 		    {
 		      g1 = mcel_scan (t1, lim1);
 		      t1 += g1.len;
-		      c1 = g1.c;
+		      ch1 = g1.ch;
 		    }
-		  while (c2 != '\n' && ucore_is (c32isspace, c2))
+		  while (ch2 != '\n' && c32isspace (ch2))
 		    {
 		      g2 = mcel_scan (t2, lim2);
 		      t2 += g2.len;
-		      c2 = g2.c;
+		      ch2 = g2.ch;
 		    }
 		  break;

@ -463,48 +471,46 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 		  /* For -b, advance past any sequence of white space in
 		     line 1 and consider it just one space, or nothing at
 		     all if it is at the end of the line.  */
-		  if (ucore_is (c32isspace, c1))
-		    while (c1 != '\n')
+		  if (c32isspace (ch1))
+		    while (ch1 != '\n')
 		      {
 			g1 = mcel_scan (t1, lim1);
 			t1 += g1.len;
-			c1 = g1.c;
-			if (! ucore_is (c32isspace, c1))
+			ch1 = g1.ch;
+			if (! c32isspace (ch1))
 			  {
 			    t1 -= g1.len;
-			    c1 = ' ';
+			    ch1 = ' ';
 			    break;
 			  }
 		      }

 		  /* Likewise for line 2.  */
-		  if (ucore_is (c32isspace, c2))
-		    while (c2 != '\n')
+		  if (c32isspace (ch2))
+		    while (ch2 != '\n')
 		      {
 			g2 = mcel_scan (t2, lim2);
 			t2 += g2.len;
-			c2 = g2.c;
-			if (! ucore_is (c32isspace, c2))
+			ch2 = g2.ch;
+			if (! c32isspace (ch2))
 			  {
 			    t2 -= g2.len;
-			    c2 = ' ';
+			    ch2 = ' ';
 			    break;
 			  }
 		      }

-		  if (c1 != c2)
+		  if (ch1 != ch2)
 		    {
 		      /* If we went too far when doing the simple test
 			 for equality, go back to the first non-white-space
 			 character in both sides and try again.  */
-		      if (c2 == ' ' && c1 != '\n'
-			  && ucore_is (c32isspace, c1prev))
+		      if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
 			{
 			  t1 -= g1.len;
 			  continue;
 			}
-		      if (c1 == ' ' && c2 != '\n'
-			  && ucore_is (c32isspace, c1prev))
+		      if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
 			{
 			  t2 -= g2.len;
 			  continue;
@ -515,28 +521,28 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 		case IGNORE_TRAILING_SPACE:
 		case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
-		  if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2))
+		  if (c32isspace (ch1) && c32isspace (ch2))
 		    {
-		      if (c1 != '\n')
+		      if (ch1 != '\n')
 			{
 			  char const *p = t1;
 			  while (*p != '\n')
 			    {
 			      mcel_t g = mcel_scan (p, lim1);
-			      if (! ucore_is (c32isspace, g.c))
+			      if (! c32isspace (g.ch))
 				break;
 			      p += g.len;
 			    }
 			  if (*p != '\n')
 			    break;
 			}
-		      if (c2 != '\n')
+		      if (ch2 != '\n')
 			{
 			  char const *p = t2;
 			  while (*p != '\n')
 			    {
 			      mcel_t g = mcel_scan (p, lim2);
-			      if (! ucore_is (c32isspace, g.c))
+			      if (! c32isspace (g.ch))
 				break;
 			      p += g.len;
 			    }
@ -550,45 +556,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 		    break;
 		  FALLTHROUGH;
 		case IGNORE_TAB_EXPANSION:
-		  if ((c1 == ' ' && c2 == '\t')
-		      || (c1 == '\t' && c2 == ' '))
+		  if ((ch1 == ' ' && ch2 == '\t')
+		      || (ch1 == '\t' && ch2 == ' '))
 		    {
 		      intmax_t tab2 = tab, column2 = column;

 		      while (true)
 			{
-			  if (c1 == '\t'
-			      || (c1 == ' ' && column == tabsize - 1))
+			  if (ch1 == '\t'
+			      || (ch1 == ' ' && column == tabsize - 1))
 			    {
 			      tab++;
 			      column = 0;
 			    }
-			  else if (c1 == ' ')
+			  else if (ch1 == ' ')
 			    column++;
 			  else
 			    break;

 			  g1 = mcel_scan (t1, lim1);
 			  t1 += g1.len;
-			  c1 = g1.c;
+			  ch1 = g1.ch;
 			}

 		      while (true)
 			{
-			  if (c2 == '\t'
-			      || (c2 == ' ' && column2 == tabsize - 1))
+			  if (ch2 == '\t'
+			      || (ch2 == ' ' && column2 == tabsize - 1))
 			    {
 			      tab2++;
 			      column2 = 0;
 			    }
-			  else if (c2 == ' ')
+			  else if (ch2 == ' ')
 			    column2++;
 			  else
 			    break;

 			  g2 = mcel_scan (t2, lim2);
 			  t2 += g2.len;
-			  c2 = g2.c;
+			  ch2 = g2.ch;
 			}

 		      if (tab != tab2 || column != column2)
@ -602,15 +608,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 	      if (ignore_case)
 		{
-		  c1 = ucore_to (c32tolower, c1);
-		  c2 = ucore_to (c32tolower, c2);
+		  ch1 = c32tolower (ch1);
+		  ch2 = c32tolower (ch2);
 		}

-	      if (ucore_cmp (c1, c2) != 0)
+	      if (! same_ch_err (ch1, g1.err, ch2, g2.err))
 		break;
 	    }

-	  switch (c1)
+	  switch (ch1)
 	    {
 	    case '\n':
 	      return false;
@ -634,7 +640,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 	    default:
 	      /* Assume that downcasing does not change print width.  */
-	      column += ucore_iserr (c1) ? 1 : c32width (c1);
+	      column += g1.err ? 1 : c32width (ch1);
 	      if (column < tabsize)
 		break;
 	      FALLTHROUGH;
@ -644,7 +650,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 	      break;
 	    }

-	  c1prev = c1;
+	  ch1prev = ch1;
 	}
    }

@ -698,8 +704,8 @@ find_and_hash_each_line (struct file_data *current)
 	    for (mcel_t g; *p != '\n'; p += g.len)
 	      {
 		g = mcel_scan (p, suffix_begin);
-		if (! ucore_is (c32isspace, g.c))
-		  h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c));
+		if (! c32isspace (g.ch))
+		  h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
 	      }
          break;

@ -727,7 +733,7 @@ find_and_hash_each_line (struct file_data *current)
 	    for (mcel_t g; *p != '\n'; p += g.len)
 	      {
 		g = mcel_scan (p, suffix_begin);
-		if (ucore_is (c32isspace, g.c))
+		if (c32isspace (g.ch))
 		  {
 		    do
 		      {
@ -736,13 +742,13 @@ find_and_hash_each_line (struct file_data *current)
 			  goto hashing_done;
 			g = mcel_scan (p, suffix_begin);
 		      }
-		    while (ucore_is (c32isspace, g.c));
+		    while (c32isspace (g.ch));

 		    h = hash (h, ' ');
 		  }

 		/* G is now the first non-space.  */
-		h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c);
+		h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
 	      }
          break;

@ -818,13 +824,17 @@ find_and_hash_each_line (struct file_data *current)
 		  intmax_t repetitions = 1;

 		  g = mcel_scan (p, suffix_begin);
-		  ucore_t c = g.c;
-		  if (ucore_iserr (c))
-		    column++;
+		  char32_t ch;
+		  if (g.err)
+		    {
+		      ch = -g.err;
+		      column++;
+		    }
 		  else
 		    {
+		      ch = g.ch;
 		      if (ig_white_space & IGNORE_TRAILING_SPACE
-			  && ucore_is (c32isspace, c))
+			  && c32isspace (ch))
 			{
 			  char const *p1 = p + g.len;
 			  for (mcel_t g1; ; p1 += g1.len)
@ -835,13 +845,13 @@ find_and_hash_each_line (struct file_data *current)
 				  goto hashing_done;
 				}
 			      g1 = mcel_scan (p1, suffix_begin);
-			      if (! ucore_is (c32isspace, g1.c))
+			      if (! c32isspace (g1.ch))
 				break;
 			    }
 			}

 		      if (ig_white_space & IGNORE_TAB_EXPANSION)
-			switch (c)
+			switch (ch)
 			  {
 			  case '\b':
 			    if (0 < column)
@ -854,7 +864,7 @@ find_and_hash_each_line (struct file_data *current)
 			    break;

 			  case '\t':
-			    c = ' ';
+			    ch = ' ';
 			    repetitions = tabsize - column % tabsize;
 			    tab += column / tabsize + 1;
 			    column = 0;
@ -868,16 +878,16 @@ find_and_hash_each_line (struct file_data *current)
 			    break;

 			  default:
-			    column += c32width (c);
+			    column += c32width (ch);
 			    break;
 			  }

 		      if (ig_case)
-			c = c32tolower (c);
+			ch = c32tolower (ch);
 		    }

 		  do
-		    h = hash (h, c);
+		    h = hash (h, ch);
 		  while (--repetitions != 0);
 		}
          }
@ -899,13 +909,13 @@ find_and_hash_each_line (struct file_data *current)
 		for (mcel_t g; *p != '\n'; p += g.len)
 		  {
 		    g = mcel_scan (p, suffix_begin);
-		    h = hash (h, ucore_to (c32tolower, g.c));
+		    h = hash (h, c32tolower (g.ch) - g.err);
 		  }
 	      else
 		for (mcel_t g; *p != '\n'; p += g.len)
 		  {
 		    g = mcel_scan (p, suffix_begin);
-		    h = hash (h, g.c);
+		    h = hash (h, g.ch - g.err);
 		  }
 	    }
          break;
--- a/src/side.c
+++ b/src/side.c
@ -146,7 +146,7 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
 	       Increase TEXT_POINTER, counting columns.
 	       Assume encoding errors have print width 1.  */
 	    mcel_t g = mcel_scan (tp0, text_limit);
-	    int width = ucore_iserr (g.c) ? 1 : c32width (g.c);
+	    int width = g.err ? 1 : c32width (g.ch);
 	    if (0 < width && ckd_add (&in_position, in_position, width))
 	      return out_position;