diff: simplify multi-byte code (mbcel -> mcel)

* lib/Makefile.am: Adjust to file renamings and additions. * lib/mbcel.c, lib/mbcel.h: Split into two APIs, replacing with ... * lib/mcel.c, lib/mcel.h, lib/ucore.c, lib/ucore.h: ... these new files. * lib/mcel.h: Simplify by assuming ucore.h is included. Check that bytes have 8 bits. (MCEL_LEN_MAX, mcel_t, MCEL_INLINE, MCEL_ENCODING_ERROR_SHIFT) (mcel_scan, mcel_scant, mcel_scanz, mcel_casecmp): Rename from MBCEL_LEN_MAX, mbcel_t, MBCEL_INLINE, MBCEL_ENCODING_ERROR_SHIFT, mbcel_scan, mbcel_scanz, mbcel_scant, mbcel_casecmp. (mcel_t): New member c, replacing old members ch and err. All uses changed. (MBCEL_UCHAR_FITS, MBCEL_UCHAR_EASILY_FITS): Remove. All uses removed. No longer needed now 8-bit bytes are assumed. (MCEL_ENCODING_ERROR_SHIFT): Check that it matches UCORE_ERR_MIN. (mcel_isbasic): New function. Use it where appropriate. (mbcel_cmp, mbcel_casecmp): Remove; replaced by ucore_cmp, ucore_tocmp. All uses changed. * lib/mcel-casecmp.c: Rename from lib/mbcel-strcasecmp.c. Include mcel.h instead of mbcel.h. (mcel_casecmp): Rename from mbcel_strcasecmp. All uses changed. Assert that UCHAR_MAX <= INT_MAX, as POSIX requires, and simplify code accordingly. Use mcel rather than mbcel. * lib/ucore.h: Include verify.h. (ucore_t): New type. (UCORE_CHAR_MAX, UCORE_ERR_MIN, UCORE_ERR_MAX, UCORE_C32_SAFE): New constants. Check that information is not lost by encoding errors as integers; this is a weaker test than CHAR_BIT == 8. (ucore_iserr, ucore_is, ucore_to): New functions. (ucore_cmp, ucore_tocmp): New functions, replacing the old mbcel_cmp, mbcel_casecmp. All uses changed. * src/dir.c, src/io.c, src/side.c: Use mcel rather than mbcel. * src/io.c (same_ch_err): Remove. All uses replaced by ucore_cmp.
2026-01-27 01:44:20 +00:00 · 2023-08-15 10:26:37 -07:00 · 2023-08-15 10:26:37 -07:00 · 12bcf0bd50
commit 12bcf0bd50
parent e016d12581
11 changed files with 483 additions and 389 deletions
--- a/lib/Makefile.am
+++ b/lib/Makefile.am
@ -29,7 +29,7 @@ noinst_HEADERS =

 include gnulib.mk

-noinst_HEADERS += cmpbuf.h diagnose.h mbcel.h
-libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mbcel.c mbcel-strcasecmp.c
+noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h
+libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c

 AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)
--- a/lib/mbcel.c
+++ b/lib/mbcel.c
@ -1,3 +0,0 @@
-#include <config.h>
-#define MBCEL_INLINE _GL_EXTERN_INLINE
-#include "mbcel.h"
--- a/lib/mbcel.h
+++ b/lib/mbcel.h
@ -1,266 +0,0 @@
-/* Multi-byte characters, error encodings, and lengths
-   Copyright 2023 Free Software Foundation, Inc.
-
-   This file is free software: you can redistribute it and/or modify
-   it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 3 of the
-   License, or (at your option) any later version.
-
-   This file is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public License
-   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
-
-/* Written by Paul Eggert.  */
-
-/* The mbcel_scan function lets code iterate through an array of bytes,
-   supporting character encodings in practical use
-   more simply than using plain mbrtoc32.
-
-   Instead of this single-byte code:
-
-      char *p = ..., *lim = ...;
-      for (; p < lim; p++)
-        process (*p);
-
-   You can use this multi-byte code:
-
-      char *p = ..., *lim = ...;
-      for (mbcel_t g; p < lim; p += g.len)
-        {
-	  g = mbcel_scan (p, lim);
-	  process (g);
-	}
-
-   You can select from G using G.ch, G.err, and G.len.
-
-   The mbcel_scanz function is similar except it works with a
-   string of unknown length that is terminated with '\0'.
-   Instead of this single-byte code:
-
-      char *p = ...;
-      for (; *p; p++)
-	process (*p);
-
-   You can use this multi-byte code:
-
-      char *p = ...;
-      for (mbcel_t g; *p; p += g.len)
-	{
-	  g = mbcel_scanz (p);
-	  process (g);
-	}
-
-   mbcel_scant (P, TERMINATOR) is like mbcel_scanz (P) except the
-   string is terminated by TERMINATOR.  The TERMINATORs '\0', '\r',
-   '\n', '.', '/' are safe, as they cannot be a part (even a trailing
-   byte) of a multi-byte character.
-
-   mbcel_cmp (G1, G2) and mbcel_casecmp (G1, G2) compare two mbcel_t
-   values lexicographically by character or by encoding byte value,
-   with encoding bytes sorting after characters.  mbcel_casecmp
-   ignores case in characters.  mbcel_strcasecmp compares two
-   null-terminated strings lexicographically.
-
-   Although ISO C and POSIX allow encodings that have shift states or
-   that can produce multiple characters from an indivisible byte sequence,
-   POSIX does not require support for these encodings,
-   they are not in practical use on GNUish platforms,
-   and omitting support for them simplifies the API.  */
-
-#ifndef _MBCEL_H
-#define _MBCEL_H 1
-
-/* This file uses _GL_INLINE_HEADER_BEGIN, _GL_INLINE,
-   _GL_ATTRIBUTE_MAY_ALIAS.  */
-#if !_GL_CONFIG_H_INCLUDED
- #error "Please include config.h first."
-#endif
-
-#include <limits.h>
-#include <stddef.h>
-#include <uchar.h>
-
-/* The maximum multibyte character length supported on any platform.
-   This can be less than MB_LEN_MAX because many platforms have a
-   large MB_LEN_MAX to allow for stateful encodings, and mbcel does
-   not need to support these encodings.  MBCEL_LEN_MAX is enough for
-   UTF-8, EUC, Shift-JIS, GB18030, etc.
-   0 < MB_CUR_MAX <= MBCEL_LEN_MAX <= MB_LEN_MAX.  */
-enum { MBCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
-
-/* mbcel_t is a type representing a character CH or an encoding error byte ERR,
-   along with a count of the LEN bytes that represent CH or ERR.
-   If ERR is zero, CH is a valid character and 1 <= LEN <= MB_LEN_MAX;
-   otherwise ERR is an encoding error byte, 0x80 <= ERR <= UCHAR_MAX,
-   CH == 0, and LEN == 1.  */
-typedef struct
-{
-  char32_t ch;
-  unsigned char err;
-  unsigned char len;
-} mbcel_t;
-
-/* On all known platforms, every multi-byte character length fits in
-   mbcel_t's LEN.  Check this.  */
-static_assert (MB_LEN_MAX <= UCHAR_MAX);
-
-/* Pacify GCC re '*p <= 0x7f' below.  */
-#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
-# pragma GCC diagnostic ignored "-Wtype-limits"
-#endif
-
-_GL_INLINE_HEADER_BEGIN
-#ifndef MBCEL_INLINE
-# define MBCEL_INLINE _GL_INLINE
-#endif
-
-/* With mbcel there should be no need for the performance overhead of
-   replacing glibc mbrtoc32, as callers shouldn't care whether the
-   C locale treats a byte with the high bit set as an encoding error.  */
-#ifdef __GLIBC__
-# undef mbrtoc32
-#endif
-
-/* Shifting an encoding error byte (which must be at least 2**7)
-   left by 14 yields at least 2**21 (0x200000), which is greater
-   than the maximum Unicode value 0x10FFFF.  This suffices to sort
-   encoding errors after characters.  */
-enum { MBCEL_ENCODING_ERROR_SHIFT = 14 };
-
-/* In the typical case where unsigned char easily fits in int,
-   optimizations are possible.  */
-enum {
-  MBCEL_UCHAR_FITS = UCHAR_MAX <= INT_MAX,
-  MBCEL_UCHAR_EASILY_FITS = UCHAR_MAX <= INT_MAX >> MBCEL_ENCODING_ERROR_SHIFT
-};
-
-#ifndef _GL_LIKELY
-/* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
-# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
-# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
-#endif
-
-/* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
-   Return either the valid character starting at P,
-   or the encoding error of length 1 at P.  */
-MBCEL_INLINE mbcel_t
-mbcel_scan (char const *p, char const *lim)
-{
-  /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
-     In supported encodings, the first byte of a multi-byte character
-     cannot be an ASCII byte.  */
-  if (_GL_LIKELY (0 <= *p && *p <= 0x7f))
-    return (mbcel_t) { .ch = *p, .len = 1 };
-
-  /* An initial mbstate_t; initialization optimized for some platforms.
-     For details about these and other platforms, see wchar.in.h.  */
-#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
-  /* Although only a trivial optimization, it's worth it for GNU.  */
-  mbstate_t mbs; mbs.__count = 0;
-#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
-       || (defined __APPLE__ && defined __MACH__))
-  /* These platforms have 128-byte mbstate_t.  What were they thinking?
-     Initialize just for supported encodings (UTF-8, EUC, etc.).
-     Avoid memset because some compilers generate function call code.  */
-  struct mbhidden { char32_t ch; int utf8_want, euc_want; }
-    _GL_ATTRIBUTE_MAY_ALIAS;
-  union { mbstate_t m; struct mbhidden s; } u;
-  u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
-# define mbs u.m
-#elif defined __NetBSD__
-  /* Experiments on both 32- and 64-bit NetBSD platforms have
-     shown that it doesn't work to clear fewer than 24 bytes.  */
-  struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
-  union { mbstate_t m; struct mbhidden s; } u;
-  u.s.a = u.s.b = u.s.c = 0;
-# define mbs u.m
-#else
-  /* mbstate_t has unknown structure or is not worth optimizing.  */
-  mbstate_t mbs = {0};
-#endif
-
-  char32_t ch;
-  size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
-
-  /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
-     is not supported and MB_LEN_MAX is small.  */
-  if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
-    return (mbcel_t) { .err = *p, .len = 1 };
-
-  /* Tell the compiler LEN is at most MB_LEN_MAX,
-     as this can help GCC generate better code.  */
-  if (! (len <= MB_LEN_MAX))
-    unreachable ();
-
-  /* A multi-byte character.  LEN must be positive,
-     as *P != '\0' and shift sequences are not supported.  */
-  return (mbcel_t) { .ch = ch, .len = len };
-}
-
-/* Scan bytes from P, a byte sequence terminated by TERMINATOR.
-   If *P == TERMINATOR, scan just that byte; otherwise scan
-   bytes up to but not including a TERMINATOR byte.
-   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
-   Return either the valid character starting at P,
-   or the encoding error of length 1 at P.  */
-MBCEL_INLINE mbcel_t
-mbcel_scant (char const *p, char terminator)
-{
-  /* Handle ASCII quickly for speed.  */
-  if (_GL_LIKELY (0 <= *p && *p <= 0x7f))
-    return (mbcel_t) { .ch = *p, .len = 1 };
-
-  /* Defer to mbcel_scan for non-ASCII.  Compute length with code that
-     is typically branch-free and faster than memchr or strnlen.  */
-  char const *lim = p + 1;
-  for (int i = 0; i < MBCEL_LEN_MAX - 1; i++)
-    lim += *lim != terminator;
-  return mbcel_scan (p, lim);
-}
-
-/* Scan bytes from P, a byte sequence terminated by '\0'.
-   If *P == '\0', scan just that byte; otherwise scan
-   bytes up to but not including a '\0'.
-   Return either the valid character starting at P,
-   or the encoding error of length 1 at P.  */
-MBCEL_INLINE mbcel_t
-mbcel_scanz (char const *p)
-{
-  return mbcel_scant (p, '\0');
-}
-
-/* Compare G1 and G2, with encoding errors sorting after characters.
-   Return <0, 0, >0 for <, =, >.  */
-MBCEL_INLINE int
-mbcel_cmp (mbcel_t g1, mbcel_t g2)
-{
-  int c1 = g1.ch, c2 = g2.ch, e1 = g1.err, e2 = g2.err, ccmp = c1 - c2,
-    ecmp = MBCEL_UCHAR_EASILY_FITS ? e1 - e2 : _GL_CMP (e1, e2);
-  return (ecmp << MBCEL_ENCODING_ERROR_SHIFT) + ccmp;
-}
-
-/* Compare G1 and G2 ignoring case, with encoding errors sorting after
-   characters.  Return <0, 0, >0 for <, =, >.  */
-MBCEL_INLINE int
-mbcel_casecmp (mbcel_t g1, mbcel_t g2)
-{
-  int cmp = mbcel_cmp (g1, g2);
-  if (_GL_LIKELY (g1.err | g2.err | !cmp))
-    return cmp;
-  int c1 = c32tolower (g1.ch);
-  int c2 = c32tolower (g2.ch);
-  return c1 - c2;
-}
-
-/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
-   Return <0, 0, >0 for <, =, >.  Consider encoding errors to be
-   greater than characters and compare them byte by byte.  */
-int mbcel_strcasecmp (char const *s1, char const *s2);
-
-_GL_INLINE_HEADER_END
-
-#endif /* _MBCEL_H */
--- a/lib/mbcel-strcasecmp.c
+++ b/lib/mbcel-strcasecmp.c
@ -19,17 +19,13 @@
 #include <config.h>

 /* Specification.  */
-#include <mbcel.h>
+#include <mcel.h>

 #include <ctype.h>
 #include <stdlib.h>

-/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
-   Return <0, 0, >0 for <, =, >.  Consider encoding errors to be
-   greater than characters and compare them byte by byte.  */
-
 int
-mbcel_strcasecmp (char const *s1, char const *s2)
+mcel_casecmp (char const *s1, char const *s2)
 {
  char const *p1 = s1;
  char const *p2 = s2;
@ -39,14 +35,15 @@ mbcel_strcasecmp (char const *s1, char const *s2)
  if (MB_CUR_MAX == 1)
    while (true)
      {
+	static_assert (UCHAR_MAX <= INT_MAX);
 	unsigned char c1 = *p1++;
 	unsigned char c2 = *p2++;
-	int cmp = MBCEL_UCHAR_FITS ? c1 - c2 : _GL_CMP (c1, c2);
+	int cmp = c1 - c2;
 	if (_GL_UNLIKELY (cmp))
 	  {
 	    c1 = tolower (c1);
 	    c2 = tolower (c2);
-	    cmp = MBCEL_UCHAR_FITS ? c1 - c2 : _GL_CMP (c1, c2);
+	    cmp = c1 - c2;
 	  }
 	if (_GL_UNLIKELY (cmp | !c1))
 	  return cmp;
@ -54,10 +51,10 @@ mbcel_strcasecmp (char const *s1, char const *s2)
  else
    while (true)
      {
-	mbcel_t g1 = mbcel_scanz (p1); p1 += g1.len;
-	mbcel_t g2 = mbcel_scanz (p2); p2 += g2.len;
-	int cmp = mbcel_casecmp (g1, g2);
-	if (_GL_UNLIKELY (cmp | ! (g1.ch | g1.err)))
+	mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
+	mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
+	int cmp = ucore_tocmp (c32tolower, g1.c, g2.c);
+	if (_GL_UNLIKELY (cmp | !g1.c))
 	  return cmp;
      }
 }
--- a/lib/mcel.c
+++ b/lib/mcel.c
@ -0,0 +1,3 @@
+#include <config.h>
+#define MCEL_INLINE _GL_EXTERN_INLINE
+#include "mcel.h"
--- a/lib/mcel.h
+++ b/lib/mcel.h
@ -0,0 +1,236 @@
+/* Multi-byte characters, Error encodings, and Lengths (MCELs)
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Paul Eggert.  */
+
+/* The mcel_scan function lets code iterate through an array of bytes,
+   supporting character encodings in practical use
+   more simply than using plain mbrtoc32.
+
+   Instead of this single-byte code:
+
+      char *p = ..., *lim = ...;
+      for (; p < lim; p++)
+        process (*p);
+
+   You can use this multi-byte code:
+
+      char *p = ..., *lim = ...;
+      for (mcel_t g; p < lim; p += g.len)
+        {
+	  g = mcel_scan (p, lim);
+	  process (g);
+	}
+
+   The mcel_scanz function is similar except it works with a
+   string of unknown length that is terminated with '\0'.
+   Instead of this single-byte code:
+
+      char *p = ...;
+      for (; *p; p++)
+	process (*p);
+
+   You can use this multi-byte code:
+
+      char *p = ...;
+      for (mcel_t g; *p; p += g.len)
+	{
+	  g = mcel_scanz (p);
+	  process (g);
+	}
+
+   mcel_scant (P, TERMINATOR) is like mcel_scanz (P) except the
+   string is terminated by TERMINATOR.  The TERMINATORs '\0', '\r',
+   '\n', '.', '/' are safe, as they cannot be a part (even a trailing
+   byte) of a multi-byte character.
+
+   You can select from G using G.c and G.len.
+   You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c),
+   ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c).
+
+   mcel_casecmp compares two null-terminated multi-byte strings
+   lexicographically, ignoring case.
+
+   Although ISO C and POSIX allow encodings that have shift states or
+   that can produce multiple characters from an indivisible byte sequence,
+   POSIX does not require support for these encodings,
+   they are not in practical use on GNUish platforms,
+   and omitting support for them simplifies the API.  */
+
+#ifndef _MCEL_H
+#define _MCEL_H 1
+
+/* This API is an extension of ucore.h.  Programs that include this
+   file can assume ucore.h is included too.  */
+#include <ucore.h>
+
+/* The maximum multi-byte character length supported on any platform.
+   This can be less than MB_LEN_MAX because many platforms have a
+   large MB_LEN_MAX to allow for stateful encodings, and mcel does not
+   support these encodings.  MCEL_LEN_MAX is enough for UTF-8, EUC,
+   Shift-JIS, GB18030, etc.  In all multi-byte encodings supported by glibc,
+   0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX.  */
+enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
+
+/* mcel_t is a type representing a character or encoding error C,
+   along with a count of the LEN bytes that represent C.
+   1 <= LEN <= MB_LEN_MAX.  */
+typedef struct
+{
+  ucore_t c;
+  unsigned char len;
+} mcel_t;
+
+/* Every multi-byte character length fits in mcel_t's LEN.  */
+static_assert (MB_LEN_MAX <= UCHAR_MAX);
+
+/* Bytes have 8 bits, as POSIX requires.  */
+static_assert (CHAR_BIT == 8);
+
+/* Pacify GCC re 'c <= 0x7f' below.  */
+#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
+# pragma GCC diagnostic ignored "-Wtype-limits"
+#endif
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef MCEL_INLINE
+# define MCEL_INLINE _GL_INLINE
+#endif
+
+/* With mcel there should be no need for the performance overhead of
+   replacing glibc mbrtoc32, as callers shouldn't care whether the
+   C locale treats a byte with the high bit set as an encoding error.  */
+#ifdef __GLIBC__
+# undef mbrtoc32
+#endif
+
+/* Shifting an encoding error byte (at least 0x80) left by this value
+   yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
+   This suffices to sort encoding errors after characters.  */
+enum { MCEL_ENCODING_ERROR_SHIFT = 14 };
+static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
+
+/* Whether C represents itself as a Unicode character
+   when it is the first byte of a single- or multi-byte character.
+   These days it is safe to assume ASCII, so do not support
+   obsolescent encodings like CP864, EBCDIC, Johab, and Shift JIS.  */
+MCEL_INLINE bool
+mcel_isbasic (char c)
+{
+  return 0 <= c && c <= 0x7f;
+}
+
+/* Scan bytes from P inclusive to LIM exclusive.  P must be less than LIM.
+   Return the character or encoding error starting at P.  */
+MCEL_INLINE mcel_t
+mcel_scan (char const *p, char const *lim)
+{
+  /* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
+     In supported encodings, the first byte of a multi-byte character
+     cannot be an ASCII byte.  */
+  if (_GL_LIKELY (mcel_isbasic (*p)))
+    return (mcel_t) { .c = *p, .len = 1 };
+
+  /* An initial mbstate_t; initialization optimized for some platforms.
+     For details about these and other platforms, see wchar.in.h.  */
+#if defined __GLIBC__ && 2 < __GLIBC__ + (2 <= __GLIBC_MINOR__)
+  /* Although only a trivial optimization, it's worth it for GNU.  */
+  mbstate_t mbs; mbs.__count = 0;
+#elif (defined __FreeBSD__ || defined __DragonFly__ || defined __OpenBSD__ \
+       || (defined __APPLE__ && defined __MACH__))
+  /* These platforms have 128-byte mbstate_t.  What were they thinking?
+     Initialize just for supported encodings (UTF-8, EUC, etc.).
+     Avoid memset because some compilers generate function call code.  */
+  struct mbhidden { char32_t ch; int utf8_want, euc_want; }
+    _GL_ATTRIBUTE_MAY_ALIAS;
+  union { mbstate_t m; struct mbhidden s; } u;
+  u.s.ch = u.s.utf8_want = u.s.euc_want = 0;
+# define mbs u.m
+#elif defined __NetBSD__
+  /* Experiments on both 32- and 64-bit NetBSD platforms have
+     shown that it doesn't work to clear fewer than 24 bytes.  */
+  struct mbhidden { long long int a, b, c; } _GL_ATTRIBUTE_MAY_ALIAS;
+  union { mbstate_t m; struct mbhidden s; } u;
+  u.s.a = u.s.b = u.s.c = 0;
+# define mbs u.m
+#else
+  /* mbstate_t has unknown structure or is not worth optimizing.  */
+  mbstate_t mbs = {0};
+#endif
+
+  char32_t c;
+  size_t len = mbrtoc32 (&c, p, lim - p, &mbs);
+
+  /* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
+     is not supported and MB_LEN_MAX is small.  */
+  if (_GL_LIKELY (len <= (size_t) -1 / 2))
+    {
+      /* A multi-byte character.  LEN must be positive,
+	 as *P != '\0' and shift sequences are not supported.  */
+      assume (0 < len);
+      assume (len <= MB_LEN_MAX);
+      assume (c <= UCORE_CHAR_MAX);
+      return (mcel_t) { .c = c, .len = len };
+    }
+  else
+    {
+      /* An encoding error.  */
+      unsigned char b = *p;
+      c = b << MCEL_ENCODING_ERROR_SHIFT;
+      assume (UCORE_ERR_MIN <= c);
+      assume (c <= UCORE_ERR_MAX);
+      return (mcel_t) { .c = c, .len = 1 };
+    }
+}
+
+/* Scan bytes from P, a byte sequence terminated by TERMINATOR.
+   If *P == TERMINATOR, scan just that byte; otherwise scan
+   bytes up to but not including TERMINATOR.
+   TERMINATOR must be ASCII, and should be '\0', '\r', '\n', '.', or '/'.
+   Return the character or encoding error starting at P.  */
+MCEL_INLINE mcel_t
+mcel_scant (char const *p, char terminator)
+{
+  /* Handle ASCII quickly for speed.  */
+  if (_GL_LIKELY (mcel_isbasic (*p)))
+    return (mcel_t) { .c = *p, .len = 1 };
+
+  /* Defer to mcel_scan for non-ASCII.  Compute length with code that
+     is typically branch-free and faster than memchr or strnlen.  */
+  char const *lim = p + 1;
+  for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
+    lim += *lim != terminator;
+  return mcel_scan (p, lim);
+}
+
+/* Scan bytes from P, a byte sequence terminated by '\0'.
+   If *P == '\0', scan just that byte; otherwise scan
+   bytes up to but not including '\0'.
+   Return the character or encoding error starting at P.  */
+MCEL_INLINE mcel_t
+mcel_scanz (char const *p)
+{
+  return mcel_scant (p, '\0');
+}
+
+/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
+   Return <0, 0, >0 for <, =, >.  Consider encoding errors to be
+   greater than characters and compare them byte by byte.  */
+int mcel_casecmp (char const *s1, char const *s2);
+
+_GL_INLINE_HEADER_END
+
+#endif /* _MCEL_H */
--- a/lib/ucore.c
+++ b/lib/ucore.c
@ -0,0 +1,3 @@
+#include <config.h>
+#define UCORE_INLINE _GL_EXTERN_INLINE
+#include "ucore.h"
--- a/lib/ucore.h
+++ b/lib/ucore.h
@ -0,0 +1,132 @@
+/* Unicode Characters OR Encoding errors (UCOREs)
+   Copyright 2023 Free Software Foundation, Inc.
+
+   This file is free software: you can redistribute it and/or modify
+   it under the terms of the GNU Lesser General Public License as
+   published by the Free Software Foundation; either version 3 of the
+   License, or (at your option) any later version.
+
+   This file is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public License
+   along with this program.  If not, see <https://www.gnu.org/licenses/>.  */
+
+/* Written by Paul Eggert.  */
+
+/* This API's fundamental type ucore_t represents
+   a single Unicode character or an encoding error.
+   ucore_iserr (C) tests whether C is an encoding error.
+   ucore_is (P, C) etc. test whether char class P accepts C.
+   ucore_to (TO, C) etc. use TO to convert C.
+   ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2,
+   with encoding errors sorting after characters.  */
+
+#ifndef _UCORE_H
+#define _UCORE_H 1
+
+#if !_GL_CONFIG_H_INCLUDED
+ #error "Please include config.h first."
+#endif
+
+#include <verify.h>
+
+#include <limits.h>
+#include <stddef.h>
+#include <uchar.h>
+
+/* ucore_t represents a Unicode Character OR Encoding error.
+   If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character.
+   If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error.
+   Other ucore_t values C are invalid.  */
+typedef int ucore_t;
+
+enum {
+  UCORE_CHAR_MAX = 0x10FFFF,
+  UCORE_ERR_MIN = 0x200000,
+  UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1
+};
+
+/* Information is not lost by encoding errors as integers.  */
+static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN);
+
+/* On glibc platforms, predicates like c32isalnum and c32tolower
+   do the right thing for char32_t values that are not valid characters.
+   POSIX says the behavior is undefined, so play it safe elsewhere.
+   Do not rely on UCORE_C32_SAFE for c32width.  */
+#ifdef __GLIBC__
+enum { UCORE_C32_SAFE = true };
+#else
+enum { UCORE_C32_SAFE = false };
+#endif
+
+#ifndef _GL_LIKELY
+/* Rely on __builtin_expect, as provided by the module 'builtin-expect'.  */
+# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
+# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
+#endif
+
+_GL_INLINE_HEADER_BEGIN
+#ifndef UCORE_INLINE
+# define UCORE_INLINE _GL_INLINE
+#endif
+
+/* Return true if C represents an encoding error, false otherwise.  */
+UCORE_INLINE bool
+ucore_iserr (ucore_t c)
+{
+  /* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c)
+     with GCC 13 x86-64.  */
+  if (_GL_UNLIKELY (c & UCORE_ERR_MIN))
+    {
+      assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX);
+      return true;
+    }
+  else
+    {
+      assume (0 <= c && c <= UCORE_CHAR_MAX);
+      return false;
+    }
+}
+
+/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C).  */
+UCORE_INLINE bool
+ucore_is (int (*p) (wint_t), wint_t c)
+{
+  /* When C is out of range, predicates based on glibc return false.
+     Behavior is undefined on other platforms, so play it safe.  */
+  return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c);
+}
+
+/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C).  */
+UCORE_INLINE wint_t
+ucore_to (wint_t (*to) (wint_t), ucore_t c)
+{
+  return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c;
+}
+
+/* Compare C1 and C2, with encoding errors sorting after characters.
+   Return <0, 0, >0 for <, =, >.  */
+UCORE_INLINE int
+ucore_cmp (ucore_t c1, ucore_t c2)
+{
+  return c1 - c2;
+}
+
+/* Apply the uchar translator TO to C1 and C2 and compare the results,
+   with encoding errors sorting after characters.
+   Return <0, 0, >0 for <, =, >.  */
+UCORE_INLINE int
+ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2)
+{
+  if (c1 == c2)
+    return 0;
+  int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2);
+  return i1 - i2;
+}
+
+_GL_INLINE_HEADER_END
+
+#endif /* _UCORE_H */
--- a/src/dir.c
+++ b/src/dir.c
@ -25,7 +25,7 @@
 #include <error.h>
 #include <exclude.h>
 #include <filenamecat.h>
-#include <mbcel.h>
+#include <mcel.h>
 #include <quote.h>
 #include <setjmp.h>
 #include <xalloc.h>
@ -189,7 +189,7 @@ compare_collated (char const *name1, char const *name2)
 {
  int r;
  if (ignore_file_name_case)
-    r = mbcel_strcasecmp (name1, name2);  /* Best we can do.  */
+    r = mcel_casecmp (name1, name2);  /* Best we can do.  */
  else
    {
      errno = 0;
--- a/src/io.c
+++ b/src/io.c
@ -23,7 +23,7 @@
 #include <cmpbuf.h>
 #include <file-type.h>
 #include <ialloc.h>
-#include <mbcel.h>
+#include <mcel.h>
 #include <xalloc.h>

 #include <uchar.h>
@ -230,14 +230,6 @@ slurp (struct file_data *current)
    }
 }

-/* Return true if CH1 and ERR1 stand for the same character or
-   encoding error as CH2 and ERR2.  */
-static bool
-same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
-{
-  return ! ((ch1 ^ ch2) | (err1 ^ err2));
-}
-
 /* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
   one line from each input file) according to the command line options.
   Line lengths include the trailing newline.
@ -435,35 +427,35 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
    {
      char const *lim1 = s1 + s1len;
      char const *lim2 = s2 + s2len;
-      char32_t ch1prev = 0;
+      ucore_t c1prev = 0;

      while (true)
 	{
-	  mbcel_t g1 = mbcel_scan (t1, lim1);
-	  mbcel_t g2 = mbcel_scan (t2, lim2);
+	  mcel_t g1 = mcel_scan (t1, lim1);
+	  mcel_t g2 = mcel_scan (t2, lim2);
 	  t1 += g1.len;
 	  t2 += g2.len;
-	  char32_t ch1 = g1.ch;
-	  char32_t ch2 = g2.ch;
+	  ucore_t c1 = g1.c;
+	  ucore_t c2 = g2.c;

 	  /* Test for exact equality first, since it's a common case.  */
-	  if (! same_ch_err (ch1, g1.err, ch2, g2.err))
+	  if (ucore_cmp (c1, c2) != 0)
 	    {
 	      switch (ignore_white_space)
 		{
 		case IGNORE_ALL_SPACE:
 		  /* For -w, just skip past any white space.  */
-		  while (ch1 != '\n' && c32isspace (ch1))
+		  while (c1 != '\n' && ucore_is (c32isspace, c1))
 		    {
-		      g1 = mbcel_scan (t1, lim1);
+		      g1 = mcel_scan (t1, lim1);
 		      t1 += g1.len;
-		      ch1 = g1.ch;
+		      c1 = g1.c;
 		    }
-		  while (ch2 != '\n' && c32isspace (ch2))
+		  while (c2 != '\n' && ucore_is (c32isspace, c2))
 		    {
-		      g2 = mbcel_scan (t2, lim2);
+		      g2 = mcel_scan (t2, lim2);
 		      t2 += g2.len;
-		      ch2 = g2.ch;
+		      c2 = g2.c;
 		    }
 		  break;

@ -471,46 +463,48 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 		  /* For -b, advance past any sequence of white space in
 		     line 1 and consider it just one space, or nothing at
 		     all if it is at the end of the line.  */
-		  if (c32isspace (ch1))
-		    while (ch1 != '\n')
+		  if (ucore_is (c32isspace, c1))
+		    while (c1 != '\n')
 		      {
-			g1 = mbcel_scan (t1, lim1);
+			g1 = mcel_scan (t1, lim1);
 			t1 += g1.len;
-			ch1 = g1.ch;
-			if (! c32isspace (ch1))
+			c1 = g1.c;
+			if (! ucore_is (c32isspace, c1))
 			  {
 			    t1 -= g1.len;
-			    ch1 = ' ';
+			    c1 = ' ';
 			    break;
 			  }
 		      }

 		  /* Likewise for line 2.  */
-		  if (c32isspace (ch2))
-		    while (ch2 != '\n')
+		  if (ucore_is (c32isspace, c2))
+		    while (c2 != '\n')
 		      {
-			g2 = mbcel_scan (t2, lim2);
+			g2 = mcel_scan (t2, lim2);
 			t2 += g2.len;
-			ch2 = g2.ch;
-			if (! c32isspace (ch2))
+			c2 = g2.c;
+			if (! ucore_is (c32isspace, c2))
 			  {
 			    t2 -= g2.len;
-			    ch2 = ' ';
+			    c2 = ' ';
 			    break;
 			  }
 		      }

-		  if (ch1 != ch2)
+		  if (c1 != c2)
 		    {
 		      /* If we went too far when doing the simple test
 			 for equality, go back to the first non-white-space
 			 character in both sides and try again.  */
-		      if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
+		      if (c2 == ' ' && c1 != '\n'
+			  && ucore_is (c32isspace, c1prev))
 			{
 			  t1 -= g1.len;
 			  continue;
 			}
-		      if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
+		      if (c1 == ' ' && c2 != '\n'
+			  && ucore_is (c32isspace, c1prev))
 			{
 			  t2 -= g2.len;
 			  continue;
@@ -521,30 +515,32 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 		case IGNORE_TRAILING_SPACE:
 		case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
-		  if (c32isspace (ch1) && c32isspace (ch2))
+		  if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2))
 		    {
-		      if (ch1 != '\n')
+		      if (c1 != '\n')
 			{
-			  mbcel_t g;
-			  for (char const *p = t1; ; p += g.len)
+			  char const *p = t1;
+			  while (*p != '\n')
 			    {
-			      g = mbcel_scan (p, lim1);
-			      if (g.ch == '\n' || ! c32isspace (g.ch))
+			      mcel_t g = mcel_scan (p, lim1);
+			      if (! ucore_is (c32isspace, g.c))
 				break;
+			      p += g.len;
 			    }
-			  if (g.ch != '\n')
+			  if (*p != '\n')
 			    break;
 			}
-		      if (ch2 != '\n')
+		      if (c2 != '\n')
 			{
-			  mbcel_t g;
-			  for (char const *p = t2; ; p += g.len)
+			  char const *p = t2;
+			  while (*p != '\n')
 			    {
-			      g = mbcel_scan (p, lim2);
-			      if (g.ch == '\n' || ! c32isspace (g.ch))
+			      mcel_t g = mcel_scan (p, lim2);
+			      if (! ucore_is (c32isspace, g.c))
 				break;
+			      p += g.len;
 			    }
-			  if (g.ch != '\n')
+			  if (*p != '\n')
 			    break;
 			}
 		      /* Both lines have nothing but whitespace left.  */
@@ -554,45 +550,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 		    break;
 		  FALLTHROUGH;
 		case IGNORE_TAB_EXPANSION:
-		  if ((ch1 == ' ' && ch2 == '\t')
-		      || (ch1 == '\t' && ch2 == ' '))
+		  if ((c1 == ' ' && c2 == '\t')
+		      || (c1 == '\t' && c2 == ' '))
 		    {
 		      intmax_t tab2 = tab, column2 = column;

 		      while (true)
 			{
-			  if (ch1 == '\t'
-			      || (ch1 == ' ' && column == tabsize - 1))
+			  if (c1 == '\t'
+			      || (c1 == ' ' && column == tabsize - 1))
 			    {
 			      tab++;
 			      column = 0;
 			    }
-			  else if (ch1 == ' ')
+			  else if (c1 == ' ')
 			    column++;
 			  else
 			    break;

-			  g1 = mbcel_scan (t1, lim1);
+			  g1 = mcel_scan (t1, lim1);
 			  t1 += g1.len;
-			  ch1 = g1.ch;
+			  c1 = g1.c;
 			}

 		      while (true)
 			{
-			  if (ch2 == '\t'
-			      || (ch2 == ' ' && column2 == tabsize - 1))
+			  if (c2 == '\t'
+			      || (c2 == ' ' && column2 == tabsize - 1))
 			    {
 			      tab2++;
 			      column2 = 0;
 			    }
-			  else if (ch2 == ' ')
+			  else if (c2 == ' ')
 			    column2++;
 			  else
 			    break;

-			  g2 = mbcel_scan (t2, lim2);
+			  g2 = mcel_scan (t2, lim2);
 			  t2 += g2.len;
-			  ch2 = g2.ch;
+			  c2 = g2.c;
 			}

 		      if (tab != tab2 || column != column2)
@@ -606,15 +602,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 	      if (ignore_case)
 		{
-		  ch1 = c32tolower (ch1);
-		  ch2 = c32tolower (ch2);
+		  c1 = ucore_to (c32tolower, c1);
+		  c2 = ucore_to (c32tolower, c2);
 		}

-	      if (! same_ch_err (ch1, g1.err, ch2, g2.err))
+	      if (ucore_cmp (c1, c2) != 0)
 		break;
 	    }

-	  switch (ch1)
+	  switch (c1)
 	    {
 	    case '\n':
 	      return false;
@@ -638,7 +634,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)

 	    default:
 	      /* Assume that downcasing does not change print width.  */
-	      column += g1.err ? 1 : c32width (ch1);
+	      column += ucore_iserr (c1) ? 1 : c32width (c1);
 	      if (column < tabsize)
 		break;
 	      FALLTHROUGH;
@@ -648,7 +644,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
 	      break;
 	    }

-	  ch1prev = ch1;
+	  c1prev = c1;
 	}
    }

@@ -699,11 +695,11 @@ find_and_hash_each_line (struct file_data *current)
 		  h = hash (h, ig_case ? tolower (c) : c);
 	      }
 	  else
-	    for (mbcel_t g; *p != '\n'; p += g.len)
+	    for (mcel_t g; *p != '\n'; p += g.len)
 	      {
-		g = mbcel_scan (p, suffix_begin);
-		if (! c32isspace (g.ch))
-		  h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
+		g = mcel_scan (p, suffix_begin);
+		if (! ucore_is (c32isspace, g.c))
+		  h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c));
 	      }
          break;

@@ -728,25 +724,25 @@ find_and_hash_each_line (struct file_data *current)
 		h = hash (h, ig_case ? tolower (c) : c);
 	      }
 	  else
-	    for (mbcel_t g; *p != '\n'; p += g.len)
+	    for (mcel_t g; *p != '\n'; p += g.len)
 	      {
-		g = mbcel_scan (p, suffix_begin);
-		if (c32isspace (g.ch))
+		g = mcel_scan (p, suffix_begin);
+		if (ucore_is (c32isspace, g.c))
 		  {
 		    do
 		      {
 			p += g.len;
 			if (*p == '\n')
 			  goto hashing_done;
-			g = mbcel_scan (p, suffix_begin);
+			g = mcel_scan (p, suffix_begin);
 		      }
-		    while (c32isspace (g.ch));
+		    while (ucore_is (c32isspace, g.c));

 		    h = hash (h, ' ');
 		  }

 		/* G is now the first non-space.  */
-		h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
+		h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c);
 	      }
          break;

@@ -817,39 +813,35 @@ find_and_hash_each_line (struct file_data *current)
 		  while (--repetitions != 0);
 		}
 	    else
-	      for (mbcel_t g; *p != '\n'; p += g.len)
+	      for (mcel_t g; *p != '\n'; p += g.len)
 		{
 		  intmax_t repetitions = 1;

-		  g = mbcel_scan (p, suffix_begin);
-		  char32_t ch;
-		  if (g.err)
-		    {
-		      ch = -g.err;
-		      column++;
-		    }
+		  g = mcel_scan (p, suffix_begin);
+		  ucore_t c = g.c;
+		  if (ucore_iserr (c))
+		    column++;
 		  else
 		    {
-		      ch = g.ch;
 		      if (ig_white_space & IGNORE_TRAILING_SPACE
-			  && c32isspace (ch))
+			  && ucore_is (c32isspace, c))
 			{
 			  char const *p1 = p + g.len;
-			  for (mbcel_t g1; ; p1 += g1.len)
+			  for (mcel_t g1; ; p1 += g1.len)
 			    {
 			      if (*p1 == '\n')
 				{
 				  p = p1;
 				  goto hashing_done;
 				}
-			      g1 = mbcel_scan (p1, suffix_begin);
-			      if (! c32isspace (g1.ch))
+			      g1 = mcel_scan (p1, suffix_begin);
+			      if (! ucore_is (c32isspace, g1.c))
 				break;
 			    }
 			}

 		      if (ig_white_space & IGNORE_TAB_EXPANSION)
-			switch (ch)
+			switch (c)
 			  {
 			  case '\b':
 			    if (0 < column)
@@ -862,7 +854,7 @@ find_and_hash_each_line (struct file_data *current)
 			    break;

 			  case '\t':
-			    ch = ' ';
+			    c = ' ';
 			    repetitions = tabsize - column % tabsize;
 			    tab += column / tabsize + 1;
 			    column = 0;
@@ -876,16 +868,16 @@ find_and_hash_each_line (struct file_data *current)
 			    break;

 			  default:
-			    column += c32width (ch);
+			    column += c32width (c);
 			    break;
 			  }

 		      if (ig_case)
-			ch = c32tolower (ch);
+			c = c32tolower (c);
 		    }

 		  do
-		    h = hash (h, ch);
+		    h = hash (h, c);
 		  while (--repetitions != 0);
 		}
          }
@@ -904,16 +896,16 @@ find_and_hash_each_line (struct file_data *current)
 	  else
 	    {
 	      if (ig_case)
-		for (mbcel_t g; *p != '\n'; p += g.len)
+		for (mcel_t g; *p != '\n'; p += g.len)
 		  {
-		    g = mbcel_scan (p, suffix_begin);
-		    h = hash (h, c32tolower (g.ch) - g.err);
+		    g = mcel_scan (p, suffix_begin);
+		    h = hash (h, ucore_to (c32tolower, g.c));
 		  }
 	      else
-		for (mbcel_t g; *p != '\n'; p += g.len)
+		for (mcel_t g; *p != '\n'; p += g.len)
 		  {
-		    g = mbcel_scan (p, suffix_begin);
-		    h = hash (h, g.ch - g.err);
+		    g = mcel_scan (p, suffix_begin);
+		    h = hash (h, g.c);
 		  }
 	    }
          break;
--- a/src/side.c
+++ b/src/side.c
@@ -22,7 +22,7 @@

 #include "diff.h"

-#include <mbcel.h>
+#include <mcel.h>

 static void print_sdiff_common_lines (lin, lin);
 static void print_sdiff_hunk (struct change *);
@@ -145,8 +145,8 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
 	    /* A byte that might start a multibyte character.
 	       Increase TEXT_POINTER, counting columns.
 	       Assume encoding errors have print width 1.  */
-	    mbcel_t g = mbcel_scan (tp0, text_limit);
-	    int width = g.err ? 1 : c32width (g.ch);
+	    mcel_t g = mcel_scan (tp0, text_limit);
+	    int width = ucore_iserr (g.c) ? 1 : c32width (g.c);
 	    if (0 < width && ckd_add (&in_position, in_position, width))
 	      return out_position;