mirror of
https://https.git.savannah.gnu.org/git/diffutils.git
synced 2026-01-27 01:44:20 +00:00
diff: modularize and tune mcel code
Go back to a single mcel module, instead of trying to break it up into ucore and mcel pieces, as breaking it up hurt performance. Use gnulib-tool’s --local-dir to create diffutils-specific modules for mcel; the idea is that this will eventually migrate into Gnulib. * bootstrap.conf (avoided_gnulib_modules): Add mbuiterf. (gnulib_modules): Add mbscasecmp, mcel-prefer. (gnulib_tool_option_extras): Add --local-dir=gl to pick up new files. * cfg.mk (exclude_file_name_regexp--sc_prohibit_doubled_word): Do not exclude now-removed files lib/ucore.c, lib/ucore.h. * lib/Makefile.am: Adjust to use of modules. (noinst_HEADERS): Remove mcel.h, ucore.h. (libdiffutils_a_SOURCES): Remove mcel.c, mcel-casecmp.c, ucore.c. * lib/mcel-casecmp.c, lib/ucore.c, lib/ucore.h: Remove. * lib/mcel.h: Switch to LGPLv2.1+. Do not include ucore.h. All uses of ucore_t changed back to using char32_t. Do what ucore.h used to do: include verify.h, limits.h, stddef.h, uchar.h; require config.h, define _GL_LIKELY, _GL_UNLIKELY. (MCEL_CHAR_MAX, MCEL_ERR_MIN, MCEL_ERR_MAX): New constants. (mcel_t): Switch from single ucore_t c to a char32_t ch and unsigned char err. This has significantly better performance on Fedora 38 x86-64. All uses changed. Check that unsigned char promotes to int. (mcel_ch, mcel_err, mcel_cmp, mcel_tocmp): New functions. (MCEL_ERR_SHIFT): Rename from MCEL_ENCODING_ERROR_SHIFT. All uses changed. (mcel_isbasic): Add a _GL_LIKELY to help compilers. All uses changed. (mcel_scan, mcel_scant): Simplify by using mcel_ch, mcel_err. (mcel_casecmp): Remove decl. Callers changed to use mbscasecmp. * gl/lib/mcel.c, gl/lib/mcel.h: Rename from lib/mcel.c, lib/mcel.h. * gl/lib/mbscasecmp.c: New file. * gl/modules/mcel, gl/modules/mcel-prefer, gl/modules/mcel-tests: * gl/tests/test-mcel.c: New files. * src/io.c: Revert use of ucore API. Use plain c32isspace etc. instead of ucore_is. Use .err instead of ucore_iserr. (same_ch_err): Bring back, and use it instead of ucore_cmp. 
* src/side.c (print_half_line): Use .err instead of ucore_iserr.
This commit is contained in:
parent
574e81bff2
commit
ae1cdc7239
@ -18,6 +18,7 @@
|
||||
avoided_gnulib_modules='
|
||||
--avoid=localename
|
||||
--avoid=lock-tests
|
||||
--avoid=mbuiterf
|
||||
--avoid=setlocale
|
||||
'
|
||||
|
||||
@ -73,7 +74,8 @@ largefile
|
||||
lstat
|
||||
maintainer-makefile
|
||||
manywarnings
|
||||
mbrtoc32
|
||||
mbscasecmp
|
||||
mcel-prefer
|
||||
mempcpy
|
||||
minmax
|
||||
mkstemp
|
||||
@ -140,6 +142,7 @@ XGETTEXT_OPTIONS=$XGETTEXT_OPTIONS'\\\
|
||||
'
|
||||
|
||||
gnulib_tool_option_extras="--tests-base=gnulib-tests
|
||||
--local-dir=gl
|
||||
--with-tests
|
||||
--symlink
|
||||
--makefile-name=gnulib.mk
|
||||
|
||||
3
cfg.mk
3
cfg.mk
@ -74,8 +74,7 @@ config-save:
|
||||
cp lib/config.h config.status $(_cf_state_dir)/latest
|
||||
|
||||
exclude_file_name_regexp--sc_space_tab = ^gl/lib/.*\.c\.diff$$
|
||||
exclude_file_name_regexp--sc_prohibit_doubled_word = \
|
||||
^(tests/y2038-vs-32bit|lib/ucore\.h)$$
|
||||
exclude_file_name_regexp--sc_prohibit_doubled_word = ^tests/y2038-vs-32bit$$
|
||||
|
||||
# Tell gnulib's tight_scope rule that we mark externs with XTERN
|
||||
export _gl_TS_extern = extern|XTERN|DIFF_INLINE|SYSTEM_INLINE|SYSTEM_EXTERN
|
||||
|
||||
112
gl/lib/mbscasecmp.c
Normal file
112
gl/lib/mbscasecmp.c
Normal file
@ -0,0 +1,112 @@
|
||||
/* Case-insensitive string comparison function.
|
||||
Copyright (C) 1998-1999, 2005-2023 Free Software Foundation, Inc.
|
||||
Written by Bruno Haible <bruno@clisp.org>, 2005,
|
||||
based on earlier glibc code.
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
/* Specification. */
|
||||
#include <string.h>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <limits.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#if GNULIB_MCEL_PREFER
|
||||
# include "mcel.h"
|
||||
#else
|
||||
# include "mbuiterf.h"
|
||||
#endif
|
||||
|
||||
/* Compare the character strings S1 and S2, ignoring case, returning less than,
|
||||
equal to or greater than zero if S1 is lexicographically less than, equal to
|
||||
or greater than S2.
|
||||
Note: This function may, in multibyte locales, return 0 for strings of
|
||||
different lengths! */
|
||||
int
|
||||
mbscasecmp (const char *s1, const char *s2)
|
||||
{
|
||||
if (s1 == s2)
|
||||
return 0;
|
||||
|
||||
char const *p1 = s1;
|
||||
char const *p2 = s2;
|
||||
|
||||
/* Be careful not to look at the entire extent of s1 or s2 until needed.
|
||||
This is useful because when two strings differ, the difference is
|
||||
most often already in the very few first characters. */
|
||||
if (MB_CUR_MAX > 1)
|
||||
{
|
||||
#if GNULIB_MCEL_PREFER
|
||||
while (true)
|
||||
{
|
||||
mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
|
||||
mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
|
||||
int cmp = mcel_tocmp (c32tolower, g1, g2);
|
||||
if (cmp | !g1.ch)
|
||||
return cmp;
|
||||
}
|
||||
#else
|
||||
mbuif_state_t state1;
|
||||
const char *iter1;
|
||||
mbuif_init (state1);
|
||||
iter1 = s1;
|
||||
|
||||
mbuif_state_t state2;
|
||||
const char *iter2;
|
||||
mbuif_init (state2);
|
||||
iter2 = s2;
|
||||
|
||||
while (mbuif_avail (state1, iter1) && mbuif_avail (state2, iter2))
|
||||
{
|
||||
mbchar_t cur1 = mbuif_next (state1, iter1);
|
||||
mbchar_t cur2 = mbuif_next (state2, iter2);
|
||||
int cmp = mb_casecmp (cur1, cur2);
|
||||
|
||||
if (cmp != 0)
|
||||
return cmp;
|
||||
|
||||
iter1 += mb_len (cur1);
|
||||
iter2 += mb_len (cur2);
|
||||
}
|
||||
if (mbuif_avail (state1, iter1))
|
||||
/* s2 terminated before s1. */
|
||||
return 1;
|
||||
if (mbuif_avail (state2, iter2))
|
||||
/* s1 terminated before s2. */
|
||||
return -1;
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
else
|
||||
while (true)
|
||||
{
|
||||
unsigned char c1 = *p1++;
|
||||
unsigned char c2 = *p2++;
|
||||
/* On machines where 'char' and 'int' are types of the same size, the
|
||||
difference of two 'unsigned char' values - including the sign bit -
|
||||
doesn't fit in an 'int'. */
|
||||
int cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
|
||||
if (cmp)
|
||||
{
|
||||
c1 = tolower (c1);
|
||||
c2 = tolower (c2);
|
||||
cmp = UCHAR_MAX <= INT_MAX ? c1 - c2 : _GL_CMP (c1, c2);
|
||||
}
|
||||
if (cmp | !c1)
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
@ -3,7 +3,7 @@
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 3 of the
|
||||
published by the Free Software Foundation; either version 2.1 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
@ -16,7 +16,14 @@
|
||||
|
||||
/* Written by Paul Eggert. */
|
||||
|
||||
/* The mcel_scan function lets code iterate through an array of bytes,
|
||||
/* The macros in this file implement multi-byte character representation
|
||||
and forward iteration through a multi-byte string.
|
||||
They are simpler and typically faster than the mbiter family.
|
||||
However, they do not support obsolescent encodings like CP864,
|
||||
EBCDIC, Johab, and Shift JIS that glibc also does not support,
|
||||
and it is up to the caller to coalesce encoding-error bytes if desired.
|
||||
|
||||
The mcel_scan function lets code iterate through an array of bytes,
|
||||
supporting character encodings in practical use
|
||||
more simply than using plain mbrtoc32.
|
||||
|
||||
@ -35,8 +42,11 @@
|
||||
process (g);
|
||||
}
|
||||
|
||||
You can select from G using G.ch, G.err, and G.len.
|
||||
G is an encoding error if G.err is nonzero, a character otherwise.
|
||||
|
||||
The mcel_scanz function is similar except it works with a
|
||||
string of unknown length that is terminated with '\0'.
|
||||
string of unknown but positive length that is terminated with '\0'.
|
||||
Instead of this single-byte code:
|
||||
|
||||
char *p = ...;
|
||||
@ -57,12 +67,16 @@
|
||||
'\n', '.', '/' are safe, as they cannot be a part (even a trailing
|
||||
byte) of a multi-byte character.
|
||||
|
||||
You can select from G using G.c and G.len.
|
||||
You can use ucore_* functions on G.c, e.g., ucore_iserr (G.c),
|
||||
ucore_is (c32isalpha, G.c), and ucore_to (c32tolower, G.c).
|
||||
mcel_ch (CH, LEN) and mcel_err (ERR) construct mcel_t values.
|
||||
|
||||
mcel_strcasecmp compares two null-terminated multi-byte strings
|
||||
lexicographically, ignoring case.
|
||||
mcel_cmp (G1, G2) compares two mcel_t values lexicographically by
|
||||
character or by encoding byte value, with encoding bytes sorting
|
||||
after characters.
|
||||
|
||||
Calls like c32isalpha (G.ch) test G; they return false for encoding
|
||||
errors since calls like c32isalpha (0) return false. Calls like
|
||||
mcel_tocmp (c32tolower, G1, G2) are like mcel_cmp (G1, G2),
|
||||
but transliterate first.
|
||||
|
||||
Although ISO C and POSIX allow encodings that have shift states or
|
||||
that can produce multiple characters from an indivisible byte sequence,
|
||||
@ -73,9 +87,20 @@
|
||||
#ifndef _MCEL_H
|
||||
#define _MCEL_H 1
|
||||
|
||||
/* This API is an extension of ucore.h. Programs that include this
|
||||
file can assume ucore.h is included too. */
|
||||
#include <ucore.h>
|
||||
#if !_GL_CONFIG_H_INCLUDED
|
||||
#error "Please include config.h first."
|
||||
#endif
|
||||
|
||||
#include <verify.h>
|
||||
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
#include <uchar.h>
|
||||
|
||||
/* Pacify GCC re type limits. */
|
||||
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
|
||||
# pragma GCC diagnostic ignored "-Wtype-limits"
|
||||
#endif
|
||||
|
||||
/* The maximum multi-byte character length supported on any platform.
|
||||
This can be less than MB_LEN_MAX because many platforms have a
|
||||
@ -85,24 +110,41 @@
|
||||
0 < MB_CUR_MAX <= MCEL_LEN_MAX <= MB_LEN_MAX. */
|
||||
enum { MCEL_LEN_MAX = MB_LEN_MAX < 4 ? MB_LEN_MAX : 4 };
|
||||
|
||||
/* mcel_t is a type representing a character or encoding error C,
|
||||
along with a count of the LEN bytes that represent C.
|
||||
1 <= LEN <= MB_LEN_MAX. */
|
||||
/* Bounds for mcel_t members. */
|
||||
enum { MCEL_CHAR_MAX = 0x10FFFF };
|
||||
enum { MCEL_ERR_MIN = 0x80 };
|
||||
enum { MCEL_ERR_MAX = UCHAR_MAX };
|
||||
|
||||
/* mcel_t is a type representing a character CH or an encoding error byte ERR,
|
||||
along with a count of the LEN bytes that represent CH or ERR.
|
||||
If ERR is zero, CH is a valid character and 0 < LEN <= MCEL_LEN_MAX;
|
||||
otherwise ERR is an encoding error byte, MCEL_ERR_MIN <= ERR <= MCEL_ERR_MAX,
|
||||
CH == 0, and LEN == 1. */
|
||||
typedef struct
|
||||
{
|
||||
ucore_t c;
|
||||
char32_t ch;
|
||||
unsigned char err;
|
||||
unsigned char len;
|
||||
} mcel_t;
|
||||
|
||||
/* Every multi-byte character length fits in mcel_t's LEN. */
|
||||
static_assert (MB_LEN_MAX <= UCHAR_MAX);
|
||||
|
||||
/* Shifting an encoding error byte left by this value
|
||||
suffices to sort encoding errors after characters. */
|
||||
enum { MCEL_ERR_SHIFT = 14 };
|
||||
static_assert (MCEL_CHAR_MAX < MCEL_ERR_MIN << MCEL_ERR_SHIFT);
|
||||
|
||||
/* Unsigned char promotes to int. */
|
||||
static_assert (UCHAR_MAX <= INT_MAX);
|
||||
|
||||
/* Bytes have 8 bits, as POSIX requires. */
|
||||
static_assert (CHAR_BIT == 8);
|
||||
|
||||
/* Pacify GCC re 'c <= 0x7f' below. */
|
||||
#if defined __GNUC__ && 4 < __GNUC__ + (3 <= __GNUC_MINOR__)
|
||||
# pragma GCC diagnostic ignored "-Wtype-limits"
|
||||
#ifndef _GL_LIKELY
|
||||
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
|
||||
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
|
||||
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
|
||||
#endif
|
||||
|
||||
_GL_INLINE_HEADER_BEGIN
|
||||
@ -110,18 +152,44 @@ _GL_INLINE_HEADER_BEGIN
|
||||
# define MCEL_INLINE _GL_INLINE
|
||||
#endif
|
||||
|
||||
/* With mcel there should be no need for the performance overhead of
|
||||
replacing glibc mbrtoc32, as callers shouldn't care whether the
|
||||
C locale treats a byte with the high bit set as an encoding error. */
|
||||
#ifdef __GLIBC__
|
||||
# undef mbrtoc32
|
||||
#endif
|
||||
/* mcel_t constructors. */
|
||||
MCEL_INLINE mcel_t
|
||||
mcel_ch (char32_t ch, size_t len)
|
||||
{
|
||||
assume (0 < len);
|
||||
assume (len <= MCEL_LEN_MAX);
|
||||
assume (ch <= MCEL_CHAR_MAX);
|
||||
return (mcel_t) {ch: ch, len: len};
|
||||
}
|
||||
MCEL_INLINE mcel_t
|
||||
mcel_err (unsigned char err)
|
||||
{
|
||||
assume (MCEL_ERR_MIN <= err);
|
||||
assume (err <= MCEL_ERR_MAX);
|
||||
return (mcel_t) {err: err, len: 1};
|
||||
}
|
||||
|
||||
/* Shifting an encoding error byte (at least 0x80) left by this value
|
||||
yields a value in the range UCORE_ERR_MIN .. 2*UCORE_ERR_MIN - 1.
|
||||
This suffices to sort encoding errors after characters. */
|
||||
enum { MCEL_ENCODING_ERROR_SHIFT = 14 };
|
||||
static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
|
||||
/* Compare C1 and C2, with encoding errors sorting after characters.
|
||||
Return <0, 0, >0 for <, =, >. */
|
||||
MCEL_INLINE int
|
||||
mcel_cmp (mcel_t c1, mcel_t c2)
|
||||
{
|
||||
int ch1 = c1.ch, ch2 = c2.ch;
|
||||
return ((c1.err - c2.err) * (1 << MCEL_ERR_SHIFT)) + (ch1 - ch2);
|
||||
}
|
||||
|
||||
/* Apply the uchar translator TO to C1 and C2 and compare the results,
|
||||
with encoding errors sorting after characters,
|
||||
Return <0, 0, >0 for <, =, >. */
|
||||
MCEL_INLINE int
|
||||
mcel_tocmp (wint_t (*to) (wint_t), mcel_t c1, mcel_t c2)
|
||||
{
|
||||
int cmp = mcel_cmp (c1, c2);
|
||||
if (_GL_LIKELY ((c1.err - c2.err) | !cmp))
|
||||
return cmp;
|
||||
int ch1 = to (c1.ch), ch2 = to (c2.ch);
|
||||
return ch1 - ch2;
|
||||
}
|
||||
|
||||
/* Whether C represents itself as a Unicode character
|
||||
when it is the first byte of a single- or multi-byte character.
|
||||
@ -130,9 +198,16 @@ static_assert (UCORE_ERR_MIN == 0x80 << MCEL_ENCODING_ERROR_SHIFT);
|
||||
MCEL_INLINE bool
|
||||
mcel_isbasic (char c)
|
||||
{
|
||||
return 0 <= c && c <= 0x7f;
|
||||
return _GL_LIKELY (0 <= c && c <= 0x7f);
|
||||
}
|
||||
|
||||
/* With mcel there should be no need for the performance overhead of
|
||||
replacing glibc mbrtoc32, as callers shouldn't care whether the
|
||||
C locale treats a byte with the high bit set as an encoding error. */
|
||||
#ifdef __GLIBC__
|
||||
# undef mbrtoc32
|
||||
#endif
|
||||
|
||||
/* Scan bytes from P inclusive to LIM exclusive. P must be less than LIM.
|
||||
Return the character or encoding error starting at P. */
|
||||
MCEL_INLINE mcel_t
|
||||
@ -141,8 +216,8 @@ mcel_scan (char const *p, char const *lim)
|
||||
/* Handle ASCII quickly to avoid the overhead of calling mbrtoc32.
|
||||
In supported encodings, the first byte of a multi-byte character
|
||||
cannot be an ASCII byte. */
|
||||
if (_GL_LIKELY (mcel_isbasic (*p)))
|
||||
return (mcel_t) { .c = *p, .len = 1 };
|
||||
if (mcel_isbasic (*p))
|
||||
return mcel_ch (*p, 1);
|
||||
|
||||
/* An initial mbstate_t; initialization optimized for some platforms.
|
||||
For details about these and other platforms, see wchar.in.h. */
|
||||
@ -171,29 +246,17 @@ mcel_scan (char const *p, char const *lim)
|
||||
mbstate_t mbs = {0};
|
||||
#endif
|
||||
|
||||
char32_t c;
|
||||
size_t len = mbrtoc32 (&c, p, lim - p, &mbs);
|
||||
char32_t ch;
|
||||
size_t len = mbrtoc32 (&ch, p, lim - p, &mbs);
|
||||
|
||||
/* Any LEN with top bit set is an encoding error, as LEN == (size_t) -3
|
||||
is not supported and MB_LEN_MAX is small. */
|
||||
if (_GL_LIKELY (len <= (size_t) -1 / 2))
|
||||
{
|
||||
/* A multi-byte character. LEN must be positive,
|
||||
as *P != '\0' and shift sequences are not supported. */
|
||||
assume (0 < len);
|
||||
assume (len <= MB_LEN_MAX);
|
||||
assume (c <= UCORE_CHAR_MAX);
|
||||
return (mcel_t) { .c = c, .len = len };
|
||||
}
|
||||
else
|
||||
{
|
||||
/* An encoding error. */
|
||||
unsigned char b = *p;
|
||||
c = b << MCEL_ENCODING_ERROR_SHIFT;
|
||||
assume (UCORE_ERR_MIN <= c);
|
||||
assume (c <= UCORE_ERR_MAX);
|
||||
return (mcel_t) { .c = c, .len = 1 };
|
||||
}
|
||||
if (_GL_UNLIKELY ((size_t) -1 / 2 < len))
|
||||
return mcel_err (*p);
|
||||
|
||||
/* A multi-byte character. LEN must be positive,
|
||||
as *P != '\0' and shift sequences are not supported. */
|
||||
return mcel_ch (ch, len);
|
||||
}
|
||||
|
||||
/* Scan bytes from P, a byte sequence terminated by TERMINATOR.
|
||||
@ -205,11 +268,11 @@ MCEL_INLINE mcel_t
|
||||
mcel_scant (char const *p, char terminator)
|
||||
{
|
||||
/* Handle ASCII quickly for speed. */
|
||||
if (_GL_LIKELY (mcel_isbasic (*p)))
|
||||
return (mcel_t) { .c = *p, .len = 1 };
|
||||
if (mcel_isbasic (*p))
|
||||
return mcel_ch (*p, 1);
|
||||
|
||||
/* Defer to mcel_scan for non-ASCII. Compute length with code that
|
||||
is typically branch-free and faster than memchr or strnlen. */
|
||||
is typically faster than strnlen. */
|
||||
char const *lim = p + 1;
|
||||
for (int i = 0; i < MCEL_LEN_MAX - 1; i++)
|
||||
lim += *lim != terminator;
|
||||
@ -226,11 +289,6 @@ mcel_scanz (char const *p)
|
||||
return mcel_scant (p, '\0');
|
||||
}
|
||||
|
||||
/* Compare the multi-byte strings S1 and S2 lexicographically, ignoring case.
|
||||
Return <0, 0, >0 for <, =, >. Consider encoding errors to be
|
||||
greater than characters and compare them byte by byte. */
|
||||
int mcel_casecmp (char const *s1, char const *s2);
|
||||
|
||||
_GL_INLINE_HEADER_END
|
||||
|
||||
#endif /* _MCEL_H */
|
||||
34
gl/modules/mcel
Normal file
34
gl/modules/mcel
Normal file
@ -0,0 +1,34 @@
|
||||
Description:
|
||||
Multibyte Characters, Encoding errors, and Lengths
|
||||
|
||||
Files:
|
||||
lib/mcel.c
|
||||
lib/mcel.h
|
||||
|
||||
Depends-on:
|
||||
assert-h
|
||||
extern-inline
|
||||
limits-h
|
||||
mbrtoc32
|
||||
stdbool
|
||||
uchar
|
||||
verify
|
||||
|
||||
configure.ac:
|
||||
|
||||
Makefile.am:
|
||||
lib_SOURCES += mcel.c mcel.h
|
||||
|
||||
Include:
|
||||
"mcel.h"
|
||||
|
||||
Link:
|
||||
$(LTLIBUNISTRING) when linking with libtool, $(LIBUNISTRING) otherwise
|
||||
$(MBRTOWC_LIB)
|
||||
$(LTLIBC32CONV) when linking with libtool, $(LIBC32CONV) otherwise
|
||||
|
||||
License:
|
||||
LGPLv2+
|
||||
|
||||
Maintainer:
|
||||
all
|
||||
24
gl/modules/mcel-prefer
Normal file
24
gl/modules/mcel-prefer
Normal file
@ -0,0 +1,24 @@
|
||||
Description:
|
||||
mcel is preferred to the mbiter family when either will do.
|
||||
mcel is simpler and faster. However, it does not support some
|
||||
obsolete encodings that are also not supported by glibc locales,
|
||||
and the caller is responsible for coalescing sequences of
|
||||
encoding-error bytes if that is desired.
|
||||
|
||||
Files:
|
||||
|
||||
Depends-on:
|
||||
mcel
|
||||
|
||||
configure.ac:
|
||||
gl_MODULE_INDICATOR([mcel-prefer])
|
||||
|
||||
Makefile.am:
|
||||
|
||||
Include:
|
||||
|
||||
License:
|
||||
LGPLv2+
|
||||
|
||||
Maintainer:
|
||||
Paul Eggert
|
||||
12
gl/modules/mcel-tests
Normal file
12
gl/modules/mcel-tests
Normal file
@ -0,0 +1,12 @@
|
||||
Files:
|
||||
tests/test-mcel.c
|
||||
|
||||
Depends-on:
|
||||
assert-h
|
||||
setlocale
|
||||
|
||||
configure.ac:
|
||||
|
||||
Makefile.am:
|
||||
TESTS += test-mcel
|
||||
check_PROGRAMS += test-mcel
|
||||
138
gl/tests/test-mcel.c
Normal file
138
gl/tests/test-mcel.c
Normal file
@ -0,0 +1,138 @@
|
||||
/* Test <mcel.h>
|
||||
Copyright 2023 Free Software Foundation, Inc.
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#include <mcel.h>
|
||||
|
||||
#include <locale.h>
|
||||
|
||||
#include "macros.h"
|
||||
|
||||
/* Translator used by the mcel_tocmp tests: keep only the low-order
   seven bits of C.  */
static wint_t
to_ascii (wint_t c)
{
  wint_t folded = c & 0x7f;
  return folded;
}
|
||||
|
||||
/* Return the sign of I: -1 if negative, 0 if zero, 1 if positive.  */
static int
sgn (int i)
{
  if (i < 0)
    return -1;
  return i > 0;
}
|
||||
|
||||
/* Check that the mcel result C is consistent with an mbrtoc32 call
   on the same bytes that returned N and stored CH, where UC is the
   first byte scanned.
   C must be an encoding error exactly when N exceeds MB_LEN_MAX
   (presumably mbrtoc32's (size_t) -1 or (size_t) -2 -- confirm).
   An encoding error must carry UC in err, with ch == 0 and len == 1.
   A character must have ch == CH and len == N, or len == 1 if N is zero.  */
static void
|
||||
test_mcel_vs_mbrtoc32 (unsigned char uc, mcel_t c, size_t n, char32_t ch)
|
||||
{
|
||||
ASSERT (!c.err == (n <= MB_LEN_MAX));
|
||||
ASSERT (c.err
|
||||
? c.err == uc && c.ch == 0 && c.len == 1
|
||||
: c.ch == ch && c.len == (n ? n : 1));
|
||||
}
|
||||
|
||||
int
|
||||
main (void)
|
||||
{
|
||||
/* configure should already have checked that the locale is supported. */
|
||||
if (setlocale (LC_ALL, "") == NULL)
|
||||
return 1;
|
||||
|
||||
mcel_t prev;
|
||||
for (int ch = 0; ch < 0x80; ch++)
|
||||
{
|
||||
mcel_t c = mcel_ch (ch, 1);
|
||||
ASSERT (c.ch == ch);
|
||||
ASSERT (c.len == 1);
|
||||
ASSERT (!c.err);
|
||||
ASSERT (mcel_cmp (c, c) == 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
|
||||
if (ch)
|
||||
{
|
||||
ASSERT (mcel_cmp (prev, c) < 0);
|
||||
ASSERT (mcel_cmp (c, prev) > 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
|
||||
}
|
||||
ASSERT (mcel_isbasic (ch));
|
||||
prev = c;
|
||||
}
|
||||
for (char ch = CHAR_MIN; ; ch++)
|
||||
{
|
||||
ASSERT (mcel_isbasic (ch) == (0 <= ch && ch <= 0x7f));
|
||||
if (ch == CHAR_MAX)
|
||||
break;
|
||||
}
|
||||
for (int ch = 0x80; ch < 0x200; ch++)
|
||||
{
|
||||
mcel_t c = mcel_ch (ch, 2);
|
||||
ASSERT (c.ch == ch);
|
||||
ASSERT (c.len == 2);
|
||||
ASSERT (!c.err);
|
||||
ASSERT (mcel_cmp (c, c) == 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
|
||||
ASSERT (mcel_cmp (prev, c) < 0);
|
||||
ASSERT (mcel_cmp (c, prev) > 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
|
||||
int cmp = to_ascii (c.ch) ? -1 : 1;
|
||||
ASSERT (sgn (mcel_tocmp (to_ascii, prev, c)) == cmp);
|
||||
ASSERT (sgn (mcel_tocmp (to_ascii, c, prev)) == -cmp);
|
||||
prev = c;
|
||||
}
|
||||
for (unsigned char err = 0x80; ; err++)
|
||||
{
|
||||
mcel_t c = mcel_err (err);
|
||||
ASSERT (!c.ch);
|
||||
ASSERT (c.len == 1);
|
||||
ASSERT (c.err == err);
|
||||
ASSERT (mcel_cmp (c, c) == 0);
|
||||
ASSERT (mcel_cmp (prev, c) < 0);
|
||||
ASSERT (mcel_cmp (c, prev) > 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, c) == 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, prev, c) < 0);
|
||||
ASSERT (mcel_tocmp (to_ascii, c, prev) > 0);
|
||||
prev = c;
|
||||
if (err == (unsigned char) -1)
|
||||
break;
|
||||
}
|
||||
|
||||
for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
|
||||
for (int j = CHAR_MIN; i <= CHAR_MAX; i++)
|
||||
for (int k = CHAR_MIN; k <= CHAR_MAX; k++)
|
||||
{
|
||||
char const ijk[] = {i, j, k};
|
||||
mbstate_t mbs = {0};
|
||||
char32_t ch;
|
||||
size_t n = mbrtoc32 (&ch, ijk, sizeof ijk, &mbs);
|
||||
mcel_t c = mcel_scan (ijk, ijk + sizeof ijk);
|
||||
test_mcel_vs_mbrtoc32 (i, c, n, ch);
|
||||
|
||||
static char const terminator[] = "\r\n./";
|
||||
for (int ti = 0; ti < sizeof terminator; ti++)
|
||||
{
|
||||
char t = terminator[ti];
|
||||
if (i == t)
|
||||
continue;
|
||||
char const ijkt[] = {i, j, k, t};
|
||||
mcel_t d = mcel_scant (ijk, t);
|
||||
ASSERT (c.ch == d.ch && c.err == d.err && c.len == d.len);
|
||||
if (!t)
|
||||
{
|
||||
mcel_t z = mcel_scanz (ijk);
|
||||
ASSERT (d.ch == z.ch && d.err == z.err && d.len == z.len);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -29,7 +29,7 @@ noinst_HEADERS =
|
||||
|
||||
include gnulib.mk
|
||||
|
||||
noinst_HEADERS += cmpbuf.h diagnose.h mcel.h ucore.h
|
||||
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c mcel.c mcel-casecmp.c ucore.c
|
||||
noinst_HEADERS += cmpbuf.h diagnose.h
|
||||
libdiffutils_a_SOURCES += cmpbuf.c diagnose.c
|
||||
|
||||
AM_CFLAGS += $(GNULIB_WARN_CFLAGS) $(WERROR_CFLAGS)
|
||||
|
||||
@ -1,60 +0,0 @@
|
||||
/* Case-insensitive string comparison function.
|
||||
Copyright 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation, either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Written by Paul Eggert. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
/* Specification. */
|
||||
#include <mcel.h>
|
||||
|
||||
#include <ctype.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
int
|
||||
mcel_casecmp (char const *s1, char const *s2)
|
||||
{
|
||||
char const *p1 = s1;
|
||||
char const *p2 = s2;
|
||||
|
||||
/* Do not look at the entire extent of S1 or S2 until needed:
|
||||
when two strings differ, the difference is typically early. */
|
||||
if (MB_CUR_MAX == 1)
|
||||
while (true)
|
||||
{
|
||||
static_assert (UCHAR_MAX <= INT_MAX);
|
||||
unsigned char c1 = *p1++;
|
||||
unsigned char c2 = *p2++;
|
||||
int cmp = c1 - c2;
|
||||
if (_GL_UNLIKELY (cmp))
|
||||
{
|
||||
c1 = tolower (c1);
|
||||
c2 = tolower (c2);
|
||||
cmp = c1 - c2;
|
||||
}
|
||||
if (_GL_UNLIKELY (cmp | !c1))
|
||||
return cmp;
|
||||
}
|
||||
else
|
||||
while (true)
|
||||
{
|
||||
mcel_t g1 = mcel_scanz (p1); p1 += g1.len;
|
||||
mcel_t g2 = mcel_scanz (p2); p2 += g2.len;
|
||||
int cmp = ucore_tocmp (c32tolower, g1.c, g2.c);
|
||||
if (_GL_UNLIKELY (cmp | !g1.c))
|
||||
return cmp;
|
||||
}
|
||||
}
|
||||
@ -1,3 +0,0 @@
|
||||
#include <config.h>
|
||||
#define UCORE_INLINE _GL_EXTERN_INLINE
|
||||
#include "ucore.h"
|
||||
132
lib/ucore.h
132
lib/ucore.h
@ -1,132 +0,0 @@
|
||||
/* Unicode Characters OR Encoding errors (UCOREs)
|
||||
Copyright 2023 Free Software Foundation, Inc.
|
||||
|
||||
This file is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License as
|
||||
published by the Free Software Foundation; either version 3 of the
|
||||
License, or (at your option) any later version.
|
||||
|
||||
This file is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public License
|
||||
along with this program. If not, see <https://www.gnu.org/licenses/>. */
|
||||
|
||||
/* Written by Paul Eggert. */
|
||||
|
||||
/* This API's fundamental type ucore_t represents
|
||||
a single Unicode character or an encoding error.
|
||||
ucore_iserr (C) tests whether C is an encoding error.
|
||||
ucore_is (P, C) etc. test whether char class P accepts C.
|
||||
ucore_to (TO, C) etc. use TO to convert C.
|
||||
ucore_cmp (C1, C2) and ucore_tocmp (TO, C1, C2) compare C1 and C2,
|
||||
with encoding errors sorting after characters. */
|
||||
|
||||
#ifndef _UCORE_H
|
||||
#define _UCORE_H 1
|
||||
|
||||
#if !_GL_CONFIG_H_INCLUDED
|
||||
#error "Please include config.h first."
|
||||
#endif
|
||||
|
||||
#include <verify.h>
|
||||
|
||||
#include <limits.h>
|
||||
#include <stddef.h>
|
||||
#include <uchar.h>
|
||||
|
||||
/* ucore_t represents a Unicode Character OR Encoding error.
|
||||
If 0 <= C <= UCORE_CHAR_MAX, C represents a Unicode character.
|
||||
If UCORE_ERR_MIN <= C <= UCORE_ERR_MAX, C represents an encoding error.
|
||||
Other ucore_t values C are invalid. */
|
||||
typedef int ucore_t;
|
||||
|
||||
enum {
|
||||
UCORE_CHAR_MAX = 0x10FFFF,
|
||||
UCORE_ERR_MIN = 0x200000,
|
||||
UCORE_ERR_MAX = 2 * UCORE_ERR_MIN - 1
|
||||
};
|
||||
|
||||
/* Information is not lost by encoding errors as integers. */
|
||||
static_assert (UCHAR_MAX <= UCORE_ERR_MAX - UCORE_ERR_MIN);
|
||||
|
||||
/* On glibc platforms, predicates like c32isalnum and c32tolower
|
||||
do the right thing for char32_t values that are not valid characters.
|
||||
POSIX says the behavior is undefined, so play it safe elsewhere.
|
||||
Do not rely on UCORE_C32_SAFE for c32width. */
|
||||
#ifdef __GLIBC__
|
||||
enum { UCORE_C32_SAFE = true };
|
||||
#else
|
||||
enum { UCORE_C32_SAFE = false };
|
||||
#endif
|
||||
|
||||
#ifndef _GL_LIKELY
|
||||
/* Rely on __builtin_expect, as provided by the module 'builtin-expect'. */
|
||||
# define _GL_LIKELY(cond) __builtin_expect ((cond), 1)
|
||||
# define _GL_UNLIKELY(cond) __builtin_expect ((cond), 0)
|
||||
#endif
|
||||
|
||||
_GL_INLINE_HEADER_BEGIN
|
||||
#ifndef UCORE_INLINE
|
||||
# define UCORE_INLINE _GL_INLINE
|
||||
#endif
|
||||
|
||||
/* Return true if C represents an encoding error, false otherwise. */
|
||||
UCORE_INLINE bool
|
||||
ucore_iserr (ucore_t c)
|
||||
{
|
||||
/* (c & UCORE_ERR_MIN) is a bit cheaper than (UCORE_ERR_MIN <= c)
|
||||
with GCC 13 x86-64. */
|
||||
if (_GL_UNLIKELY (c & UCORE_ERR_MIN))
|
||||
{
|
||||
assume (UCORE_ERR_MIN <= c && c <= UCORE_ERR_MAX);
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
assume (0 <= c && c <= UCORE_CHAR_MAX);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Whether the uchar predicate P accepts C, e.g., ucore_is (c32isalpha, C). */
|
||||
UCORE_INLINE bool
|
||||
ucore_is (int (*p) (wint_t), wint_t c)
|
||||
{
|
||||
/* When C is out of range, predicates based on glibc return false.
|
||||
Behavior is undefined on other platforms, so play it safe. */
|
||||
return (UCORE_C32_SAFE || ! ucore_iserr (c)) && p (c);
|
||||
}
|
||||
|
||||
/* Apply the uchar translator TO to C, e.g., ucore_to (c32tolower, C). */
|
||||
UCORE_INLINE wint_t
|
||||
ucore_to (wint_t (*to) (wint_t), ucore_t c)
|
||||
{
|
||||
return UCORE_C32_SAFE || ! ucore_iserr (c) ? to (c) : c;
|
||||
}
|
||||
|
||||
/* Compare C1 and C2, with encoding errors sorting after characters.
|
||||
Return <0, 0, >0 for <, =, >. */
|
||||
UCORE_INLINE int
|
||||
ucore_cmp (ucore_t c1, ucore_t c2)
|
||||
{
|
||||
return c1 - c2;
|
||||
}
|
||||
|
||||
/* Apply the uchar translater TO to C1 and C2 and compare the results,
|
||||
with encoding errors sorting after characters,
|
||||
Return <0, 0, >0 for <, =, >. */
|
||||
UCORE_INLINE int
|
||||
ucore_tocmp (wint_t (*to) (wint_t), ucore_t c1, ucore_t c2)
|
||||
{
|
||||
if (c1 == c2)
|
||||
return 0;
|
||||
int i1 = ucore_to (to, c1), i2 = ucore_to (to, c2);
|
||||
return i1 - i2;
|
||||
}
|
||||
|
||||
_GL_INLINE_HEADER_END
|
||||
|
||||
#endif /* _MCEL_H */
|
||||
132
src/io.c
132
src/io.c
@ -230,6 +230,14 @@ slurp (struct file_data *current)
|
||||
}
|
||||
}
|
||||
|
||||
/* Return true if the pair (CH1, ERR1) denotes the same character or
   encoding error as the pair (CH2, ERR2), i.e., both the character
   values and the error codes match.  */
static bool
same_ch_err (char32_t ch1, unsigned char err1, char32_t ch2, unsigned char err2)
{
  return ch1 == ch2 && err1 == err2;
}
|
||||
|
||||
/* Compare lines S1 of length S1LEN and S2 of length S2LEN (typically
|
||||
one line from each input file) according to the command line options.
|
||||
Line lengths include the trailing newline.
|
||||
@ -427,7 +435,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
{
|
||||
char const *lim1 = s1 + s1len;
|
||||
char const *lim2 = s2 + s2len;
|
||||
ucore_t c1prev = 0;
|
||||
char32_t ch1prev = 0;
|
||||
|
||||
while (true)
|
||||
{
|
||||
@ -435,27 +443,27 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
mcel_t g2 = mcel_scan (t2, lim2);
|
||||
t1 += g1.len;
|
||||
t2 += g2.len;
|
||||
ucore_t c1 = g1.c;
|
||||
ucore_t c2 = g2.c;
|
||||
char32_t ch1 = g1.ch;
|
||||
char32_t ch2 = g2.ch;
|
||||
|
||||
/* Test for exact equality first, since it's a common case. */
|
||||
if (ucore_cmp (c1, c2) != 0)
|
||||
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
|
||||
{
|
||||
switch (ignore_white_space)
|
||||
{
|
||||
case IGNORE_ALL_SPACE:
|
||||
/* For -w, just skip past any white space. */
|
||||
while (c1 != '\n' && ! ucore_is (c32isspace, c1))
|
||||
while (ch1 != '\n' && c32isspace (ch1))
|
||||
{
|
||||
g1 = mcel_scan (t1, lim1);
|
||||
t1 += g1.len;
|
||||
c1 = g1.c;
|
||||
ch1 = g1.ch;
|
||||
}
|
||||
while (c2 != '\n' && ucore_is (c32isspace, c2))
|
||||
while (ch2 != '\n' && c32isspace (ch2))
|
||||
{
|
||||
g2 = mcel_scan (t2, lim2);
|
||||
t2 += g2.len;
|
||||
c2 = g2.c;
|
||||
ch2 = g2.ch;
|
||||
}
|
||||
break;
|
||||
|
||||
@ -463,48 +471,46 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
/* For -b, advance past any sequence of white space in
|
||||
line 1 and consider it just one space, or nothing at
|
||||
all if it is at the end of the line. */
|
||||
if (ucore_is (c32isspace, c1))
|
||||
while (c1 != '\n')
|
||||
if (c32isspace (ch1))
|
||||
while (ch1 != '\n')
|
||||
{
|
||||
g1 = mcel_scan (t1, lim1);
|
||||
t1 += g1.len;
|
||||
c1 = g1.c;
|
||||
if (! ucore_is (c32isspace, c1))
|
||||
ch1 = g1.ch;
|
||||
if (! c32isspace (ch1))
|
||||
{
|
||||
t1 -= g1.len;
|
||||
c1 = ' ';
|
||||
ch1 = ' ';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/* Likewise for line 2. */
|
||||
if (ucore_is (c32isspace, c2))
|
||||
while (c2 != '\n')
|
||||
if (c32isspace (ch2))
|
||||
while (ch2 != '\n')
|
||||
{
|
||||
g2 = mcel_scan (t2, lim2);
|
||||
t2 += g2.len;
|
||||
c2 = g2.c;
|
||||
if (! ucore_is (c32isspace, c2))
|
||||
ch2 = g2.ch;
|
||||
if (! c32isspace (ch2))
|
||||
{
|
||||
t2 -= g2.len;
|
||||
c2 = ' ';
|
||||
ch2 = ' ';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (c1 != c2)
|
||||
if (ch1 != ch2)
|
||||
{
|
||||
/* If we went too far when doing the simple test
|
||||
for equality, go back to the first non-white-space
|
||||
character in both sides and try again. */
|
||||
if (c2 == ' ' && c1 != '\n'
|
||||
&& ucore_is (c32isspace, c1prev))
|
||||
if (ch2 == ' ' && ch1 != '\n' && c32isspace (ch1prev))
|
||||
{
|
||||
t1 -= g1.len;
|
||||
continue;
|
||||
}
|
||||
if (c1 == ' ' && c2 != '\n'
|
||||
&& ucore_is (c32isspace, c1prev))
|
||||
if (ch1 == ' ' && ch2 != '\n' && c32isspace (ch1prev))
|
||||
{
|
||||
t2 -= g2.len;
|
||||
continue;
|
||||
@ -515,28 +521,28 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
|
||||
case IGNORE_TRAILING_SPACE:
|
||||
case IGNORE_TAB_EXPANSION_AND_TRAILING_SPACE:
|
||||
if (ucore_is (c32isspace, c1) && ucore_is (c32isspace, c2))
|
||||
if (c32isspace (ch1) && c32isspace (ch2))
|
||||
{
|
||||
if (c1 != '\n')
|
||||
if (ch1 != '\n')
|
||||
{
|
||||
char const *p = t1;
|
||||
while (*p != '\n')
|
||||
{
|
||||
mcel_t g = mcel_scan (p, lim1);
|
||||
if (! ucore_is (c32isspace, g.c))
|
||||
if (c32isspace (g.ch))
|
||||
break;
|
||||
p += g.len;
|
||||
}
|
||||
if (*p != '\n')
|
||||
break;
|
||||
}
|
||||
if (c2 != '\n')
|
||||
if (ch2 != '\n')
|
||||
{
|
||||
char const *p = t2;
|
||||
while (*p != '\n')
|
||||
{
|
||||
mcel_t g = mcel_scan (p, lim2);
|
||||
if (! ucore_is (c32isspace, g.c))
|
||||
if (! c32isspace (g.ch))
|
||||
break;
|
||||
p += g.len;
|
||||
}
|
||||
@ -550,45 +556,45 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
break;
|
||||
FALLTHROUGH;
|
||||
case IGNORE_TAB_EXPANSION:
|
||||
if ((c1 == ' ' && c2 == '\t')
|
||||
|| (c1 == '\t' && c2 == ' '))
|
||||
if ((ch1 == ' ' && ch2 == '\t')
|
||||
|| (ch1 == '\t' && ch2 == ' '))
|
||||
{
|
||||
intmax_t tab2 = tab, column2 = column;
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (c1 == '\t'
|
||||
|| (c1 == ' ' && column == tabsize - 1))
|
||||
if (ch1 == '\t'
|
||||
|| (ch1 == ' ' && column == tabsize - 1))
|
||||
{
|
||||
tab++;
|
||||
column = 0;
|
||||
}
|
||||
else if (c1 == ' ')
|
||||
else if (ch1 == ' ')
|
||||
column++;
|
||||
else
|
||||
break;
|
||||
|
||||
g1 = mcel_scan (t1, lim1);
|
||||
t1 += g1.len;
|
||||
c1 = g1.c;
|
||||
ch1 = g1.ch;
|
||||
}
|
||||
|
||||
while (true)
|
||||
{
|
||||
if (c2 == '\t'
|
||||
|| (c2 == ' ' && column2 == tabsize - 1))
|
||||
if (ch2 == '\t'
|
||||
|| (ch2 == ' ' && column2 == tabsize - 1))
|
||||
{
|
||||
tab2++;
|
||||
column2 = 0;
|
||||
}
|
||||
else if (c2 == ' ')
|
||||
else if (ch2 == ' ')
|
||||
column2++;
|
||||
else
|
||||
break;
|
||||
|
||||
g2 = mcel_scan (t2, lim2);
|
||||
t2 += g2.len;
|
||||
c2 = g2.c;
|
||||
ch2 = g2.ch;
|
||||
}
|
||||
|
||||
if (tab != tab2 || column != column2)
|
||||
@ -602,15 +608,15 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
|
||||
if (ignore_case)
|
||||
{
|
||||
c1 = ucore_to (c32tolower, c1);
|
||||
c2 = ucore_to (c32tolower, c2);
|
||||
ch1 = c32tolower (ch1);
|
||||
ch2 = c32tolower (ch2);
|
||||
}
|
||||
|
||||
if (ucore_cmp (c1, c2) != 0)
|
||||
if (! same_ch_err (ch1, g1.err, ch2, g2.err))
|
||||
break;
|
||||
}
|
||||
|
||||
switch (c1)
|
||||
switch (ch1)
|
||||
{
|
||||
case '\n':
|
||||
return false;
|
||||
@ -634,7 +640,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
|
||||
default:
|
||||
/* Assume that downcasing does not change print width. */
|
||||
column += ucore_iserr (c1) ? 1 : c32width (c1);
|
||||
column += g1.err ? 1 : c32width (ch1);
|
||||
if (column < tabsize)
|
||||
break;
|
||||
FALLTHROUGH;
|
||||
@ -644,7 +650,7 @@ lines_differ (char const *s1, idx_t s1len, char const *s2, idx_t s2len)
|
||||
break;
|
||||
}
|
||||
|
||||
c1prev = c1;
|
||||
ch1prev = ch1;
|
||||
}
|
||||
}
|
||||
|
||||
@ -698,8 +704,8 @@ find_and_hash_each_line (struct file_data *current)
|
||||
for (mcel_t g; *p != '\n'; p += g.len)
|
||||
{
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
if (! ucore_is (c32isspace, g.c))
|
||||
h = hash (h, (ig_case ? ucore_to (c32tolower, g.c) : g.c));
|
||||
if (! c32isspace (g.ch))
|
||||
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
|
||||
}
|
||||
break;
|
||||
|
||||
@ -727,7 +733,7 @@ find_and_hash_each_line (struct file_data *current)
|
||||
for (mcel_t g; *p != '\n'; p += g.len)
|
||||
{
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
if (ucore_is (c32isspace, g.c))
|
||||
if (c32isspace (g.ch))
|
||||
{
|
||||
do
|
||||
{
|
||||
@ -736,13 +742,13 @@ find_and_hash_each_line (struct file_data *current)
|
||||
goto hashing_done;
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
}
|
||||
while (ucore_is (c32isspace, g.c));
|
||||
while (c32isspace (g.ch));
|
||||
|
||||
h = hash (h, ' ');
|
||||
}
|
||||
|
||||
/* G is now the first non-space. */
|
||||
h = hash (h, ig_case ? ucore_to (c32tolower, g.c) : g.c);
|
||||
h = hash (h, (ig_case ? c32tolower (g.ch) : g.ch) - g.err);
|
||||
}
|
||||
break;
|
||||
|
||||
@ -818,13 +824,17 @@ find_and_hash_each_line (struct file_data *current)
|
||||
intmax_t repetitions = 1;
|
||||
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
ucore_t c = g.c;
|
||||
if (ucore_iserr (c))
|
||||
column++;
|
||||
char32_t ch;
|
||||
if (g.err)
|
||||
{
|
||||
ch = -g.err;
|
||||
column++;
|
||||
}
|
||||
else
|
||||
{
|
||||
ch = g.ch;
|
||||
if (ig_white_space & IGNORE_TRAILING_SPACE
|
||||
&& ucore_is (c32isspace, c))
|
||||
&& c32isspace (ch))
|
||||
{
|
||||
char const *p1 = p + g.len;
|
||||
for (mcel_t g1; ; p1 += g1.len)
|
||||
@ -835,13 +845,13 @@ find_and_hash_each_line (struct file_data *current)
|
||||
goto hashing_done;
|
||||
}
|
||||
g1 = mcel_scan (p1, suffix_begin);
|
||||
if (! ucore_is (c32isspace, g1.c))
|
||||
if (! c32isspace (g1.ch))
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (ig_white_space & IGNORE_TAB_EXPANSION)
|
||||
switch (c)
|
||||
switch (ch)
|
||||
{
|
||||
case '\b':
|
||||
if (0 < column)
|
||||
@ -854,7 +864,7 @@ find_and_hash_each_line (struct file_data *current)
|
||||
break;
|
||||
|
||||
case '\t':
|
||||
c = ' ';
|
||||
ch = ' ';
|
||||
repetitions = tabsize - column % tabsize;
|
||||
tab += column / tabsize + 1;
|
||||
column = 0;
|
||||
@ -868,16 +878,16 @@ find_and_hash_each_line (struct file_data *current)
|
||||
break;
|
||||
|
||||
default:
|
||||
column += c32width (c);
|
||||
column += c32width (ch);
|
||||
break;
|
||||
}
|
||||
|
||||
if (ig_case)
|
||||
c = c32tolower (c);
|
||||
ch = c32tolower (ch);
|
||||
}
|
||||
|
||||
do
|
||||
h = hash (h, c);
|
||||
h = hash (h, ch);
|
||||
while (--repetitions != 0);
|
||||
}
|
||||
}
|
||||
@ -899,13 +909,13 @@ find_and_hash_each_line (struct file_data *current)
|
||||
for (mcel_t g; *p != '\n'; p += g.len)
|
||||
{
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
h = hash (h, ucore_to (c32tolower, g.c));
|
||||
h = hash (h, c32tolower (g.ch) - g.err);
|
||||
}
|
||||
else
|
||||
for (mcel_t g; *p != '\n'; p += g.len)
|
||||
{
|
||||
g = mcel_scan (p, suffix_begin);
|
||||
h = hash (h, g.c);
|
||||
h = hash (h, g.ch - g.err);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
@ -146,7 +146,7 @@ print_half_line (char const *const *line, intmax_t indent, intmax_t out_bound)
|
||||
Increase TEXT_POINTER, counting columns.
|
||||
Assume encoding errors have print width 1. */
|
||||
mcel_t g = mcel_scan (tp0, text_limit);
|
||||
int width = ucore_iserr (g.c) ? 1 : c32width (g.c);
|
||||
int width = g.err ? 1 : c32width (g.ch);
|
||||
if (0 < width && ckd_add (&in_position, in_position, width))
|
||||
return out_position;
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user