mirror of
https://https.git.savannah.gnu.org/git/grep.git
synced 2026-01-26 15:39:06 +00:00
grep: go back to 3.9 -P '\d' behavior
Treating \d differently from Perl was more trouble than it was worth.
* NEWS, doc/grep.texi (grep Programs): Document this.
* src/pcresearch.c (PCRE2_EXTRA_ASCII_BSD): Remove. All uses removed.
* tests/pcre-ascii-digits: Adjust to this change.
* tests/pcre-utf8-w: Revert to 3.9.
This commit is contained in:
parent
29a9b72db3
commit
eaca869822
8
NEWS
8
NEWS
@@ -4,6 +4,14 @@ GNU grep NEWS -*- outline -*-
|
||||
|
||||
** Bug fixes
|
||||
|
||||
With -P, \d now matches all decimal digits, not just ASCII digits.
|
||||
That is, \d is equivalent to [[:digit:]], not to [0-9].
|
||||
This is more compatible with plain Perl, and reverts to the
|
||||
behavior of grep 3.9. If you prefer \d to mean [0-9] and
|
||||
have a PCRE2 version later than 10.42 installed, you can
|
||||
prefix your regular expression with (?aD).
|
||||
[bug introduced in grep 3.10]
|
||||
|
||||
Searching a directory with at least 100,000 entries no longer fails
|
||||
with "Operation not supported" and exit status 2. Now, this prints 1
|
||||
and no diagnostic, as expected:
|
||||
|
||||
@@ -1162,13 +1162,6 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
|
||||
|
||||
For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
|
||||
@itemize
|
||||
@item
|
||||
@samp{\d} matches only the ten ASCII digits
|
||||
(and @samp{\D} matches the complement), regardless of locale.
|
||||
Use @samp{\p@{Nd@}} to also match non-ASCII digits.
|
||||
(The behavior of @samp{\d} and @samp{\D} is unspecified after
|
||||
in-regexp directives like @samp{(?aD)}.)
|
||||
|
||||
@item
|
||||
Although PCRE tracks the syntax and semantics of Perl's regular
|
||||
expressions, the match is not always exact. For example, Perl
|
||||
@@ -1176,6 +1169,12 @@ evolves and a Perl installation may predate or postdate the PCRE2
|
||||
installation on the same host, or their Unicode versions may differ,
|
||||
or Perl and PCRE2 may disagree about an obscure construct.
|
||||
|
||||
For example, @samp{\d} ordinarily is like @samp{\p@{Nd@}} and matches
|
||||
all decimal digits, whereas @samp{[0-9]} matches only ASCII digits.
|
||||
PCRE2 versions after 10.42 let you change this behavior: in a regular
|
||||
expression that starts with @samp{(?aD)}, @samp{\d} is like @samp{[0-9]}.
|
||||
However, PCRE2 10.42 and earlier do not support @samp{(?aD)}.
|
||||
|
||||
@item
|
||||
By default, @command{grep} applies each regexp to a line at a time,
|
||||
so the @samp{(?s)} directive (making @samp{.} match line breaks)
|
||||
|
||||
@@ -33,9 +33,6 @@
|
||||
# define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
|
||||
# define pcre2_set_depth_limit pcre2_set_recursion_limit
|
||||
#endif
|
||||
#ifndef PCRE2_EXTRA_ASCII_BSD
|
||||
# define PCRE2_EXTRA_ASCII_BSD 0
|
||||
#endif
|
||||
|
||||
/* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
|
||||
see <https://github.com/PCRE2Project/pcre2/issues/224>.
|
||||
@@ -168,19 +165,11 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
|
||||
if (! localeinfo.using_utf8)
|
||||
die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
|
||||
|
||||
flags |= PCRE2_UTF;
|
||||
flags |= PCRE2_UTF | PCRE2_UCP;
|
||||
|
||||
/* If supported, consider invalid UTF-8 as a barrier not an error. */
|
||||
flags |= MATCH_INVALID_UTF;
|
||||
|
||||
/* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
|
||||
so that \d does not have the undesirable effect of matching
|
||||
non-ASCII digits. Otherwise (i.e., with PCRE2 10.42 and earlier),
|
||||
escapes like \w have only their ASCII interpretations,
|
||||
but that's better than the confusion that would ensue if \d
|
||||
matched non-ASCII digits. */
|
||||
flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
|
||||
|
||||
#if 0
|
||||
/* Do not match individual code units but only UTF-8. */
|
||||
flags |= PCRE2_NEVER_BACKSLASH_C;
|
||||
@@ -191,16 +180,12 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
|
||||
if (rawmemchr (pattern, '\n') != patlim)
|
||||
die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
|
||||
|
||||
#ifdef PCRE2_EXTRA_MATCH_LINE
|
||||
uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
|
||||
| (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
|
||||
pcre2_set_compile_extra_options (ccontext, extra_options);
|
||||
#endif
|
||||
|
||||
void *re_storage = nullptr;
|
||||
if (match_lines)
|
||||
{
|
||||
#ifndef PCRE2_EXTRA_MATCH_LINE
|
||||
#ifdef PCRE2_EXTRA_MATCH_LINE
|
||||
pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
|
||||
#else
|
||||
static char const *const xprefix = "^(?:";
|
||||
static char const *const xsuffix = ")$";
|
||||
idx_t re_size = size + strlen (xprefix) + strlen (xsuffix);
|
||||
|
||||
@@ -18,7 +18,7 @@ require_pcre_
|
||||
echo . | grep -qP '(*UTF).' 2>/dev/null \
|
||||
|| skip_ 'PCRE unicode support is compiled out'
|
||||
echo 0 | grep -qP '(?aD)\d' \
|
||||
|| skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
|
||||
|| skip_ 'PCRE 10.42 and older lack (?aD)'
|
||||
|
||||
fail=0
|
||||
|
||||
@@ -29,27 +29,27 @@ printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
|
||||
printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
|
||||
printf '\n' >> in || framework_failure_
|
||||
|
||||
# Ensure that \d matches no character.
|
||||
returns_ 1 grep -P '\d' in > out || fail=1
|
||||
# Ensure that (?aD)\d matches no character.
|
||||
returns_ 1 grep -P '(?aD)\d' in > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
# Ensure that ^\D+$ matches the entire line.
|
||||
grep -P '^\D+$' in > out || fail=1
|
||||
# Ensure that (?aD)^\D+$ matches the entire line.
|
||||
grep -P '(?aD)^\D+$' in > out || fail=1
|
||||
compare in out || fail=1
|
||||
|
||||
# When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD)
|
||||
# to toggle between modes. (?aD) is the default (making \d == [0-9]).
|
||||
# (?-aD) relaxes \d, making it match "all" digits.
|
||||
# to toggle between modes. (?aD) makes \d == [0-9].
|
||||
# (?-aD), the default, makes \d match all digits.
|
||||
# Use mixed digits as input: Arabic 0 and ASCII 4: ٠4
|
||||
printf '\331\2404\n' > in2 || framework_failure_
|
||||
|
||||
returns_ 1 grep -P '\d\d' in2 > out || fail=1
|
||||
returns_ 1 grep -P '(?aD)\d\d' in2 > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
|
||||
grep -P '\d(?aD)\d' in2 > out || fail=1
|
||||
compare in2 out || fail=1
|
||||
|
||||
returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
|
||||
returns_ 1 grep -P '(?aD)\d(?-aD)\d' in2 > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
Exit $fail
|
||||
|
||||
@@ -16,8 +16,6 @@ require_pcre_
|
||||
|
||||
echo . | grep -qP '(*UTF).' 2>/dev/null \
|
||||
|| skip_ 'PCRE unicode support is compiled out'
|
||||
echo 0 | grep -qP '(?aD)\d' \
|
||||
|| skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
|
||||
|
||||
fail=0
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user