mirror of
https://https.git.savannah.gnu.org/git/grep.git
synced 2026-01-26 15:39:06 +00:00
grep: -P (--perl-regexp) \D once again works like [^0-9]
* NEWS: Mention \D, too. * doc/grep.texi: Likewise. * src/pcresearch.c (pcre_pattern_expand_backslash_d): Handle \D. Also, ifdef-out this new function and its call site when not needed. * tests/pcre-ascii-digits: Test \D, too. Tighten one test by using returns_ 1. Add comments and tests that work only with 10.43 and newer. Paul Eggert raised the issue of \D in https://bugs.gnu.org/62267#8
This commit is contained in:
parent
99330c2b1d
commit
98ee05b4dd
2
NEWS
2
NEWS
@ -9,7 +9,7 @@ GNU grep NEWS -*- outline -*-
|
||||
properly had the undesirable side effect of making \d also match
|
||||
e.g., the Arabic digits: ٠١٢٣٤٥٦٧٨٩. With grep-3.9, -P '\d+'
|
||||
would match that ten-digit (20-byte) string. Now, to match such
|
||||
a digit, you would use \p{Nd}.
|
||||
a digit, you would use \p{Nd}. Similarly, \D is now mapped to [^0-9].
|
||||
[bug introduced in grep 3.9]
|
||||
|
||||
|
||||
|
||||
@ -1144,21 +1144,15 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
|
||||
For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
|
||||
@itemize
|
||||
@item
|
||||
@samp{\d} matches only the ten ASCII digits, regardless of locale.
|
||||
@samp{\d} matches only the ten ASCII digits
|
||||
(and @samp{\D} matches the complement), regardless of locale.
|
||||
Use @samp{\p@{Nd@}} to also match non-ASCII digits.
|
||||
|
||||
When @command{grep} is built with PCRE2 10.42 and earlier, @samp{\d}
|
||||
ignores in-regexp directives like @samp{(?aD)} and matches only ASCII
|
||||
digits regardless of these directives. However, later versions of
|
||||
PCRE2 likely will fix this, and the plan is for @command{grep} to
|
||||
respect those directives if possible.
|
||||
@c Using PCRE2 git commit pcre2-10.40-112-g6277357, this demonstrates
|
||||
@c the equivalent of how grep could use PCRE2_EXTRA_ASCII_BSD to make \d's
|
||||
@c ASCII-only behavior the default:
|
||||
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
|
||||
@c [Exit 1]
|
||||
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
|
||||
@c ٠١٢٣٤٥٦٧٨٩
|
||||
When @command{grep} is built with PCRE2 10.42 and earlier,
|
||||
@samp{\d} and @samp{\D} ignore in-regexp directives like @samp{(?aD)}
|
||||
and work like @samp{[0-9]} and @samp{[^0-9]} respectively.
|
||||
However, later versions of PCRE2 likely will fix this,
|
||||
and the plan is for @command{grep} to respect those directives if possible.
|
||||
|
||||
@item
|
||||
Although PCRE tracks the syntax and semantics of Perl's regular
|
||||
|
||||
@ -133,10 +133,13 @@ bad_utf8_from_pcre2 (int e)
|
||||
#endif
|
||||
}
|
||||
|
||||
#if ! PCRE2_EXTRA_ASCII_BSD
|
||||
/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
|
||||
digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
|
||||
match non-ASCII digits in some locales. Use \p{Nd} if you need to match
|
||||
those. */
|
||||
those. Similarly, replace each \D with [^0-9].
|
||||
FIXME: remove in 2025, or whenever we no longer accommodate pcre2-10.42
|
||||
and prior. */
|
||||
static void
|
||||
pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
|
||||
{
|
||||
@ -182,6 +185,9 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
|
||||
case 'd':
|
||||
p = mempcpy (p, "[0-9]", 5);
|
||||
break;
|
||||
case 'D':
|
||||
p = mempcpy (p, "[^0-9]", 6);
|
||||
break;
|
||||
default:
|
||||
*p++ = '\\';
|
||||
*p++ = *keys;
|
||||
@ -206,6 +212,7 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
|
||||
*keys_p = new_keys;
|
||||
*len_p = p - new_keys;
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Compile the -P style PATTERN, containing SIZE bytes that are
|
||||
followed by '\n'. Return a description of the compiled pattern. */
|
||||
@ -213,8 +220,9 @@ pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
|
||||
void *
|
||||
Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
|
||||
{
|
||||
if (! PCRE2_EXTRA_ASCII_BSD)
|
||||
pcre_pattern_expand_backslash_d (&pattern, &size);
|
||||
#if ! PCRE2_EXTRA_ASCII_BSD
|
||||
pcre_pattern_expand_backslash_d (&pattern, &size);
|
||||
#endif
|
||||
|
||||
PCRE2_SIZE e;
|
||||
int ec;
|
||||
|
||||
@ -1,6 +1,7 @@
|
||||
#!/bin/sh
|
||||
# Ensure that grep -P's \d matches only the 10 ASCII digits.
|
||||
# With grep-3.9, \d would match, e.g., the multibyte Arabic digits.
|
||||
# The same applied to \D.
|
||||
#
|
||||
# Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
#
|
||||
@ -24,8 +25,38 @@ fail=0
|
||||
# \331\245\331\246\331\247\331\250\331\251
|
||||
printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
|
||||
printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
|
||||
printf '\n' >> in || framework_failure_
|
||||
|
||||
grep -P '\d+' in > out && fail=1
|
||||
# Ensure that \d matches no character.
|
||||
returns_ 1 grep -P '\d' in > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
# Ensure that ^\D+$ matches the entire line.
|
||||
grep -P '^\D+$' in > out || fail=1
|
||||
compare in out || fail=1
|
||||
|
||||
# When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD)
|
||||
# to toggle between modes. (?aD) is the default (making \d == [0-9]).
|
||||
# (?-aD) relaxes \d, making it match "all" digits.
|
||||
# Use mixed digits as input: Arabic 0 and ASCII 4: ٠4
|
||||
printf '\331\2404\n' > in2 || framework_failure_
|
||||
|
||||
returns_ 1 grep -P '\d\d' in2 > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
# The following tests work only when built with 10.43 or newer,
|
||||
# with which grep accepts the mode-setting '(?aD)':
|
||||
if echo 0 | grep -qP '(?aD)\d'; then
|
||||
|
||||
grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
|
||||
compare in2 out || fail=1
|
||||
|
||||
returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
else
|
||||
warn_ 'skipped some tests: use PCRE2 10.43 or newer to enable' \
|
||||
'support for e.g., (?aD) and (?-aD)'
|
||||
fi
|
||||
|
||||
Exit $fail
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user