mirror of
https://https.git.savannah.gnu.org/git/grep.git
synced 2026-01-26 15:39:06 +00:00
grep: -P (--perl-regexp) \d: match only ASCII digits
Prior to grep-3.9, the PCRE matcher had always treated \d just like [0-9]. grep-3.9's fix for \w and \b mistakenly relaxed \d to also match multibyte digits. * src/grep.c (P_MATCHER_INDEX): Define enum. (pcre_pattern_expand_backslash_d): New function. (main): Call it for -P. * NEWS (Bug fixes): Mention it. * doc/grep.texi: Document it: with -P, \d matches only ASCII digits. Provide a PCRE documentation URL and an example of how to use (?s) with -z. * tests/pcre-ascii-digits: New test. * tests/Makefile.am (TESTS): Add that file name. Reported as https://bugs.gnu.org/62267
This commit is contained in:
parent
7979ea7ddb
commit
c83ffc197e
10
NEWS
10
NEWS
@ -2,6 +2,16 @@ GNU grep NEWS -*- outline -*-
|
||||
|
||||
* Noteworthy changes in release ?.? (????-??-??) [?]
|
||||
|
||||
** Bug fixes
|
||||
|
||||
With -P, \d now matches only ASCII digits, regardless of PCRE
|
||||
options/modes. The changes in grep-3.9 to make \b and \w work
|
||||
properly had the undesirable side effect of making \d also match
|
||||
e.g., the Arabic digits: ٠١٢٣٤٥٦٧٨٩. With grep-3.9, -P '\d+'
|
||||
would match that ten-digit (20-byte) string. Now, to match such
|
||||
a digit, you would use \p{Nd}.
|
||||
[bug introduced in grep 3.9]
|
||||
|
||||
|
||||
* Noteworthy changes in release 3.9 (2023-03-05) [stable]
|
||||
|
||||
|
||||
@ -1141,6 +1141,37 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
|
||||
@samp{grep@ -P} may warn of unimplemented features.
|
||||
@xref{Other Options}.
|
||||
|
||||
For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
|
||||
@itemize
|
||||
@item
|
||||
@samp{\d} always matches only the ten ASCII digits, regardless of locale or
|
||||
in-regexp directives like @samp{(?aD)}.
|
||||
Use @samp{\p@{Nd@}} if you need to match non-ASCII digits.
|
||||
Once pcre2 support for @samp{(?aD)} is widespread enough,
|
||||
we expect to make that the default, so it will be overridable.
|
||||
@c Using pcre2 git commit pcre2-10.40-112-g6277357, this demonstrates how
|
||||
@c we'll prefix with (?aD) to make \d's ASCII-only behavior the default:
|
||||
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '(?aD)^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
|
||||
@c [Exit 1]
|
||||
@c $ LC_ALL=en_US.UTF-8 ./pcre2grep -u '^\d+' <<< '٠١٢٣٤٥٦٧٨٩'
|
||||
@c ٠١٢٣٤٥٦٧٨٩
|
||||
|
||||
@item
|
||||
By default, @command{grep} applies each regexp to a line at a time,
|
||||
so the @samp{(?s)} directive (making @samp{.} match line breaks)
|
||||
is generally ineffective.
|
||||
However, with @option{-z} (@option{--null-data}) it can work:
|
||||
@example
|
||||
$ printf 'a\nb\n' |grep -zP '(?s)a.b'
|
||||
a
|
||||
b
|
||||
@end example
|
||||
But beware: with the @option{-z} (@option{--null-data}) option and a file
|
||||
containing no NUL byte, grep must read the entire file into memory
|
||||
before processing any of it.
|
||||
Thus, it will exhaust memory and fail for some large files.
|
||||
@end itemize
|
||||
|
||||
@end table
|
||||
|
||||
|
||||
|
||||
82
src/grep.c
82
src/grep.c
@ -2089,7 +2089,8 @@ static struct
|
||||
#endif
|
||||
};
|
||||
/* Keep these in sync with the 'matchers' table. */
|
||||
enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0 };
|
||||
enum { E_MATCHER_INDEX = 1, F_MATCHER_INDEX = 2, G_MATCHER_INDEX = 0,
|
||||
P_MATCHER_INDEX = 6 };
|
||||
|
||||
/* Return the index of the matcher corresponding to M if available.
|
||||
MATCHER is the index of the previous matcher, or -1 if none.
|
||||
@ -2378,6 +2379,80 @@ fgrep_to_grep_pattern (char **keys_p, idx_t *len_p)
|
||||
*len_p = p - new_keys;
|
||||
}
|
||||
|
||||
/* Replace each \d in *KEYS_P with [0-9], to ensure that \d matches only ASCII
|
||||
digits. Now that we enable PCRE2_UCP for pcre regexps, \d would otherwise
|
||||
match non-ASCII digits in some locales. Use \p{Nd} if you require to match
|
||||
those. */
|
||||
static void
|
||||
pcre_pattern_expand_backslash_d (char **keys_p, idx_t *len_p)
|
||||
{
|
||||
idx_t len = *len_p;
|
||||
char *keys = *keys_p;
|
||||
mbstate_t mb_state = { 0 };
|
||||
char *new_keys = xnmalloc (len / 2 + 1, 5);
|
||||
char *p = new_keys;
|
||||
bool prev_backslash = false;
|
||||
|
||||
for (ptrdiff_t n; len; keys += n, len -= n)
|
||||
{
|
||||
n = mb_clen (keys, len, &mb_state);
|
||||
switch (n)
|
||||
{
|
||||
case -2:
|
||||
n = len;
|
||||
FALLTHROUGH;
|
||||
default:
|
||||
if (prev_backslash)
|
||||
{
|
||||
prev_backslash = false;
|
||||
*p++ = '\\';
|
||||
}
|
||||
p = mempcpy (p, keys, n);
|
||||
break;
|
||||
|
||||
case -1:
|
||||
if (prev_backslash)
|
||||
{
|
||||
prev_backslash = false;
|
||||
*p++ = '\\';
|
||||
}
|
||||
memset (&mb_state, 0, sizeof mb_state);
|
||||
n = 1;
|
||||
FALLTHROUGH;
|
||||
case 1:
|
||||
if (prev_backslash)
|
||||
{
|
||||
prev_backslash = false;
|
||||
switch (*keys)
|
||||
{
|
||||
case 'd':
|
||||
p = mempcpy (p, "[0-9]", 5);
|
||||
break;
|
||||
default:
|
||||
*p++ = '\\';
|
||||
*p++ = *keys;
|
||||
break;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (*keys == '\\')
|
||||
prev_backslash = true;
|
||||
else
|
||||
*p++ = *keys;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (prev_backslash)
|
||||
*p++ = '\\';
|
||||
*p = '\n';
|
||||
free (*keys_p);
|
||||
*keys_p = new_keys;
|
||||
*len_p = p - new_keys;
|
||||
}
|
||||
|
||||
/* If it is easy, convert the MATCHER-style patterns KEYS (of size
|
||||
*LEN_P) to -F style, update *LEN_P to a possibly-smaller value, and
|
||||
return F_MATCHER_INDEX. If not, leave KEYS and *LEN_P alone and
|
||||
@ -2970,6 +3045,11 @@ main (int argc, char **argv)
|
||||
matcher = try_fgrep_pattern (matcher, keys, &keycc);
|
||||
}
|
||||
|
||||
/* If -P, replace each \d with [0-9].
|
||||
Those who want to match non-ASCII digits must use \p{Nd}. */
|
||||
if (matcher == P_MATCHER_INDEX)
|
||||
pcre_pattern_expand_backslash_d (&keys, &keycc);
|
||||
|
||||
execute = matchers[matcher].execute;
|
||||
compiled_pattern =
|
||||
matchers[matcher].compile (keys, keycc, matchers[matcher].syntax,
|
||||
|
||||
@ -139,6 +139,7 @@ TESTS = \
|
||||
options \
|
||||
pcre \
|
||||
pcre-abort \
|
||||
pcre-ascii-digits \
|
||||
pcre-context \
|
||||
pcre-count \
|
||||
pcre-infloop \
|
||||
|
||||
31
tests/pcre-ascii-digits
Executable file
31
tests/pcre-ascii-digits
Executable file
@ -0,0 +1,31 @@
|
||||
#!/bin/sh
|
||||
# Ensure that grep -P's \d matches only the 10 ASCII digits.
|
||||
# With grep-3.9, \d would match, e.g., the multibyte Arabic digits.
|
||||
#
|
||||
# Copyright (C) 2023 Free Software Foundation, Inc.
|
||||
#
|
||||
# Copying and distribution of this file, with or without modification,
|
||||
# are permitted in any medium without royalty provided the copyright
|
||||
# notice and this notice are preserved.
|
||||
|
||||
. "${srcdir=.}/init.sh"; path_prepend_ ../src
|
||||
require_en_utf8_locale_
|
||||
LC_ALL=en_US.UTF-8
|
||||
export LC_ALL
|
||||
require_pcre_
|
||||
|
||||
echo . | grep -qP '(*UTF).' 2>/dev/null \
|
||||
|| skip_ 'PCRE unicode support is compiled out'
|
||||
|
||||
fail=0
|
||||
|
||||
# $ printf %s ٠١٢٣٤٥٦٧٨٩|od -An -to1 -w10 |sed 's/ /\\/g'; : arabic digits
|
||||
# \331\240\331\241\331\242\331\243\331\244
|
||||
# \331\245\331\246\331\247\331\250\331\251
|
||||
printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
|
||||
printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
|
||||
|
||||
grep -P '\d+' in > out && fail=1
|
||||
compare /dev/null out || fail=1
|
||||
|
||||
Exit $fail
|
||||
Loading…
x
Reference in New Issue
Block a user