grep: go back to 3.9 -P '\d' behavior

Treating \d differently from Perl was more trouble than it was worth.
* NEWS, doc/grep.texi (grep Programs): Document this.
* src/pcresearch.c (PCRE2_EXTRA_ASCII_BSD): Remove. All uses removed.
* tests/pcre-ascii-digits: Adjust to this change.
* tests/pcre-utf8-w: Revert to 3.9.
2026-01-26 15:39:06 +00:00 · 2024-12-14 10:49:00 -07:00 · 2024-12-14 10:49:00 -07:00 · eaca869822
commit eaca869822
parent 29a9b72db3
5 changed files with 28 additions and 38 deletions
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,14 @@ GNU grep NEWS                                    -*- outline -*-

 ** Bug fixes

+  With -P, \d now matches all decimal digits, not just ASCII digits.
+  That is, \d is equivalent to [[:digit:]], not to [0-9].
+  This is more compatible with plain Perl, and reverts to the
+  behavior of grep 3.9.  If you prefer \d to mean [0-9] and
+  have a PCRE2 version later than 10.42 installed, you can
+  prefix your regular expression with (?aD).
+  [bug introduced in grep 3.10]
+
  Searching a directory with at least 100,000 entries no longer fails
  with "Operation not supported" and exit status 2. Now, this prints 1
  and no diagnostic, as expected:
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1162,13 +1162,6 @@ combined with the @option{-z} (@option{--null-data}) option, and note that

 For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
 @itemize
-@item
-@samp{\d} matches only the ten ASCII digits
-(and @samp{\D} matches the complement), regardless of locale.
-Use @samp{\p@{Nd@}} to also match non-ASCII digits.
-(The behavior of @samp{\d} and @samp{\D} is unspecified after
-in-regexp directives like @samp{(?aD)}.)
-
 @item
 Although PCRE tracks the syntax and semantics of Perl's regular
 expressions, the match is not always exact.  For example, Perl
@@ -1176,6 +1169,12 @@ evolves and a Perl installation may predate or postdate the PCRE2
 installation on the same host, or their Unicode versions may differ,
 or Perl and PCRE2 may disagree about an obscure construct.

+For example, @samp{\d} ordinarily is like @samp{\p@{Nd@}} and matches
+all decimal digits, whereas @samp{[0-9]} matches only ASCII digits.
+PCRE2 versions after 10.42 let you change this behavior: in a regular
+expression that starts with @samp{(?aD)}, @samp{\d} is like @samp{[0-9]}.
+However, PCRE2 10.42 and earlier do not support @samp{(?aD)}.
+
 @item
 By default, @command{grep} applies each regexp to a line at a time,
 so the @samp{(?s)} directive (making @samp{.} match line breaks)
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,9 +33,6 @@
 # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
 # define pcre2_set_depth_limit pcre2_set_recursion_limit
 #endif
-#ifndef PCRE2_EXTRA_ASCII_BSD
-# define PCRE2_EXTRA_ASCII_BSD 0
-#endif

 /* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
   see <https://github.com/PCRE2Project/pcre2/issues/224>.
@@ -168,19 +165,11 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
      if (! localeinfo.using_utf8)
        die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));

-      flags |= PCRE2_UTF;
+      flags |= PCRE2_UTF | PCRE2_UCP;

      /* If supported, consider invalid UTF-8 as a barrier not an error.  */
      flags |= MATCH_INVALID_UTF;

-      /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
-         so that \d does not have the undesirable effect of matching
-         non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
-         escapes like \w have only their ASCII interpretations,
-         but that's better than the confusion that would ensue if \d
-         matched non-ASCII digits.  */
-      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
-
 #if 0
      /* Do not match individual code units but only UTF-8.  */
      flags |= PCRE2_NEVER_BACKSLASH_C;
@@ -191,16 +180,12 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
  if (rawmemchr (pattern, '\n') != patlim)
    die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));

-#ifdef PCRE2_EXTRA_MATCH_LINE
-  uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
-                            | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
-  pcre2_set_compile_extra_options (ccontext, extra_options);
-#endif
-
  void *re_storage = nullptr;
  if (match_lines)
    {
-#ifndef PCRE2_EXTRA_MATCH_LINE
+#ifdef PCRE2_EXTRA_MATCH_LINE
+      pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
+#else
      static char const *const xprefix = "^(?:";
      static char const *const xsuffix = ")$";
      idx_t re_size = size + strlen (xprefix) + strlen (xsuffix);
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -18,7 +18,7 @@ require_pcre_
 echo . | grep -qP '(*UTF).' 2>/dev/null \
  || skip_ 'PCRE unicode support is compiled out'
 echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
+  || skip_ 'PCRE 10.42 and older lack (?aD)'

 fail=0

@@ -29,27 +29,27 @@ printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
 printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
 printf '\n' >> in || framework_failure_

-# Ensure that \d matches no character.
-returns_ 1 grep -P '\d' in > out || fail=1
+# Ensure that (?aD)\d matches no character.
+returns_ 1 grep -P '(?aD)\d' in > out || fail=1
 compare /dev/null out || fail=1

-# Ensure that ^\D+$ matches the entire line.
-grep -P '^\D+$' in > out || fail=1
+# Ensure that (?aD)^\D+$ matches the entire line.
+grep -P '(?aD)^\D+$' in > out || fail=1
 compare in out || fail=1

 # When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD)
-# to toggle between modes.  (?aD) is the default (making \d == [0-9]).
-# (?-aD) relaxes \d, making it match "all" digits.
+# to toggle between modes.  (?aD) makes \d == [0-9].
+# (?-aD), the default, makes \d match all digits.
 # Use mixed digits as input: Arabic 0 and ASCII 4: ٠4
 printf '\331\2404\n' > in2 || framework_failure_

-returns_ 1 grep -P '\d\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d\d' in2 > out || fail=1
 compare /dev/null out || fail=1

-grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+grep -P '\d(?aD)\d' in2 > out || fail=1
 compare in2 out || fail=1

-returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d(?-aD)\d' in2 > out || fail=1
 compare /dev/null out || fail=1

 Exit $fail
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,8 +16,6 @@ require_pcre_

 echo . | grep -qP '(*UTF).' 2>/dev/null \
  || skip_ 'PCRE unicode support is compiled out'
-echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'

 fail=0