From eaca8698225c65b56e83b9390e1c3c9fbdc9acd0 Mon Sep 17 00:00:00 2001
From: Paul Eggert <eggert@cs.ucla.edu>
Date: Sat, 14 Dec 2024 10:49:00 -0700
Subject: [PATCH] grep: go back to 3.9 -P '\d' behavior

Treating \d differently from Perl was more trouble than it was worth.
* NEWS, doc/grep.texi (grep Programs): Document this.
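* src/pcresearch.c (PCRE2_EXTRA_ASCII_BSD):
Remove.  All uses removed.
(Pcompile): Set PCRE2_UCP whenever PCRE2_UTF is set.
Set extra compile options only when matching whole lines.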
* tests/pcre-ascii-digits: Adjust to this change.
* tests/pcre-utf8-w: Revert to 3.9.
---
 NEWS                    |  8 ++++++++
 doc/grep.texi           | 13 ++++++-------
 src/pcresearch.c        | 23 ++++-------------------
 tests/pcre-ascii-digits | 20 ++++++++++----------
 tests/pcre-utf8-w       |  2 --
 5 files changed, 28 insertions(+), 38 deletions(-)

diff --git a/NEWS b/NEWS
index 4294fc6..7e482b5 100644
--- a/NEWS
+++ b/NEWS
@@ -4,6 +4,14 @@ GNU grep NEWS                                    -*- outline -*-
 
 ** Bug fixes
 
+  With -P, \d now matches all decimal digits, not just ASCII digits.
+  That is, \d is equivalent to [[:digit:]], not to [0-9].
+  This is more compatible with plain Perl, and reverts to the
+  behavior of grep 3.9.  If you prefer \d to mean [0-9] and
+  have a PCRE2 version later than 10.42 installed, you can
+  prefix your regular expression with (?aD).
+  [bug introduced in grep 3.10]
+
   Searching a directory with at least 100,000 entries no longer fails
   with "Operation not supported" and exit status 2. Now, this prints 1
   and no diagnostic, as expected:
diff --git a/doc/grep.texi b/doc/grep.texi
index 8495919..bfcba7e 100644
--- a/doc/grep.texi
+++ b/doc/grep.texi
@@ -1162,13 +1162,6 @@ combined with the @option{-z} (@option{--null-data}) option, and note that
 
 For documentation, refer to @url{https://www.pcre.org/}, with these caveats:
 @itemize
-@item
-@samp{\d} matches only the ten ASCII digits
-(and @samp{\D} matches the complement), regardless of locale.
-Use @samp{\p@{Nd@}} to also match non-ASCII digits.
-(The behavior of @samp{\d} and @samp{\D} is unspecified after
-in-regexp directives like @samp{(?aD)}.)
-
 @item
 Although PCRE tracks the syntax and semantics of Perl's regular
 expressions, the match is not always exact.  For example, Perl
@@ -1176,6 +1169,12 @@ evolves and a Perl installation may predate or postdate the PCRE2
 installation on the same host, or their Unicode versions may differ,
 or Perl and PCRE2 may disagree about an obscure construct.
 
+For example, @samp{\d} ordinarily is like @samp{\p@{Nd@}} and matches
+all decimal digits, whereas @samp{[0-9]} matches only ASCII digits.
+PCRE2 versions after 10.42 let you change this behavior: in a regular
+expression that starts with @samp{(?aD)}, @samp{\d} is like @samp{[0-9]}.
+However, PCRE2 10.42 and earlier do not support @samp{(?aD)}.
+
 @item
 By default, @command{grep} applies each regexp to a line at a time,
 so the @samp{(?s)} directive (making @samp{.} match line breaks)
diff --git a/src/pcresearch.c b/src/pcresearch.c
index 4d79425..4a08531 100644
--- a/src/pcresearch.c
+++ b/src/pcresearch.c
@@ -33,9 +33,6 @@
 # define PCRE2_ERROR_DEPTHLIMIT PCRE2_ERROR_RECURSIONLIMIT
 # define pcre2_set_depth_limit pcre2_set_recursion_limit
 #endif
-#ifndef PCRE2_EXTRA_ASCII_BSD
-# define PCRE2_EXTRA_ASCII_BSD 0
-#endif
 
 /* Use PCRE2_MATCH_INVALID_UTF if supported and not buggy;
    see <https://github.com/PCRE2Project/pcre2/issues/224>.
@@ -168,19 +165,11 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
       if (! localeinfo.using_utf8)
         die (EXIT_TROUBLE, 0, _("-P supports only unibyte and UTF-8 locales"));
 
-      flags |= PCRE2_UTF;
+      flags |= PCRE2_UTF | PCRE2_UCP;
 
       /* If supported, consider invalid UTF-8 as a barrier not an error.  */
       flags |= MATCH_INVALID_UTF;
 
-      /* If PCRE2_EXTRA_ASCII_BSD is available, use PCRE2_UCP
-         so that \d does not have the undesirable effect of matching
-         non-ASCII digits.  Otherwise (i.e., with PCRE2 10.42 and earlier),
-         escapes like \w have only their ASCII interpretations,
-         but that's better than the confusion that would ensue if \d
-         matched non-ASCII digits.  */
-      flags |= PCRE2_EXTRA_ASCII_BSD ? PCRE2_UCP : 0;
-
 #if 0
       /* Do not match individual code units but only UTF-8.  */
       flags |= PCRE2_NEVER_BACKSLASH_C;
@@ -191,16 +180,12 @@ Pcompile (char *pattern, idx_t size, reg_syntax_t ignored, bool exact)
   if (rawmemchr (pattern, '\n') != patlim)
     die (EXIT_TROUBLE, 0, _("the -P option only supports a single pattern"));
 
-#ifdef PCRE2_EXTRA_MATCH_LINE
-  uint32_t extra_options = (PCRE2_EXTRA_ASCII_BSD
-                            | (match_lines ? PCRE2_EXTRA_MATCH_LINE : 0));
-  pcre2_set_compile_extra_options (ccontext, extra_options);
-#endif
-
   void *re_storage = nullptr;
   if (match_lines)
     {
-#ifndef PCRE2_EXTRA_MATCH_LINE
+#ifdef PCRE2_EXTRA_MATCH_LINE
+      pcre2_set_compile_extra_options (ccontext, PCRE2_EXTRA_MATCH_LINE);
+#else
       static char const *const xprefix = "^(?:";
       static char const *const xsuffix = ")$";
       idx_t re_size = size + strlen (xprefix) + strlen (xsuffix);
diff --git a/tests/pcre-ascii-digits b/tests/pcre-ascii-digits
index c738aa1..50fe251 100755
--- a/tests/pcre-ascii-digits
+++ b/tests/pcre-ascii-digits
@@ -18,7 +18,7 @@ require_pcre_
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
 echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
+  || skip_ 'PCRE 10.42 and older lack (?aD)'
 
 fail=0
 
@@ -29,27 +29,27 @@ printf '\331\240\331\241\331\242\331\243\331\244' > in || framework_failure_
 printf '\331\245\331\246\331\247\331\250\331\251' >> in || framework_failure_
 printf '\n' >> in || framework_failure_
 
-# Ensure that \d matches no character.
-returns_ 1 grep -P '\d' in > out || fail=1
+# Ensure that (?aD)\d matches no character.
+returns_ 1 grep -P '(?aD)\d' in > out || fail=1
 compare /dev/null out || fail=1
 
-# Ensure that ^\D+$ matches the entire line.
-grep -P '^\D+$' in > out || fail=1
+# Ensure that (?aD)^\D+$ matches the entire line.
+grep -P '(?aD)^\D+$' in > out || fail=1
 compare in out || fail=1
 
 # When built with PCRE2 10.43 and newer, one may use (?aD) and (?-aD)
-# to toggle between modes.  (?aD) is the default (making \d == [0-9]).
-# (?-aD) relaxes \d, making it match "all" digits.
+# to toggle between modes.  (?aD) makes \d == [0-9].
+# (?-aD), the default, makes \d match all digits.
 # Use mixed digits as input: Arabic 0 and ASCII 4: ٠4
 printf '\331\2404\n' > in2 || framework_failure_
 
-returns_ 1 grep -P '\d\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
-grep -P '(?-aD)\d(?aD)\d' in2 > out || fail=1
+grep -P '\d(?aD)\d' in2 > out || fail=1
 compare in2 out || fail=1
 
-returns_ 1 grep -P '\d(?-aD)\d' in2 > out || fail=1
+returns_ 1 grep -P '(?aD)\d(?-aD)\d' in2 > out || fail=1
 compare /dev/null out || fail=1
 
 Exit $fail
diff --git a/tests/pcre-utf8-w b/tests/pcre-utf8-w
index 1229da4..86ff8eb 100755
--- a/tests/pcre-utf8-w
+++ b/tests/pcre-utf8-w
@@ -16,8 +16,6 @@ require_pcre_
 
 echo . | grep -qP '(*UTF).' 2>/dev/null \
   || skip_ 'PCRE unicode support is compiled out'
-echo 0 | grep -qP '(?aD)\d' \
-  || skip_ 'PCRE 10.42 and older lack PCRE2_EXTRA_ASCII_BSD'
 
 fail=0