dfa: reflect move of grep's DFA code to gnulib

Now that the core DFA code and tests reside in gnulib,
remove the copies here and use what gnulib provides.
* bootstrap.conf: Use the dfa module.
* cfg.mk: Remove settings involving files that have moved.
(_gl_TS_unmarked_extern_functions): Add dfaerror and dfawarn.
It is wrong/ugly to have to define these global symbols to use
the dfa module, but we'll adjust that separately.
* po/POTFILES.in: Apply s/src/lib/ to src/dfa.c.
* src/Makefile.am: Remove mention of dfa.[ch] and localeinfo.[ch].
* tests/Makefile.am: Remove mention of the tests that we have
moved to the gnulib module.
* src/dfa.c: Remove file.
* src/dfa.h: Likewise.
* src/localeinfo.c: Likewise.
* src/localeinfo.h: Likewise.
* tests/dfa-match: Likewise.
* tests/dfa-match-aux.c: Likewise.
* tests/invalid-char-class: Likewise.
This commit is contained in:
Jim Meyering 2016-08-17 10:49:10 -07:00
parent 57f1fef2cf
commit ca2ded9ca8
12 changed files with 8 additions and 4515 deletions

View File

@ -29,6 +29,7 @@ argmatch
binary-io
c-ctype
closeout
dfa
do-release-commit-and-tag
error
exclude

8
cfg.mk
View File

@ -30,7 +30,7 @@ bootstrap-tools = autoconf,automake,gnulib
# The tight_scope test gets confused about inline functions
# like 'to_uchar'.
_gl_TS_unmarked_extern_functions = main usage mb_clen to_uchar
_gl_TS_unmarked_extern_functions = main usage mb_clen to_uchar dfaerror dfawarn
# Now that we have better tests, make this the default.
export VERBOSE = yes
@ -138,16 +138,12 @@ update-copyright-env = \
include $(abs_top_srcdir)/dist-check.mk
exclude_file_name_regexp--sc_bindtextdomain = \
^tests/(get-mb-cur-max|dfa-match-aux)\.c$$
exclude_file_name_regexp--sc_prohibit_atoi_atof = \
^tests/dfa-match-aux\.c$$
^tests/get-mb-cur-max\.c$$
exclude_file_name_regexp--sc_prohibit_strcmp = /colorize-.*\.c$$
exclude_file_name_regexp--sc_prohibit_xalloc_without_use = ^src/kwset\.c$$
exclude_file_name_regexp--sc_prohibit_tab_based_indentation = \
(Makefile|\.(am|mk)$$)
exclude_file_name_regexp--sc_error_message_uppercase = ^src/dfa\.c$$
exclude_file_name_regexp--sc_prohibit_strncpy = ^src/dfa\.c$$
exclude_file_name_regexp--sc_prohibit_doubled_word = ^tests/count-newline$$

View File

@ -17,6 +17,7 @@
lib/argmatch.c
lib/closeout.c
lib/dfa.c
lib/error.c
lib/getopt.c
lib/obstack.c
@ -26,6 +27,5 @@ lib/regcomp.c
lib/version-etc.c
lib/xalloc-die.c
lib/xstrtol-error.c
src/dfa.c
src/grep.c
src/pcresearch.c

View File

@ -24,10 +24,10 @@ AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
bin_PROGRAMS = grep
bin_SCRIPTS = egrep fgrep
grep_SOURCES = grep.c searchutils.c \
dfa.c dfasearch.c \
kwset.c kwsearch.c localeinfo.c \
dfasearch.c \
kwset.c kwsearch.c \
pcresearch.c
noinst_HEADERS = grep.h dfa.h kwset.h localeinfo.h search.h system.h
noinst_HEADERS = grep.h kwset.h search.h system.h
# Sometimes, the expansion of $(LIBINTL) includes -lc which may
# include modules defining variables like 'optind', so libgreputils.a

4060
src/dfa.c

File diff suppressed because it is too large Load Diff

126
src/dfa.h
View File

@ -1,126 +0,0 @@
/* dfa.h - declarations for GNU deterministic regexp compiler
Copyright (C) 1988, 1998, 2007, 2009-2016 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc.,
51 Franklin Street - Fifth Floor, Boston, MA 02110-1301, USA */
/* Written June, 1988 by Mike Haertel */
#include <regex.h>
#include <stdbool.h>
#include <stddef.h>
#if 3 <= __GNUC__
# define _GL_ATTRIBUTE_MALLOC __attribute__ ((__malloc__))
#else
# define _GL_ATTRIBUTE_MALLOC
#endif
struct localeinfo; /* See localeinfo.h. */
/* Element of a list of strings, at least one of which is known to
appear in any R.E. matching the DFA. */
struct dfamust
{
bool exact;
bool begline;
bool endline;
char *must;
};
/* The dfa structure. It is completely opaque. */
struct dfa;
/* Entry points. */
/* Allocate a struct dfa. The struct dfa is completely opaque.
The returned pointer should be passed directly to free() after
calling dfafree() on it. */
extern struct dfa *dfaalloc (void) _GL_ATTRIBUTE_MALLOC;
/* DFA options that can be ORed together, for dfasyntax's 4th arg. */
enum
{
/* ^ and $ match only the start and end of data, and do not match
end-of-line within data. This is always false for grep, but
possibly true for other apps. */
DFA_ANCHOR = 1 << 0,
/* Ignore case while matching. */
DFA_CASE_FOLD = 1 << 1,
/* '\0' in data is end-of-line, instead of the traditional '\n'. */
DFA_EOL_NUL = 1 << 2
};
/* Initialize or reinitialize a DFA. This must be called before
any of the routines below. The arguments are:
1. The DFA to operate on.
2. Information about the current locale.
3. Syntax bits described in regex.h.
4. Additional DFA options described above. */
extern void dfasyntax (struct dfa *, struct localeinfo const *,
reg_syntax_t, int);
/* Build and return the struct dfamust from the given struct dfa. */
extern struct dfamust *dfamust (struct dfa const *);
/* Free the storage held by the components of a struct dfamust. */
extern void dfamustfree (struct dfamust *);
/* Compile the given string of the given length into the given struct dfa.
Final argument is a flag specifying whether to build a searching or an
exact matcher. */
extern void dfacomp (char const *, size_t, struct dfa *, bool);
/* Search through a buffer looking for a match to the given struct dfa.
Find the first occurrence of a string matching the regexp in the
buffer, and the shortest possible version thereof. Return a pointer to
the first character after the match, or NULL if none is found. BEGIN
points to the beginning of the buffer, and END points to the first byte
after its end. Note however that we store a sentinel byte (usually
newline) in *END, so the actual buffer must be one byte longer.
When ALLOW_NL is true, newlines may appear in the matching string.
If COUNT is non-NULL, increment *COUNT once for each newline processed.
Finally, if BACKREF is non-NULL set *BACKREF to indicate whether we
encountered a back-reference. The caller can use this to decide
whether to fall back on a backtracking matcher. */
extern char *dfaexec (struct dfa *d, char const *begin, char *end,
bool allow_nl, size_t *count, bool *backref);
/* Return a superset for D. The superset matches everything that D
matches, along with some other strings (though the latter should be
rare, for efficiency reasons). Return a null pointer if no useful
superset is available. */
extern struct dfa *dfasuperset (struct dfa const *d) _GL_ATTRIBUTE_PURE;
/* The DFA is likely to be fast. */
extern bool dfaisfast (struct dfa const *) _GL_ATTRIBUTE_PURE;
/* Free the storage held by the components of a struct dfa. */
extern void dfafree (struct dfa *);
/* Error handling. */
/* dfawarn() is called by the regexp routines whenever a regex is compiled
that likely doesn't do what the user wanted. It takes a single
argument, a NUL-terminated string describing the situation. The user
must supply a dfawarn. */
extern void dfawarn (const char *);
/* dfaerror() is called by the regexp routines whenever an error occurs. It
takes a single argument, a NUL-terminated string describing the error.
The user must supply a dfaerror. */
extern _Noreturn void dfaerror (const char *);

View File

@ -1,113 +0,0 @@
/* locale information
Copyright 2016 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA. */
/* Written by Paul Eggert. */
#include <config.h>
#include <localeinfo.h>
#include <verify.h>
#include <limits.h>
#include <locale.h>
#include <stdlib.h>
#include <string.h>
#include <wctype.h>
/* The sbclen implementation relies on this. */
verify (MB_LEN_MAX <= SCHAR_MAX);
/* Report whether the current locale encodes text as UTF-8.  The probe
   decodes the two bytes C4 80 and checks that they form the single
   wide character 0x100.  */
static bool
is_using_utf8 (void)
{
  mbstate_t state = {0};
  wchar_t decoded;
  size_t consumed = mbrtowc (&decoded, "\xc4\x80", 2, &state);

  /* Anything other than a complete two-byte character is not UTF-8.  */
  if (consumed != 2)
    return false;

  return decoded == 0x100;
}
/* Fill in *LOCALEINFO from the current locale: record whether the
   locale is multibyte and whether it uses UTF-8, then classify every
   possible byte value by converting it on its own with mbrtowc.  */
void
init_localeinfo (struct localeinfo *localeinfo)
{
  int byte;

  localeinfo->multibyte = MB_CUR_MAX > 1;
  localeinfo->using_utf8 = is_using_utf8 ();

  byte = CHAR_MIN;
  while (byte <= CHAR_MAX)
    {
      char as_char = byte;
      unsigned char slot = byte;
      mbstate_t state = {0};
      wchar_t wide;
      size_t len = mbrtowc (&wide, &as_char, 1, &state);

      /* LEN is 0 or 1 for a complete single-byte character.  The
         larger values are mbrtowc's (size_t) -1 and (size_t) -2
         returns, which the negate-cast-negate below turns into the
         small negative ints -1 and -2.  */
      localeinfo->sbclen[slot] = 1 < len ? - (int) - len : 1;
      localeinfo->sbctowc[slot] = 1 < len ? WEOF : wide;

      byte++;
    }
}
/* The set of wchar_t values C such that there's a useful locale
   somewhere where C != towupper (C) && C != towlower (towupper (C)).
   For example, 0x00B5 (U+00B5 MICRO SIGN) is in this table, because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), and
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).

   case_folded_counterparts scans this table linearly, looking for
   entries whose towupper value equals that of its argument.  Every
   entry is below 0x8000, so 'short' is wide enough to hold it.  */
static short const lonesome_lower[] =
{
0x00B5, 0x0131, 0x017F, 0x01C5, 0x01C8, 0x01CB, 0x01F2, 0x0345,
0x03C2, 0x03D0, 0x03D1, 0x03D5, 0x03D6, 0x03F0, 0x03F1,
/* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
counterpart in locales predating Unicode 4.0.0 (April 2003). */
0x03F2,
0x03F5, 0x1E9B, 0x1FBE,
};
/* Verify that the worst case fits. This is 1 for towupper, 1 for
towlower, and 1 for each entry in LONESOME_LOWER. */
verify (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
<= CASE_FOLDED_BUFSIZE);
/* Store into FOLDED every character, other than C itself, that
   compares equal to C once case is ignored, and return how many were
   stored.  The candidates are C's uppercase form, the lowercase form
   of that uppercase, and any LONESOME_LOWER entry sharing the same
   uppercase form.  */
int
case_folded_counterparts (wchar_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
{
  wint_t upper = towupper (c);
  wint_t lower = towlower (upper);
  int count = 0;
  int k;

  if (upper != c)
    folded[count++] = upper;

  /* Accept LOWER only if it is new and maps back to UPPER.  */
  if (lower != upper && lower != c && towupper (lower) == upper)
    folded[count++] = lower;

  for (k = 0; k < sizeof lonesome_lower / sizeof *lonesome_lower; k++)
    {
      wint_t candidate = lonesome_lower[k];

      /* Skip values already handled above, and C itself.  */
      if (candidate == lower || candidate == upper || candidate == c)
        continue;

      if (towupper (candidate) == upper)
        folded[count++] = candidate;
    }

  return count;
}

View File

@ -1,54 +0,0 @@
/* locale information
Copyright 2016 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA. */
/* Written by Paul Eggert. */
#include <limits.h>
#include <stdbool.h>
#include <wchar.h>
/* Per-locale facts gathered once by init_localeinfo: two flags
   describing the encoding, plus two tables indexed by byte value.  */
struct localeinfo
{
/* MB_CUR_MAX > 1. */
bool multibyte;
/* The locale uses UTF-8. */
bool using_utf8;
/* An array indexed by byte values B that contains 1 if B is a
single-byte character, -1 if B is an encoding error, and -2 if B
is the leading byte of a multibyte character that contains more
than one byte. */
signed char sbclen[UCHAR_MAX + 1];
/* An array indexed by byte values B that contains the corresponding
wide character (if any) for B if sbclen[B] == 1. WEOF means the
byte is not a valid single-byte character, i.e., sbclen[B] == -1
or -2. */
wint_t sbctowc[UCHAR_MAX + 1];
};
extern void init_localeinfo (struct localeinfo *);
/* Maximum number of characters that can be the case-folded
counterparts of a single character, not counting the character
itself. This is a generous upper bound. */
enum { CASE_FOLDED_BUFSIZE = 32 };
extern int case_folded_counterparts (wchar_t, wchar_t[CASE_FOLDED_BUFSIZE]);

View File

@ -34,7 +34,7 @@ TESTSUITE_PERL_OPTIONS += -M"CuTmpdir qw($$f)"
SH_LOG_COMPILER = $(SHELL)
PL_LOG_COMPILER = $(TESTSUITE_PERL) $(TESTSUITE_PERL_OPTIONS)
check_PROGRAMS = get-mb-cur-max dfa-match-aux
check_PROGRAMS = get-mb-cur-max
AM_CPPFLAGS = -I$(top_builddir)/lib -I$(top_srcdir)/lib \
-I$(top_srcdir)/src
AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
@ -42,7 +42,6 @@ AM_CFLAGS = $(WARN_CFLAGS) $(WERROR_CFLAGS)
# Tell the linker to omit references to unused shared libraries.
AM_LDFLAGS = $(IGNORE_UNUSED_LIBRARIES_CFLAGS)
LDADD = ../lib/libgreputils.a $(LIBINTL) ../lib/libgreputils.a
dfa_match_aux_LDADD = ../src/dfa.$(OBJEXT) ../src/localeinfo.$(OBJEXT) $(LDADD)
# The triple-backref test is expected to fail with both the system
# matcher (i.e., with glibc) and with the included matcher.
@ -86,7 +85,6 @@ TESTS = \
count-newline \
dfa-coverage \
dfa-heap-overrun \
dfa-match \
dfaexec-multibyte \
empty \
empty-line \
@ -110,7 +108,6 @@ TESTS = \
in-eq-out-infloop \
include-exclude \
inconsistent-range \
invalid-char-class \
invalid-multibyte-infloop \
khadafy \
kwset-abuse \

View File

@ -1,45 +0,0 @@
#!/bin/sh
# This would fail for grep-2.21.
# Copyright 2014-2016 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
. "${srcdir=.}/init.sh"; path_prepend_ ../src
# Add "." to PATH for the use of dfa-match-aux.
path_prepend_ .
# Test #2 below runs the helper under 'timeout'.
# NOTE(review): require_timeout_ presumably skips the test when no
# usable timeout program exists -- confirm against init.cfg.
require_timeout_
# Overall result; set to 1 as soon as either sub-test fails.
fail=0
# Test #1: pattern 'a', text 'ba', third argument 0.
# The helper must exit successfully and print nothing.
fail1=0
dfa-match-aux a ba 0 > out || fail1=1
compare /dev/null out || fail1=1
if test $fail1 -ne 0; then
warn_ 'dfa-match test #1 failed\n'
fail=1
fi
# Test #2: pattern 'a', a two-line text with an embedded newline, third
# argument 1.  Again the helper must succeed and print nothing; the
# 10-second timeout turns a hang into a failure.
fail2=0
in=$(printf "bb\nbb")
timeout 10 dfa-match-aux a "$in" 1 > out || fail2=1
compare /dev/null out || fail2=1
if test $fail2 -ne 0; then
warn_ 'dfa-match test #2 failed\n'
fail=1
fi
Exit $fail

View File

@ -1,73 +0,0 @@
/* Auxiliary program to test a DFA code path that cannot be triggered
by grep or gawk.
Copyright 2014-2016 Free Software Foundation, Inc.
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3, or (at your option)
any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
02110-1301, USA. */
#include <config.h>
#include <locale.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <regex.h>
#include <dfa.h>
#include <localeinfo.h>
#include "getprogname.h"
/* Error hook for this test program: write MESG, prefixed with
   "dfaerror: ", as one line on standard output and exit with a
   failure status.  */
_Noreturn void
dfaerror (char const *mesg)
{
  fprintf (stdout, "dfaerror: %s\n", mesg);
  exit (EXIT_FAILURE);
}
/* Warning hook for this test program: write MESG, prefixed with
   "dfawarn: ", as one line on standard output.  A warning is treated
   as fatal here, so the program then exits with a failure status.  */
_Noreturn void
dfawarn (char const *mesg)
{
  fprintf (stdout, "dfawarn: %s\n", mesg);
  exit (EXIT_FAILURE);
}
/* Usage: dfa-match-aux PATTERN TEXT [ALLOW_NL]

   Compile PATTERN with grep syntax (plus RE_NO_EMPTY_RANGES) and run
   dfaexec over TEXT.  If dfaexec reports a match, print the offset
   within TEXT of the position it returned, followed by a newline;
   otherwise print nothing.  ALLOW_NL is parsed with atoi; a nonzero
   value is passed to dfaexec as its allow_nl argument.

   Exit with EXIT_FAILURE when fewer than two arguments are given (and,
   through dfaerror/dfawarn, when the DFA code complains); otherwise
   exit with EXIT_SUCCESS whether or not a match was found.  */
int
main (int argc, char **argv)
{
  struct dfa *dfa;
  char *beg, *end, *p;
  int allow_nl;
  struct localeinfo localeinfo;

  /* PATTERN and TEXT are both mandatory.  */
  if (argc < 3)
    exit (EXIT_FAILURE);

  setlocale (LC_ALL, "");
  init_localeinfo (&localeinfo);

  dfa = dfaalloc ();
  dfasyntax (dfa, &localeinfo, RE_SYNTAX_GREP | RE_NO_EMPTY_RANGES, 0);
  dfacomp (argv[1], strlen (argv[1]), dfa, 0);

  /* END addresses TEXT's terminating NUL, so the byte at *END exists
     and belongs to the argument string.  */
  beg = argv[2];
  end = argv[2] + strlen (argv[2]);
  allow_nl = argc > 3 && atoi (argv[3]);

  p = dfaexec (dfa, beg, end, allow_nl, NULL, NULL);

  /* P - BEG has type ptrdiff_t, whose printf length modifier is 't'
     (the 'z' modifier is for size_t and its signed counterpart).  */
  if (p != NULL)
    printf ("%td\n", p - beg);

  exit (EXIT_SUCCESS);
}

View File

@ -1,30 +0,0 @@
#!/bin/sh
# This use of our DFA-testing helper would fail for grep-2.21.
# Copyright 2014-2016 Free Software Foundation, Inc.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
. "${srcdir=.}/init.sh"; path_prepend_ ../src
# Add "." to PATH for the use of dfa-match-aux.
path_prepend_ .
fail=0
# The helper is expected to reject the unknown class [:foo:] and print
# exactly this one line.
echo 'dfaerror: invalid character class' > exp
# Run in the C locale so the message is not translated.  Both stdout
# and stderr are captured.  The helper's exit status is deliberately
# not checked here; only its output is compared.
LC_ALL=C dfa-match-aux '[[:foo:]]' a > out 2>&1
compare exp out || fail=1
Exit $fail