mirror of
https://https.git.savannah.gnu.org/git/gettext.git
synced 2026-01-26 15:39:11 +00:00
xgettext: PHP: Support heredoc and nowdoc strings.
Reported: by Bodo Graumann <mail@bodograumann.de> at <https://savannah.gnu.org/bugs/?27740>, at <https://savannah.gnu.org/bugs/?35944>, by Cédric Anne <canne@teclib.com> at <https://savannah.gnu.org/bugs/?62158>. * gettext-tools/src/x-php.c (TAB_WIDTH): New macro. (process_heredoc): New function. (phase4_get): Process the heredoc or nowdoc string, instead of blindly skipping it. * gettext-tools/tests/xgettext-php-1: Add tests of heredoc strings with embedded expressions. * gettext-tools/tests/xgettext-php-2: Expect spaces before a heredoc end label to be eaten. Add tests of heredoc strings and nowdoc strings. * NEWS: Mention the improvement.
This commit is contained in:
parent
4df2df7213
commit
9d4340a788
4
NEWS
4
NEWS
@ -30,7 +30,9 @@ Version 0.23 - September 2024
|
||||
o xgettext now reports warnings instead of fatal errors.
|
||||
o Strings with embedded expressions (a.k.a. interpolated strings) are now
|
||||
recognized.
|
||||
- PHP: Strings with embedded expressions are now recognized.
|
||||
- PHP:
|
||||
o Strings with embedded expressions are now recognized.
|
||||
o Heredoc and Nowdoc strings are now scanned correctly.
|
||||
|
||||
* Runtime behaviour:
|
||||
- In the C.UTF-8 locale, like in the C locale, the *gettext() functions
|
||||
|
||||
@ -840,8 +840,246 @@ free_token (token_ty *tp)
|
||||
}
|
||||
|
||||
|
||||
/* In heredoc and nowdoc, assume a tab width of 8. */
|
||||
#define TAB_WIDTH 8
|
||||
|
||||
|
||||
/* 4. Combine characters into tokens. Discard whitespace. */
|
||||
|
||||
/* On a heredoc string, do the same processing as phase4_getc (below) does
|
||||
on a double-quoted string (except for recognizing a double-quote as
|
||||
end-of-string, of course).
|
||||
Return the processed string, or NULL if it contains variables or embedded
|
||||
expressions. */
|
||||
static char *
|
||||
process_heredoc (struct php_extractor *xp, const char *doc, int doc_line_number)
|
||||
{
|
||||
bool is_constant = true;
|
||||
int lineno = doc_line_number;
|
||||
int bufmax = strlen (doc) + 1;
|
||||
char *buffer = xmalloc (bufmax);
|
||||
int bufpos;
|
||||
|
||||
heredoc_continued:
|
||||
bufpos = 0;
|
||||
for (;;)
|
||||
{
|
||||
char c = *doc++;
|
||||
if (c == '\0')
|
||||
break;
|
||||
if (c == '\n')
|
||||
lineno++;
|
||||
if (c == '$')
|
||||
{
|
||||
c = *doc++;
|
||||
if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
|
||||
|| c == '_' || c >= 0x7f)
|
||||
{
|
||||
/* String with variables. */
|
||||
is_constant = false;
|
||||
continue;
|
||||
}
|
||||
if (c == '{')
|
||||
/* Heredoc string with embedded expressions. */
|
||||
goto heredoc_with_embedded_expressions;
|
||||
--doc;
|
||||
c = '$';
|
||||
}
|
||||
if (c == '{')
|
||||
{
|
||||
c = *doc++;
|
||||
if (c == '$')
|
||||
/* Heredoc string with embedded expressions. */
|
||||
goto heredoc_with_embedded_expressions;
|
||||
--doc;
|
||||
c = '{';
|
||||
}
|
||||
if (c == '\\')
|
||||
{
|
||||
int n, j;
|
||||
|
||||
c = *doc++;
|
||||
switch (c)
|
||||
{
|
||||
case '\\':
|
||||
case '$':
|
||||
break;
|
||||
|
||||
case '0': case '1': case '2': case '3':
|
||||
case '4': case '5': case '6': case '7':
|
||||
n = 0;
|
||||
for (j = 0; j < 3; ++j)
|
||||
{
|
||||
n = n * 8 + c - '0';
|
||||
c = *doc++;
|
||||
switch (c)
|
||||
{
|
||||
default:
|
||||
break;
|
||||
|
||||
case '0': case '1': case '2': case '3':
|
||||
case '4': case '5': case '6': case '7':
|
||||
continue;
|
||||
}
|
||||
break;
|
||||
}
|
||||
--doc;
|
||||
c = n;
|
||||
break;
|
||||
|
||||
case 'x':
|
||||
n = 0;
|
||||
for (j = 0; j < 2; ++j)
|
||||
{
|
||||
c = *doc++;
|
||||
switch (c)
|
||||
{
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
n = n * 16 + c - '0';
|
||||
break;
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E':
|
||||
case 'F':
|
||||
n = n * 16 + 10 + c - 'A';
|
||||
break;
|
||||
case 'a': case 'b': case 'c': case 'd': case 'e':
|
||||
case 'f':
|
||||
n = n * 16 + 10 + c - 'a';
|
||||
break;
|
||||
default:
|
||||
--doc;
|
||||
c = 0;
|
||||
break;
|
||||
}
|
||||
if (c == 0)
|
||||
break;
|
||||
}
|
||||
if (j == 0)
|
||||
{
|
||||
--doc;
|
||||
c = '\\';
|
||||
}
|
||||
else
|
||||
c = n;
|
||||
break;
|
||||
|
||||
case 'n':
|
||||
c = '\n';
|
||||
break;
|
||||
case 't':
|
||||
c = '\t';
|
||||
break;
|
||||
case 'r':
|
||||
c = '\r';
|
||||
break;
|
||||
|
||||
default:
|
||||
--doc;
|
||||
c = '\\';
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (bufpos >= bufmax)
|
||||
{
|
||||
bufmax = 2 * bufmax + 10;
|
||||
buffer = xrealloc (buffer, bufmax);
|
||||
}
|
||||
buffer[bufpos++] = c;
|
||||
}
|
||||
if (bufpos >= bufmax)
|
||||
{
|
||||
bufmax = bufmax + 1;
|
||||
buffer = xrealloc (buffer, bufmax);
|
||||
}
|
||||
buffer[bufpos] = '\0';
|
||||
if (is_constant)
|
||||
return buffer;
|
||||
else
|
||||
{
|
||||
free (buffer);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
heredoc_with_embedded_expressions:
|
||||
is_constant = false;
|
||||
{
|
||||
size_t nesting_stack_alloc = 10;
|
||||
char *nesting_stack = xmalloc (nesting_stack_alloc);
|
||||
size_t nesting_stack_depth = 0;
|
||||
/* We just read a '{', so expect a matching '}'. */
|
||||
nesting_stack[nesting_stack_depth++] = '}';
|
||||
|
||||
/* Find the extent of the expression. */
|
||||
bufpos = 0;
|
||||
for (;;)
|
||||
{
|
||||
char c = *doc;
|
||||
if (c == '\0')
|
||||
{
|
||||
if (nesting_stack_depth > 0)
|
||||
if_error (IF_SEVERITY_WARNING,
|
||||
logical_file_name, lineno, (size_t)(-1), false,
|
||||
_("unterminated expression in heredoc, expected a '%c'"),
|
||||
nesting_stack[nesting_stack_depth - 1]);
|
||||
break;
|
||||
}
|
||||
doc++;
|
||||
if (c == '\n')
|
||||
lineno++;
|
||||
if (c == '{' || c == '[' || c == '(')
|
||||
{
|
||||
if (nesting_stack_depth >= nesting_stack_alloc)
|
||||
{
|
||||
nesting_stack_alloc = 2 * nesting_stack_alloc;
|
||||
nesting_stack =
|
||||
xrealloc (nesting_stack, nesting_stack_alloc);
|
||||
}
|
||||
nesting_stack[nesting_stack_depth++] =
|
||||
(c == '{' ? '}' : c == '[' ? ']' : ')');
|
||||
}
|
||||
else if (c == '}' || c == ']' || c == ')')
|
||||
{
|
||||
if (nesting_stack_depth > 0
|
||||
&& c == nesting_stack[nesting_stack_depth - 1])
|
||||
{
|
||||
if (--nesting_stack_depth == 0)
|
||||
break;
|
||||
}
|
||||
else
|
||||
if_error (IF_SEVERITY_WARNING,
|
||||
logical_file_name, lineno, (size_t)(-1), false,
|
||||
_("unterminated expression in heredoc contains unbalanced '%c'"),
|
||||
c);
|
||||
}
|
||||
if (bufpos >= bufmax)
|
||||
{
|
||||
bufmax = 2 * bufmax + 10;
|
||||
buffer = xrealloc (buffer, bufmax);
|
||||
}
|
||||
buffer[bufpos++] = c;
|
||||
}
|
||||
|
||||
/* Recursively extract messages from the expression. */
|
||||
char *substring = xmalloc (bufpos);
|
||||
memcpy (substring, buffer, bufpos);
|
||||
|
||||
struct php_extractor *rxp = XMALLOC (struct php_extractor);
|
||||
rxp->mlp = xp->mlp;
|
||||
rxp->fp = NULL;
|
||||
rxp->input = substring;
|
||||
rxp->input_end = substring + bufpos;
|
||||
rxp->line_number = xp->line_number;
|
||||
php_extractor_init_rest (rxp);
|
||||
|
||||
extract_php_input (rxp);
|
||||
|
||||
free (rxp);
|
||||
free (substring);
|
||||
free (nesting_stack);
|
||||
}
|
||||
goto heredoc_continued;
|
||||
}
|
||||
|
||||
static void
|
||||
phase4_get (struct php_extractor *xp, token_ty *tp)
|
||||
{
|
||||
@ -1155,7 +1393,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
|
||||
tp->type = token_type_other;
|
||||
{
|
||||
size_t nesting_stack_alloc = 10;
|
||||
char *nesting_stack = malloc (nesting_stack_alloc);
|
||||
char *nesting_stack = xmalloc (nesting_stack_alloc);
|
||||
size_t nesting_stack_depth = 0;
|
||||
/* We just read a '{', so expect a matching '}'. */
|
||||
nesting_stack[nesting_stack_depth++] = '}';
|
||||
@ -1307,9 +1545,7 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
|
||||
int c3 = phase1_getc (xp);
|
||||
if (c3 == '<')
|
||||
{
|
||||
int label_start = 0;
|
||||
|
||||
/* Start of here and now document.
|
||||
/* Start of heredoc or nowdoc.
|
||||
Parse whitespace, then label, then newline. */
|
||||
do
|
||||
c = phase3_getc (xp);
|
||||
@ -1330,51 +1566,201 @@ phase4_get (struct php_extractor *xp, token_ty *tp)
|
||||
/* buffer[0..bufpos-1] now contains the label
|
||||
(including single or double quotes). */
|
||||
|
||||
if (*buffer == '\'' || *buffer == '"')
|
||||
int doc_line_number = xp->line_number;
|
||||
|
||||
bool heredoc = true;
|
||||
int label_start = 0;
|
||||
int label_end = bufpos;
|
||||
if (bufpos >= 2
|
||||
&& ((buffer[label_start] == '\'' && buffer[label_end - 1] == '\'')
|
||||
|| (buffer[label_start] == '"' && buffer[label_end - 1] == '"')))
|
||||
{
|
||||
heredoc = (buffer[label_start] == '"');
|
||||
label_start++;
|
||||
bufpos--;
|
||||
label_end--;
|
||||
}
|
||||
|
||||
/* Now skip the here document. */
|
||||
/* Now read the heredoc or nowdoc. */
|
||||
size_t doc_alloc = 10;
|
||||
char *doc = xmalloc (doc_alloc);
|
||||
size_t doc_len = 0;
|
||||
size_t doc_start_of_line = 0;
|
||||
|
||||
/* These two variables keep track of the matching of the
|
||||
end label. */
|
||||
int in_label_pos = -1; /* <= label_end - label_start */
|
||||
int end_label_indent = 0;
|
||||
|
||||
for (;;)
|
||||
{
|
||||
c = phase1_getc (xp);
|
||||
if (c == EOF)
|
||||
break;
|
||||
if (c == '\n' || c == '\r')
|
||||
{
|
||||
int bufidx = label_start;
|
||||
|
||||
while (bufidx < bufpos)
|
||||
if (doc_len >= doc_alloc)
|
||||
{
|
||||
doc_alloc = 2 * doc_alloc + 10;
|
||||
doc = xrealloc (doc, doc_alloc);
|
||||
}
|
||||
doc[doc_len++] = c;
|
||||
|
||||
if (c == '\n')
|
||||
doc_start_of_line = doc_len;
|
||||
|
||||
/* Incrementally match the label. */
|
||||
if (in_label_pos == 0 && (c == ' ' || c == '\t'))
|
||||
{
|
||||
if (c == '\t')
|
||||
end_label_indent |= TAB_WIDTH - 1;
|
||||
end_label_indent++;
|
||||
}
|
||||
else if (in_label_pos >= 0
|
||||
&& in_label_pos < label_end - label_start
|
||||
&& c == buffer[label_start + in_label_pos])
|
||||
{
|
||||
in_label_pos++;
|
||||
}
|
||||
else if (in_label_pos == label_end - label_start)
|
||||
{
|
||||
switch (c)
|
||||
{
|
||||
c = phase1_getc (xp);
|
||||
if (c == EOF)
|
||||
break;
|
||||
if (c != buffer[bufidx])
|
||||
{
|
||||
phase1_ungetc (xp, c);
|
||||
break;
|
||||
}
|
||||
bufidx++;
|
||||
case 'A': case 'B': case 'C': case 'D': case 'E':
|
||||
case 'F': case 'G': case 'H': case 'I': case 'J':
|
||||
case 'K': case 'L': case 'M': case 'N': case 'O':
|
||||
case 'P': case 'Q': case 'R': case 'S': case 'T':
|
||||
case 'U': case 'V': case 'W': case 'X': case 'Y':
|
||||
case 'Z':
|
||||
case '_':
|
||||
case 'a': case 'b': case 'c': case 'd': case 'e':
|
||||
case 'f': case 'g': case 'h': case 'i': case 'j':
|
||||
case 'k': case 'l': case 'm': case 'n': case 'o':
|
||||
case 'p': case 'q': case 'r': case 's': case 't':
|
||||
case 'u': case 'v': case 'w': case 'x': case 'y':
|
||||
case 'z':
|
||||
case '0': case '1': case '2': case '3': case '4':
|
||||
case '5': case '6': case '7': case '8': case '9':
|
||||
case 128: case 129: case 130: case 131: case 132:
|
||||
case 133: case 134: case 135: case 136: case 137:
|
||||
case 138: case 139: case 140: case 141: case 142:
|
||||
case 143: case 144: case 145: case 146: case 147:
|
||||
case 148: case 149: case 150: case 151: case 152:
|
||||
case 153: case 154: case 155: case 156: case 157:
|
||||
case 158: case 159: case 160: case 161: case 162:
|
||||
case 163: case 164: case 165: case 166: case 167:
|
||||
case 168: case 169: case 170: case 171: case 172:
|
||||
case 173: case 174: case 175: case 176: case 177:
|
||||
case 178: case 179: case 180: case 181: case 182:
|
||||
case 183: case 184: case 185: case 186: case 187:
|
||||
case 188: case 189: case 190: case 191: case 192:
|
||||
case 193: case 194: case 195: case 196: case 197:
|
||||
case 198: case 199: case 200: case 201: case 202:
|
||||
case 203: case 204: case 205: case 206: case 207:
|
||||
case 208: case 209: case 210: case 211: case 212:
|
||||
case 213: case 214: case 215: case 216: case 217:
|
||||
case 218: case 219: case 220: case 221: case 222:
|
||||
case 223: case 224: case 225: case 226: case 227:
|
||||
case 228: case 229: case 230: case 231: case 232:
|
||||
case 233: case 234: case 235: case 236: case 237:
|
||||
case 238: case 239: case 240: case 241: case 242:
|
||||
case 243: case 244: case 245: case 246: case 247:
|
||||
case 248: case 249: case 250: case 251: case 252:
|
||||
case 253: case 254: case 255:
|
||||
in_label_pos = -1;
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (bufidx == bufpos)
|
||||
if (in_label_pos >= 0)
|
||||
{
|
||||
c = phase1_getc (xp);
|
||||
if (c != ';')
|
||||
phase1_ungetc (xp, c);
|
||||
c = phase1_getc (xp);
|
||||
if (c == '\n' || c == '\r')
|
||||
break;
|
||||
/* Finished recognizing the label. */
|
||||
phase1_ungetc (xp, c);
|
||||
break;
|
||||
}
|
||||
}
|
||||
else if (c == '\n' || c == '\r')
|
||||
{
|
||||
in_label_pos = 0;
|
||||
end_label_indent = 0;
|
||||
}
|
||||
else
|
||||
{
|
||||
in_label_pos = -1;
|
||||
end_label_indent = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* FIXME: Ideally we should turn the here document into a
|
||||
string literal if it didn't contain $ substitution. And
|
||||
we should also respect backslash escape sequences like
|
||||
in double-quoted strings. */
|
||||
tp->type = token_type_other;
|
||||
/* The contents is the substring
|
||||
[doc, doc + doc_start_of_line). */
|
||||
doc_len = doc_start_of_line;
|
||||
|
||||
/* Discard leading indentation. */
|
||||
if (end_label_indent > 0)
|
||||
{
|
||||
/* Scan through the doc string, copying *q = *p. */
|
||||
const char *p;
|
||||
char *q = doc;
|
||||
int curr_line_indent = 0;
|
||||
|
||||
for (p = doc; p < doc + doc_len; p++)
|
||||
{
|
||||
/* Invariant: doc <= q <= p <= doc + doc_len. */
|
||||
char c = *p;
|
||||
*q++ = c;
|
||||
if (curr_line_indent < end_label_indent)
|
||||
{
|
||||
if (c == ' ')
|
||||
{
|
||||
curr_line_indent++;
|
||||
--q;
|
||||
}
|
||||
else if (c == '\t')
|
||||
{
|
||||
curr_line_indent |= TAB_WIDTH - 1;
|
||||
curr_line_indent++;
|
||||
if (curr_line_indent <= end_label_indent)
|
||||
--q;
|
||||
}
|
||||
}
|
||||
if (c == '\n')
|
||||
curr_line_indent = 0;
|
||||
}
|
||||
doc_len = q - doc;
|
||||
}
|
||||
|
||||
/* Discard the trailing newline. */
|
||||
if (doc_len > 0 && doc[doc_len - 1] == '\n')
|
||||
{
|
||||
--doc_len;
|
||||
if (doc_len > 0 && doc[doc_len - 1] == '\r')
|
||||
--doc_len;
|
||||
}
|
||||
|
||||
/* NUL-terminate it. */
|
||||
if (doc_len >= doc_alloc)
|
||||
{
|
||||
doc_alloc = doc_alloc + 1;
|
||||
doc = xrealloc (doc, doc_alloc);
|
||||
}
|
||||
doc[doc_len++] = '\0';
|
||||
|
||||
/* For a here document, do the same processing as in
|
||||
double-quoted strings (see above). */
|
||||
if (heredoc)
|
||||
{
|
||||
char *processed_doc =
|
||||
process_heredoc (xp, doc, doc_line_number);
|
||||
free (doc);
|
||||
doc = processed_doc;
|
||||
}
|
||||
|
||||
if (doc != NULL)
|
||||
{
|
||||
tp->type = token_type_string_literal;
|
||||
tp->string = doc;
|
||||
tp->comment = add_reference (savable_comment);
|
||||
}
|
||||
else
|
||||
tp->type = token_type_other;
|
||||
return;
|
||||
}
|
||||
phase1_ungetc (xp, c3);
|
||||
|
||||
@ -25,6 +25,22 @@ echo _("embedded_2_${foo}bar");
|
||||
echo _("embedded_3_{$foo}bar");
|
||||
echo _("embedded_4_{$array[func(_('embedded_4_sub1'))]}_bar_{$array[func(_('embedded_4_sub2'))]}_baz");
|
||||
echo _("embedded_5");
|
||||
// Heredoc with embedded expressions.
|
||||
echo _(<<<EOT
|
||||
embedded_6_$foo bar
|
||||
EOT);
|
||||
echo _(<<<EOT
|
||||
embedded_7_${foo}bar
|
||||
EOT);
|
||||
echo _(<<<EOT
|
||||
embedded_8_{$foo}bar
|
||||
EOT);
|
||||
echo _(<<<EOT
|
||||
embedded_9_{$array[func(_('embedded_9_sub1'))]}_bar_{$array[func(_('embedded_9_sub2'))]}_baz
|
||||
EOT);
|
||||
echo _(<<<EOT
|
||||
embedded_10
|
||||
EOT);
|
||||
?>
|
||||
EOF
|
||||
|
||||
@ -62,6 +78,15 @@ msgstr ""
|
||||
|
||||
msgid "embedded_5"
|
||||
msgstr ""
|
||||
|
||||
msgid "embedded_9_sub1"
|
||||
msgstr ""
|
||||
|
||||
msgid "embedded_9_sub2"
|
||||
msgstr ""
|
||||
|
||||
msgid "embedded_10"
|
||||
msgstr ""
|
||||
EOF
|
||||
|
||||
: ${DIFF=diff}
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
#!/bin/sh
|
||||
. "${srcdir=.}/init.sh"; path_prepend_ . ../src
|
||||
|
||||
# Test PHP support: here documents.
|
||||
# Test PHP support: heredoc strings, nowdoc strings.
|
||||
|
||||
cat <<\EOF > xg-ph-2.php
|
||||
<?
|
||||
@ -36,6 +36,20 @@ Odoakar
|
||||
Theoderich
|
||||
EOTMARKER
|
||||
echo _("Franks");
|
||||
echo _(<<<EOTMARKER
|
||||
Chlodovechus Rex Francorum
|
||||
Carolus Rex Francorum et Langobardorum, Magnus Imperator Romanorum
|
||||
EOTMARKER);
|
||||
echo _("Hohenstaufens");
|
||||
echo _(<<<"EOTMARKER"
|
||||
Fridericus Imperator Romani Imperii
|
||||
Fridericus II Imperator
|
||||
EOTMARKER);
|
||||
echo _("French");
|
||||
echo _(<<<'EOTMARKER'
|
||||
Louis XIV
|
||||
Napoleon Bonaparte
|
||||
EOTMARKER);
|
||||
?>
|
||||
EOF
|
||||
|
||||
@ -70,8 +84,32 @@ msgstr ""
|
||||
msgid "Romans"
|
||||
msgstr ""
|
||||
|
||||
msgid "Goths"
|
||||
msgstr ""
|
||||
|
||||
msgid "Franks"
|
||||
msgstr ""
|
||||
|
||||
msgid ""
|
||||
"Chlodovechus Rex Francorum\n"
|
||||
"Carolus Rex Francorum et Langobardorum, Magnus Imperator Romanorum"
|
||||
msgstr ""
|
||||
|
||||
msgid "Hohenstaufens"
|
||||
msgstr ""
|
||||
|
||||
msgid ""
|
||||
" Fridericus Imperator Romani Imperii\n"
|
||||
" Fridericus II Imperator"
|
||||
msgstr ""
|
||||
|
||||
msgid "French"
|
||||
msgstr ""
|
||||
|
||||
msgid ""
|
||||
"Louis XIV\n"
|
||||
"Napoleon Bonaparte"
|
||||
msgstr ""
|
||||
EOF
|
||||
|
||||
: ${DIFF=diff}
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user