xgettext: Scheme: #!fold-case and #!no-fold-case directives.

Reported by Florent Angly <florent.angly@gmail.com>
at <https://savannah.gnu.org/bugs/?61987>.

* autogen.sh (GNULIB_MODULES_TOOLS_FOR_SRC): Add unicase/u8-casefold,
uninorm/nfc.
* gettext-tools/src/x-scheme.c: Include unicase.h, uninorm.h.
(SIZEOF): New macro.
(phase0_getc): Renamed from do_getc. Remove line_number handling.
(phase0_ungetc): Renamed from do_ungetc. Remove line_number handling.
(MAX_PHASE1_PUSHBACK): New macro.
(phase1_pushback, phase1_pushback_length): New variables.
(phase1_getc, phase1_ungetc): New functions.
(casefold): New variable.
(read_token): Use phase1_getc, phase1_ungetc instead of do_getc, do_ungetc.
(read_object): Likewise. If casefold is true, apply Unicode case-folding to
symbols before looking them up in the hash table. Recognize all #! directives
supported by Guile.
(extract_whole_file): Initialize phase1_pushback_length, casefold.
* gettext-tools/tests/xgettext-scheme-6: New file.
* gettext-tools/tests/Makefile.am (TESTS): Add it.
This commit is contained in:
Bruno Haible 2024-08-16 22:36:30 +02:00
parent 2c8f439035
commit d5cd59cbc6
4 changed files with 272 additions and 68 deletions

View File

@ -250,10 +250,12 @@ if ! $skip_gnulib; then
sys_stat
sys_time
trim
unicase/u8-casefold
unictype/ctype-space
unictype/syntax-java-whitespace
unilbrk/ulc-width-linebreaks
uniname/uniname
uninorm/nfc
unistd
unistr/u8-check
unistr/u8-mbtouc

View File

@ -33,6 +33,8 @@
#include "attribute.h"
#include "message.h"
#include "xgettext.h"
#include "unicase.h"
#include "uninorm.h"
#include "xg-pos.h"
#include "xg-mixed-string.h"
#include "xg-arglist-context.h"
@ -46,6 +48,8 @@
#define _(s) gettext(s)
#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
/* The Scheme syntax is described in R5RS and following standards:
- R5RS: https://conservatory.scheme.org/schemers/Documents/Standards/R5RS/HTML/
@ -75,6 +79,13 @@
- The syntax code assigned to each character, and how tokens are built
up from characters (single escape, multiple escape etc.).
- Directives:
#!r6rs (see R6RS § 4) turns R6RS compliance on
#!fold-case (see R7RS § 2.1) turns case-folding of identifiers on
#!no-fold-case (see R7RS § 2.1) turns case-folding of identifiers off
#!curly-infix (guile specific)
#!curly-infix-and-bracket-lists (guile specific)
- Comment syntax:
';' up to end of line
'#;' <datum> (see R6RS § 4.2.3, R7RS § 2.2)
@ -173,7 +184,7 @@ static FILE *fp;
/* Fetch the next character from the input file. */
static int
do_getc ()
phase0_getc ()
{
int c = getc (fp);
@ -183,22 +194,59 @@ do_getc ()
error (EXIT_FAILURE, errno,
_("error while reading \"%s\""), real_file_name);
}
else if (c == '\n')
line_number++;
return c;
}
/* Put back the last fetched character, not EOF. */
static void
do_ungetc (int c)
MAYBE_UNUSED static void
phase0_ungetc (int c)
{
if (c == '\n')
line_number--;
ungetc (c, fp);
}
/* 1. line_number handling. */
/* Maximum used.
Must be larger than the longest possible directive. */
#define MAX_PHASE1_PUSHBACK 32
static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
static int phase1_pushback_length;
/* Read the next single character from the input file. */
static int
phase1_getc ()
{
int c;
if (phase1_pushback_length)
c = phase1_pushback[--phase1_pushback_length];
else
c = phase0_getc ();
if (c == '\n')
++line_number;
return c;
}
/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */
static void
phase1_ungetc (int c)
{
if (c != EOF)
{
if (c == '\n')
--line_number;
if (phase1_pushback_length == SIZEOF (phase1_pushback))
abort ();
phase1_pushback[phase1_pushback_length++] = c;
}
}
/* ========================== Reading of tokens. ========================== */
@ -206,6 +254,10 @@ do_ungetc (int c)
False to follow R6RS and R7RS. */
static bool follow_guile;
/* True if all read identifiers are to be casefolded, i.e. essentially mapped
to lower case. */
static bool casefold;
/* A token consists of a sequence of characters. */
struct token
{
@ -253,14 +305,14 @@ read_token (struct token *tp, int first)
for (;;)
{
int c = do_getc ();
int c = phase1_getc ();
if (c == EOF)
break;
if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
|| c == '"' || c == '(' || c == ')' || c == ';')
{
do_ungetc (c);
phase1_ungetc (c);
break;
}
grow_token (tp);
@ -708,7 +760,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
_("too deeply nested objects"));
for (;;)
{
int ch = do_getc ();
int ch = phase1_getc ();
bool seen_underscore_prefix = false;
switch (ch)
@ -736,7 +788,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
comment_start ();
for (;;)
{
int c = do_getc ();
int c = phase1_getc ();
if (c == EOF || c == '\n')
break;
if (c != ';')
@ -803,6 +855,29 @@ read_object (struct object *op, flag_region_ty *outer_region)
if (inner.type == t_symbol)
{
char *symbol_name = string_of_object (&inner);
if (casefold)
{
char *symbol_name_converted =
from_current_source_encoding (symbol_name,
lc_outside,
logical_file_name,
line_number);
size_t symbol_name_casefolded_len;
char *symbol_name_casefolded =
(char *)
u8_casefold ((uint8_t *) symbol_name_converted,
strlen (symbol_name_converted) + 1,
NULL, UNINORM_NFC,
NULL, &symbol_name_casefolded_len);
if (symbol_name_converted != symbol_name)
free (symbol_name_converted);
if (symbol_name_casefolded != NULL)
{
free (symbol_name);
symbol_name = symbol_name_casefolded;
}
}
void *keyword_value;
if (hash_find_entry (&keywords,
@ -864,11 +939,11 @@ read_object (struct object *op, flag_region_ty *outer_region)
case ',':
{
int c = do_getc ();
int c = phase1_getc ();
/* The ,@ handling inside lists is wrong anyway, because
,@form expands to an unknown number of elements. */
if (c != EOF && c != '@')
do_ungetc (c);
phase1_ungetc (c);
}
FALLTHROUGH;
case '\'':
@ -892,7 +967,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
case '#':
/* Dispatch macro handling. */
{
int dmc = do_getc ();
int dmc = phase1_getc ();
if (dmc == EOF)
/* Invalid input. Be tolerant, no error message. */
{
@ -903,7 +978,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
switch (dmc)
{
case '(': /* Vector */
do_ungetc (dmc);
phase1_ungetc (dmc);
{
struct object inner;
++nesting_depth;
@ -934,7 +1009,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
case 'y':
{
struct token token;
do_ungetc (dmc);
phase1_ungetc (dmc);
read_token (&token, '#');
if ((token.charcount == 2
&& (token.chars[1] == 'a' || token.chars[1] == 'c'
@ -960,9 +1035,9 @@ read_object (struct object *op, flag_region_ty *outer_region)
&& token.chars[2] == 'u'
&& token.chars[3] == '8'))))
{
int c = do_getc ();
int c = phase1_getc ();
if (c != EOF)
do_ungetc (c);
phase1_ungetc (c);
if (c == '(')
{
/* Homogenous vector syntax:
@ -1015,7 +1090,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
case 'I': case 'i':
{
struct token token;
do_ungetc (dmc);
phase1_ungetc (dmc);
read_token (&token, '#');
if (is_number (&token))
{
@ -1030,9 +1105,9 @@ read_object (struct object *op, flag_region_ty *outer_region)
if (token.charcount == 2
&& (token.chars[1] == 'e' || token.chars[1] == 'i'))
{
int c = do_getc ();
int c = phase1_getc ();
if (c != EOF)
do_ungetc (c);
phase1_ungetc (c);
if (c == '(')
{
/* Homogenous vector syntax:
@ -1080,50 +1155,124 @@ read_object (struct object *op, flag_region_ty *outer_region)
}
case '!':
/* Block comment '#! ... !#'. See
<https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>. */
/* Directive or block comment. */
{
int c;
comment_start ();
c = do_getc ();
for (;;)
const char * const directives[] =
{
if (c == EOF)
break;
if (c == '!')
"r6rs",
"fold-case",
"no-fold-case",
"curly-infix",
"curly-infix-and-bracket-lists"
};
int num_directives = SIZEOF (directives);
enum { max_directive_len = 29 };
bool seen_directive = false;
int d;
for (d = 0; d < num_directives; d++)
{
const char *directive = directives[d];
int directive_len = strlen (directive);
int c[max_directive_len];
int i;
for (i = 0; i < directive_len; i++)
{
c = do_getc ();
if (c == EOF)
break;
if (c == '#')
c[i] = phase1_getc ();
if (c[i] != directive[i])
{
comment_line_end (0);
phase1_ungetc (c[i]);
break;
}
else
comment_add ('!');
}
else
if (i == directive_len)
{
/* We skip all leading white space. */
if (!(buflen == 0 && (c == ' ' || c == '\t')))
comment_add (c);
if (c == '\n')
int e = phase1_getc ();
/* Like in read_token. */
if (e == ' '
|| e == '\r' || e == '\f' || e == '\t' || e == '\n'
|| e == '"' || e == '(' || e == ')' || e == ';')
{
comment_line_end (1);
comment_start ();
/* Seen the directive. */
phase1_ungetc (e);
seen_directive = true;
switch (d)
{
case 0: /* #!r6rs */
follow_guile = false;
break;
case 1: /* #!fold-case */
casefold = true;
break;
case 2: /* #!no-fold-case */
casefold = false;
break;
case 3: /* #!curly-infix */
case 4: /* #!curly-infix-and-bracket-lists */
if_error (IF_SEVERITY_WARNING,
logical_file_name, line_number, (size_t)(-1),
false,
_("Unsupported Guile directive \"%s\"."),
directive);
break;
default:
abort ();
}
break;
}
c = do_getc ();
phase1_ungetc (e);
}
while (i > 0)
{
i--;
phase1_ungetc (c[i]);
}
}
if (c == EOF)
if (!seen_directive)
/* Block comment '#! ... !#'. See
<https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>. */
{
/* EOF not allowed here. But be tolerant. */
op->type = t_eof;
return;
int c;
comment_start ();
c = phase1_getc ();
for (;;)
{
if (c == EOF)
break;
if (c == '!')
{
c = phase1_getc ();
if (c == EOF)
break;
if (c == '#')
{
comment_line_end (0);
break;
}
else
comment_add ('!');
}
else
{
/* We skip all leading white space. */
if (!(buflen == 0 && (c == ' ' || c == '\t')))
comment_add (c);
if (c == '\n')
{
comment_line_end (1);
comment_start ();
}
c = phase1_getc ();
}
}
if (c == EOF)
{
/* EOF not allowed here. But be tolerant. */
op->type = t_eof;
return;
}
last_comment_line = line_number;
}
last_comment_line = line_number;
continue;
}
@ -1136,14 +1285,14 @@ read_object (struct object *op, flag_region_ty *outer_region)
int c;
comment_start ();
c = do_getc ();
c = phase1_getc ();
for (;;)
{
if (c == EOF)
break;
if (c == '|')
{
c = do_getc ();
c = phase1_getc ();
if (c == EOF)
break;
if (c == '#')
@ -1156,14 +1305,14 @@ read_object (struct object *op, flag_region_ty *outer_region)
depth--;
comment_add ('|');
comment_add ('#');
c = do_getc ();
c = phase1_getc ();
}
else
comment_add ('|');
}
else if (c == '#')
{
c = do_getc ();
c = phase1_getc ();
if (c == EOF)
break;
comment_add ('#');
@ -1171,7 +1320,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
{
depth++;
comment_add ('|');
c = do_getc ();
c = phase1_getc ();
}
}
else
@ -1184,7 +1333,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
comment_line_end (1);
comment_start ();
}
c = do_getc ();
c = phase1_getc ();
}
}
if (c == EOF)
@ -1219,23 +1368,23 @@ read_object (struct object *op, flag_region_ty *outer_region)
for (;;)
{
int c = do_getc ();
int c = phase1_getc ();
if (c == EOF)
break;
if (c == '\\')
{
c = do_getc ();
c = phase1_getc ();
if (c == EOF)
break;
}
else if (c == '}')
{
c = do_getc ();
c = phase1_getc ();
if (c == '#')
break;
if (c != EOF)
do_ungetc (c);
phase1_ungetc (c);
c = '}';
}
grow_token (op->token);
@ -1251,7 +1400,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
/* Character. */
{
struct token token;
int c = do_getc ();
int c = phase1_getc ();
if (c != EOF)
{
read_token (&token, c);
@ -1285,7 +1434,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
{
int c;
do
c = do_getc ();
c = phase1_getc ();
while (c >= '0' && c <= '9');
/* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
But be tolerant. */
@ -1321,7 +1470,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
/* GIMP script-fu extension: '_' before a string literal is
considered a gettext call on the string. */
{
int c = do_getc ();
int c = phase1_getc ();
if (c == EOF)
/* Invalid input. Be tolerant, no error message. */
{
@ -1330,7 +1479,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
}
if (c != '"')
{
do_ungetc (c);
phase1_ungetc (c);
/* If '_' is not followed by a string literal,
consider it a part of symbol. */
@ -1351,7 +1500,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
op->line_number_at_start = line_number;
for (;;)
{
int c = do_getc ();
int c = phase1_getc ();
if (c == EOF)
/* Invalid input. Be tolerant, no error message. */
break;
@ -1359,7 +1508,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
break;
if (c == '\\')
{
c = do_getc ();
c = phase1_getc ();
if (c == EOF)
/* Invalid input. Be tolerant, no error message. */
break;
@ -1464,6 +1613,10 @@ extract_whole_file (FILE *f,
logical_file_name = xstrdup (logical_filename);
line_number = 1;
phase1_pushback_length = 0;
casefold = false;
last_comment_line = -1;
last_non_comment_line = -1;

View File

@ -153,7 +153,7 @@ TESTS = gettext-1 gettext-2 \
xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \
xgettext-ruby-1 \
xgettext-scheme-1 xgettext-scheme-2 xgettext-scheme-3 \
xgettext-scheme-4 xgettext-scheme-5 \
xgettext-scheme-4 xgettext-scheme-5 xgettext-scheme-6 \
xgettext-scheme-format-1 xgettext-scheme-format-2 \
xgettext-scheme-stackovfl-1 xgettext-scheme-stackovfl-2 \
xgettext-sh-1 xgettext-sh-2 xgettext-sh-3 xgettext-sh-4 xgettext-sh-5 \

View File

@ -0,0 +1,49 @@
#!/bin/sh
. "${srcdir=.}/init.sh"; path_prepend_ . ../src
# Test Scheme support: #!fold-case and #!no-fold-case directives.
cat <<\EOF > xg-sc-7.scm
(display (gettext "orange"))
(Display (GetText "mango"))
#!fold-case
(Display (GetText "apple"))
(DISPLAY (GETTEXT "banana"))
#!fold-case
(Display (GetText "pear"))
(FORMAT T (GETTEXT "Got ~D dollars" n))
#!no-fold-case
(display (gettext "cherry"))
(Display (GetText "plum"))
(FORMAT T (GETTEXT "Got ~D euros" n))
EOF
: ${XGETTEXT=xgettext}
${XGETTEXT} --omit-header --no-location -d xg-sc-7 xg-sc-7.scm || Exit 1
cat <<\EOF > xg-sc-7.ok
msgid "orange"
msgstr ""
msgid "apple"
msgstr ""
msgid "banana"
msgstr ""
msgid "pear"
msgstr ""
#, scheme-format
msgid "Got ~D dollars"
msgstr ""
msgid "cherry"
msgstr ""
EOF
: ${DIFF=diff}
${DIFF} xg-sc-7.ok xg-sc-7.po
result=$?
exit $result