From d5cd59cbc63f2879373d84428debf8b79e94e4f7 Mon Sep 17 00:00:00 2001 From: Bruno Haible Date: Fri, 16 Aug 2024 22:36:30 +0200 Subject: [PATCH] xgettext: Scheme: #!fold-case and #!no-fold-case directives. Reported by Florent Angly at . * autogen.sh (GNULIB_MODULES_TOOLS_FOR_SRC): Add unicase/u8-casefold, uninorm/nfc. * gettext-tools/src/x-scheme.c: Include unicase.h, uninorm.h. (SIZEOF): New macro. (phase0_getc): Renamed from do_getc. Remove line_number handling. (phase0_ungetc): Renamed from do_ungetc. Remove line_number handling. (MAX_PHASE1_PUSHBACK): New macro. (phase1_pushback, phase1_pushback_length): New variables. (phase1_getc, phase1_ungetc): New functions. (casefold): New variable. (read_token): Use phase1_getc, phase1_ungetc instead of do_getc, do_ungetc. (read_object): Likewise. If casefold is true, apply Unicode case-folding to symbols before looking them up in the hash table. Recognize all #! directives supported by Guile. (extract_whole_file): Initialize phase1_pushback_length, casefold. * gettext-tools/tests/xgettext-scheme-6: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it. --- autogen.sh | 2 + gettext-tools/src/x-scheme.c | 287 ++++++++++++++++++++------ gettext-tools/tests/Makefile.am | 2 +- gettext-tools/tests/xgettext-scheme-6 | 49 +++++ 4 files changed, 272 insertions(+), 68 deletions(-) create mode 100755 gettext-tools/tests/xgettext-scheme-6 diff --git a/autogen.sh b/autogen.sh index 9d092645b..b69a27b1d 100755 --- a/autogen.sh +++ b/autogen.sh @@ -250,10 +250,12 @@ if ! $skip_gnulib; then sys_stat sys_time trim + unicase/u8-casefold unictype/ctype-space unictype/syntax-java-whitespace unilbrk/ulc-width-linebreaks uniname/uniname + uninorm/nfc unistd unistr/u8-check unistr/u8-mbtouc diff --git a/gettext-tools/src/x-scheme.c b/gettext-tools/src/x-scheme.c index c32af1fa5..601fa760f 100644 --- a/gettext-tools/src/x-scheme.c +++ b/gettext-tools/src/x-scheme.c @@ -33,6 +33,8 @@ #include "attribute.h" #include "message.h" #include "xgettext.h" +#include "unicase.h" +#include "uninorm.h" #include "xg-pos.h" #include "xg-mixed-string.h" #include "xg-arglist-context.h" @@ -46,6 +48,8 @@ #define _(s) gettext(s) +#define SIZEOF(a) (sizeof(a) / sizeof(a[0])) + /* The Scheme syntax is described in R5RS and following standards: - R5RS: https://conservatory.scheme.org/schemers/Documents/Standards/R5RS/HTML/ @@ -75,6 +79,13 @@ - The syntax code assigned to each character, and how tokens are built up from characters (single escape, multiple escape etc.). + - Directives: + #!r6rs (see R6RS § 4) turns R6RS compliance on + #!fold-case (see R7RS § 2.1) turns case-folding of identifiers on + #!no-fold-case (see R7RS § 2.1) turns case-folding of identifiers off + #!curly-infix (guile specific) + #!curly-infix-and-bracket-lists (guile specific) + - Comment syntax: ';' up to end of line '#;' (see R6RS § 4.2.3, R7RS § 2.2) @@ -173,7 +184,7 @@ static FILE *fp; /* Fetch the next character from the input file. */ static int -do_getc () +phase0_getc () { int c = getc (fp); @@ -183,22 +194,59 @@ do_getc () error (EXIT_FAILURE, errno, _("error while reading \"%s\""), real_file_name); } - else if (c == '\n') - line_number++; return c; } /* Put back the last fetched character, not EOF. */ -static void -do_ungetc (int c) +MAYBE_UNUSED static void +phase0_ungetc (int c) { - if (c == '\n') - line_number--; ungetc (c, fp); } +/* 1. line_number handling. */ + +/* Maximum used. + Must be larger than the longest possible directive. */ +#define MAX_PHASE1_PUSHBACK 32 +static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK]; +static int phase1_pushback_length; + +/* Read the next single character from the input file. */ +static int +phase1_getc () +{ + int c; + + if (phase1_pushback_length) + c = phase1_pushback[--phase1_pushback_length]; + else + c = phase0_getc (); + + if (c == '\n') + ++line_number; + + return c; +} + +/* Supports MAX_PHASE1_PUSHBACK characters of pushback. */ +static void +phase1_ungetc (int c) +{ + if (c != EOF) + { + if (c == '\n') + --line_number; + + if (phase1_pushback_length == SIZEOF (phase1_pushback)) + abort (); + phase1_pushback[phase1_pushback_length++] = c; + } +} + + /* ========================== Reading of tokens. ========================== */ @@ -206,6 +254,10 @@ do_ungetc (int c) False to follow R6RS and R7RS. */ static bool follow_guile; +/* True if all read identifiers are to be casefolded, i.e. essentially mapped + to lower case. */ +static bool casefold; + /* A token consists of a sequence of characters. */ struct token { @@ -253,14 +305,14 @@ read_token (struct token *tp, int first) for (;;) { - int c = do_getc (); + int c = phase1_getc (); if (c == EOF) break; if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n' || c == '"' || c == '(' || c == ')' || c == ';') { - do_ungetc (c); + phase1_ungetc (c); break; } grow_token (tp); @@ -708,7 +760,7 @@ read_object (struct object *op, flag_region_ty *outer_region) _("too deeply nested objects")); for (;;) { - int ch = do_getc (); + int ch = phase1_getc (); bool seen_underscore_prefix = false; switch (ch) @@ -736,7 +788,7 @@ read_object (struct object *op, flag_region_ty *outer_region) comment_start (); for (;;) { - int c = do_getc (); + int c = phase1_getc (); if (c == EOF || c == '\n') break; if (c != ';') @@ -803,6 +855,29 @@ read_object (struct object *op, flag_region_ty *outer_region) if (inner.type == t_symbol) { char *symbol_name = string_of_object (&inner); + if (casefold) + { + char *symbol_name_converted = + from_current_source_encoding (symbol_name, + lc_outside, + logical_file_name, + line_number); + size_t symbol_name_casefolded_len; + char *symbol_name_casefolded = + (char *) + u8_casefold ((uint8_t *) symbol_name_converted, + strlen (symbol_name_converted) + 1, + NULL, UNINORM_NFC, + NULL, &symbol_name_casefolded_len); + if (symbol_name_converted != symbol_name) + free (symbol_name_converted); + if (symbol_name_casefolded != NULL) + { + free (symbol_name); + symbol_name = symbol_name_casefolded; + } + } + void *keyword_value; if (hash_find_entry (&keywords, @@ -864,11 +939,11 @@ read_object (struct object *op, flag_region_ty *outer_region) case ',': { - int c = do_getc (); + int c = phase1_getc (); /* The ,@ handling inside lists is wrong anyway, because ,@form expands to an unknown number of elements. */ if (c != EOF && c != '@') - do_ungetc (c); + phase1_ungetc (c); } FALLTHROUGH; case '\'': @@ -892,7 +967,7 @@ read_object (struct object *op, flag_region_ty *outer_region) case '#': /* Dispatch macro handling. */ { - int dmc = do_getc (); + int dmc = phase1_getc (); if (dmc == EOF) /* Invalid input. Be tolerant, no error message. */ { @@ -903,7 +978,7 @@ read_object (struct object *op, flag_region_ty *outer_region) switch (dmc) { case '(': /* Vector */ - do_ungetc (dmc); + phase1_ungetc (dmc); { struct object inner; ++nesting_depth; @@ -934,7 +1009,7 @@ read_object (struct object *op, flag_region_ty *outer_region) case 'y': { struct token token; - do_ungetc (dmc); + phase1_ungetc (dmc); read_token (&token, '#'); if ((token.charcount == 2 && (token.chars[1] == 'a' || token.chars[1] == 'c' @@ -960,9 +1035,9 @@ read_object (struct object *op, flag_region_ty *outer_region) && token.chars[2] == 'u' && token.chars[3] == '8')))) { - int c = do_getc (); + int c = phase1_getc (); if (c != EOF) - do_ungetc (c); + phase1_ungetc (c); if (c == '(') { /* Homogenous vector syntax: @@ -1015,7 +1090,7 @@ read_object (struct object *op, flag_region_ty *outer_region) case 'I': case 'i': { struct token token; - do_ungetc (dmc); + phase1_ungetc (dmc); read_token (&token, '#'); if (is_number (&token)) { @@ -1030,9 +1105,9 @@ read_object (struct object *op, flag_region_ty *outer_region) if (token.charcount == 2 && (token.chars[1] == 'e' || token.chars[1] == 'i')) { - int c = do_getc (); + int c = phase1_getc (); if (c != EOF) - do_ungetc (c); + phase1_ungetc (c); if (c == '(') { /* Homogenous vector syntax: @@ -1080,50 +1155,124 @@ read_object (struct object *op, flag_region_ty *outer_region) } case '!': - /* Block comment '#! ... !#'. See - . */ + /* Directive or block comment. */ { - int c; - - comment_start (); - c = do_getc (); - for (;;) + const char * const directives[] = { - if (c == EOF) - break; - if (c == '!') + "r6rs", + "fold-case", + "no-fold-case", + "curly-infix", + "curly-infix-and-bracket-lists" + }; + int num_directives = SIZEOF (directives); + enum { max_directive_len = 29 }; + bool seen_directive = false; + int d; + for (d = 0; d < num_directives; d++) + { + const char *directive = directives[d]; + int directive_len = strlen (directive); + int c[max_directive_len]; + int i; + for (i = 0; i < directive_len; i++) { - c = do_getc (); - if (c == EOF) - break; - if (c == '#') + c[i] = phase1_getc (); + if (c[i] != directive[i]) { - comment_line_end (0); + phase1_ungetc (c[i]); break; } - else - comment_add ('!'); } - else + if (i == directive_len) { - /* We skip all leading white space. */ - if (!(buflen == 0 && (c == ' ' || c == '\t'))) - comment_add (c); - if (c == '\n') + int e = phase1_getc (); + /* Like in read_token. */ + if (e == ' ' + || e == '\r' || e == '\f' || e == '\t' || e == '\n' + || e == '"' || e == '(' || e == ')' || e == ';') { - comment_line_end (1); - comment_start (); + /* Seen the directive. */ + phase1_ungetc (e); + seen_directive = true; + switch (d) + { + case 0: /* #!r6rs */ + follow_guile = false; + break; + case 1: /* #!fold-case */ + casefold = true; + break; + case 2: /* #!no-fold-case */ + casefold = false; + break; + case 3: /* #!curly-infix */ + case 4: /* #!curly-infix-and-bracket-lists */ + if_error (IF_SEVERITY_WARNING, + logical_file_name, line_number, (size_t)(-1), + false, + _("Unsupported Guile directive \"%s\"."), + directive); + break; + default: + abort (); + } + break; } - c = do_getc (); + phase1_ungetc (e); + } + while (i > 0) + { + i--; + phase1_ungetc (c[i]); } } - if (c == EOF) + if (!seen_directive) + /* Block comment '#! ... !#'. See + . */ { - /* EOF not allowed here. But be tolerant. */ - op->type = t_eof; - return; + int c; + + comment_start (); + c = phase1_getc (); + for (;;) + { + if (c == EOF) + break; + if (c == '!') + { + c = phase1_getc (); + if (c == EOF) + break; + if (c == '#') + { + comment_line_end (0); + break; + } + else + comment_add ('!'); + } + else + { + /* We skip all leading white space. */ + if (!(buflen == 0 && (c == ' ' || c == '\t'))) + comment_add (c); + if (c == '\n') + { + comment_line_end (1); + comment_start (); + } + c = phase1_getc (); + } + } + if (c == EOF) + { + /* EOF not allowed here. But be tolerant. */ + op->type = t_eof; + return; + } + last_comment_line = line_number; } - last_comment_line = line_number; continue; } @@ -1136,14 +1285,14 @@ read_object (struct object *op, flag_region_ty *outer_region) int c; comment_start (); - c = do_getc (); + c = phase1_getc (); for (;;) { if (c == EOF) break; if (c == '|') { - c = do_getc (); + c = phase1_getc (); if (c == EOF) break; if (c == '#') @@ -1156,14 +1305,14 @@ read_object (struct object *op, flag_region_ty *outer_region) depth--; comment_add ('|'); comment_add ('#'); - c = do_getc (); + c = phase1_getc (); } else comment_add ('|'); } else if (c == '#') { - c = do_getc (); + c = phase1_getc (); if (c == EOF) break; comment_add ('#'); @@ -1171,7 +1320,7 @@ read_object (struct object *op, flag_region_ty *outer_region) { depth++; comment_add ('|'); - c = do_getc (); + c = phase1_getc (); } } else @@ -1184,7 +1333,7 @@ read_object (struct object *op, flag_region_ty *outer_region) comment_line_end (1); comment_start (); } - c = do_getc (); + c = phase1_getc (); } } if (c == EOF) @@ -1219,23 +1368,23 @@ read_object (struct object *op, flag_region_ty *outer_region) for (;;) { - int c = do_getc (); + int c = phase1_getc (); if (c == EOF) break; if (c == '\\') { - c = do_getc (); + c = phase1_getc (); if (c == EOF) break; } else if (c == '}') { - c = do_getc (); + c = phase1_getc (); if (c == '#') break; if (c != EOF) - do_ungetc (c); + phase1_ungetc (c); c = '}'; } grow_token (op->token); @@ -1251,7 +1400,7 @@ read_object (struct object *op, flag_region_ty *outer_region) /* Character. */ { struct token token; - int c = do_getc (); + int c = phase1_getc (); if (c != EOF) { read_token (&token, c); @@ -1285,7 +1434,7 @@ read_object (struct object *op, flag_region_ty *outer_region) { int c; do - c = do_getc (); + c = phase1_getc (); while (c >= '0' && c <= '9'); /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}. But be tolerant. */ @@ -1321,7 +1470,7 @@ read_object (struct object *op, flag_region_ty *outer_region) /* GIMP script-fu extension: '_' before a string literal is considered a gettext call on the string. */ { - int c = do_getc (); + int c = phase1_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ { @@ -1330,7 +1479,7 @@ read_object (struct object *op, flag_region_ty *outer_region) } if (c != '"') { - do_ungetc (c); + phase1_ungetc (c); /* If '_' is not followed by a string literal, consider it a part of symbol. */ @@ -1351,7 +1500,7 @@ read_object (struct object *op, flag_region_ty *outer_region) op->line_number_at_start = line_number; for (;;) { - int c = do_getc (); + int c = phase1_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ break; @@ -1359,7 +1508,7 @@ read_object (struct object *op, flag_region_ty *outer_region) break; if (c == '\\') { - c = do_getc (); + c = phase1_getc (); if (c == EOF) /* Invalid input. Be tolerant, no error message. */ break; @@ -1464,6 +1613,10 @@ extract_whole_file (FILE *f, logical_file_name = xstrdup (logical_filename); line_number = 1; + phase1_pushback_length = 0; + + casefold = false; + last_comment_line = -1; last_non_comment_line = -1; diff --git a/gettext-tools/tests/Makefile.am b/gettext-tools/tests/Makefile.am index 5f71e3700..f98e192cd 100644 --- a/gettext-tools/tests/Makefile.am +++ b/gettext-tools/tests/Makefile.am @@ -153,7 +153,7 @@ TESTS = gettext-1 gettext-2 \ xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \ xgettext-ruby-1 \ xgettext-scheme-1 xgettext-scheme-2 xgettext-scheme-3 \ - xgettext-scheme-4 xgettext-scheme-5 \ + xgettext-scheme-4 xgettext-scheme-5 xgettext-scheme-6 \ xgettext-scheme-format-1 xgettext-scheme-format-2 \ xgettext-scheme-stackovfl-1 xgettext-scheme-stackovfl-2 \ xgettext-sh-1 xgettext-sh-2 xgettext-sh-3 xgettext-sh-4 xgettext-sh-5 \ diff --git a/gettext-tools/tests/xgettext-scheme-6 b/gettext-tools/tests/xgettext-scheme-6 new file mode 100755 index 000000000..1ada5053c --- /dev/null +++ b/gettext-tools/tests/xgettext-scheme-6 @@ -0,0 +1,49 @@ +#!/bin/sh +. "${srcdir=.}/init.sh"; path_prepend_ . ../src + +# Test Scheme support: #!fold-case and #!no-fold-case directives. + +cat <<\EOF > xg-sc-7.scm +(display (gettext "orange")) +(Display (GetText "mango")) +#!fold-case +(Display (GetText "apple")) +(DISPLAY (GETTEXT "banana")) +#!fold-case +(Display (GetText "pear")) +(FORMAT T (GETTEXT "Got ~D dollars" n)) +#!no-fold-case +(display (gettext "cherry")) +(Display (GetText "plum")) +(FORMAT T (GETTEXT "Got ~D euros" n)) +EOF + +: ${XGETTEXT=xgettext} +${XGETTEXT} --omit-header --no-location -d xg-sc-7 xg-sc-7.scm || Exit 1 + +cat <<\EOF > xg-sc-7.ok +msgid "orange" +msgstr "" + +msgid "apple" +msgstr "" + +msgid "banana" +msgstr "" + +msgid "pear" +msgstr "" + +#, scheme-format +msgid "Got ~D dollars" +msgstr "" + +msgid "cherry" +msgstr "" +EOF + +: ${DIFF=diff} +${DIFF} xg-sc-7.ok xg-sc-7.po +result=$? + +exit $result