xgettext: Scheme: #!fold-case and #!no-fold-case directives.

Reported by Florent Angly <florent.angly@gmail.com> at <https://savannah.gnu.org/bugs/?61987>. * autogen.sh (GNULIB_MODULES_TOOLS_FOR_SRC): Add unicase/u8-casefold, uninorm/nfc. * gettext-tools/src/x-scheme.c: Include unicase.h, uninorm.h. (SIZEOF): New macro. (phase0_getc): Renamed from do_getc. Remove line_number handling. (phase0_ungetc): Renamed from do_ungetc. Remove line_number handling. (MAX_PHASE1_PUSHBACK): New macro. (phase1_pushback, phase1_pushback_length): New variables. (phase1_getc, phase1_ungetc): New functions. (casefold): New variable. (read_token): Use phase1_getc, phase1_ungetc instead of do_getc, do_ungetc. (read_object): Likewise. If casefold is true, apply Unicode case-folding to symbols before looking them up in the hash table. Recognize all #! directives supported by Guile. (extract_whole_file): Initialize phase1_pushback_length, casefold. * gettext-tools/tests/xgettext-scheme-6: New file. * gettext-tools/tests/Makefile.am (TESTS): Add it.
2026-01-27 01:44:30 +00:00 · 2024-08-16 22:36:30 +02:00 · 2024-08-16 22:36:30 +02:00 · d5cd59cbc6
commit d5cd59cbc6
parent 2c8f439035
4 changed files with 272 additions and 68 deletions
--- a/autogen.sh
+++ b/autogen.sh
@ -250,10 +250,12 @@ if ! $skip_gnulib; then
    sys_stat
    sys_time
    trim
+    unicase/u8-casefold
    unictype/ctype-space
    unictype/syntax-java-whitespace
    unilbrk/ulc-width-linebreaks
    uniname/uniname
+    uninorm/nfc
    unistd
    unistr/u8-check
    unistr/u8-mbtouc
--- a/gettext-tools/src/x-scheme.c
+++ b/gettext-tools/src/x-scheme.c
@ -33,6 +33,8 @@
 #include "attribute.h"
 #include "message.h"
 #include "xgettext.h"
+#include "unicase.h"
+#include "uninorm.h"
 #include "xg-pos.h"
 #include "xg-mixed-string.h"
 #include "xg-arglist-context.h"
@ -46,6 +48,8 @@

 #define _(s) gettext(s)

+#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
+

 /* The Scheme syntax is described in R5RS and following standards:
     - R5RS: https://conservatory.scheme.org/schemers/Documents/Standards/R5RS/HTML/
@ -75,6 +79,13 @@
   - The syntax code assigned to each character, and how tokens are built
     up from characters (single escape, multiple escape etc.).

+   - Directives:
+       #!r6rs          (see R6RS § 4) turns R6RS compliance on
+       #!fold-case     (see R7RS § 2.1) turns case-folding of identifiers on
+       #!no-fold-case  (see R7RS § 2.1) turns case-folding of identifiers off
+       #!curly-infix                    (guile specific)
+       #!curly-infix-and-bracket-lists  (guile specific)
+
   - Comment syntax:
       ';' up to end of line
       '#;' <datum> (see R6RS § 4.2.3, R7RS § 2.2)
@ -173,7 +184,7 @@ static FILE *fp;

 /* Fetch the next character from the input file.  */
 static int
-do_getc ()
+phase0_getc ()
 {
  int c = getc (fp);

@ -183,22 +194,59 @@ do_getc ()
        error (EXIT_FAILURE, errno,
               _("error while reading \"%s\""), real_file_name);
    }
-  else if (c == '\n')
-   line_number++;

  return c;
 }

 /* Put back the last fetched character, not EOF.  */
-static void
-do_ungetc (int c)
+MAYBE_UNUSED static void
+phase0_ungetc (int c)
 {
-  if (c == '\n')
-    line_number--;
  ungetc (c, fp);
 }


+/* 1. line_number handling.  */
+
+/* Maximum used.
+   Must be larger than the longest possible directive.  */
+#define MAX_PHASE1_PUSHBACK 32
+static unsigned char phase1_pushback[MAX_PHASE1_PUSHBACK];
+static int phase1_pushback_length;
+
+/* Read the next single character from the input file.  */
+static int
+phase1_getc ()
+{
+  int c;
+
+  if (phase1_pushback_length)
+    c = phase1_pushback[--phase1_pushback_length];
+  else
+    c = phase0_getc ();
+
+  if (c == '\n')
+    ++line_number;
+
+  return c;
+}
+
+/* Supports MAX_PHASE1_PUSHBACK characters of pushback.  */
+static void
+phase1_ungetc (int c)
+{
+  if (c != EOF)
+    {
+      if (c == '\n')
+        --line_number;
+
+      if (phase1_pushback_length == SIZEOF (phase1_pushback))
+        abort ();
+      phase1_pushback[phase1_pushback_length++] = c;
+    }
+}
+
+
 /* ========================== Reading of tokens.  ========================== */


@ -206,6 +254,10 @@ do_ungetc (int c)
   False to follow R6RS and R7RS.  */
 static bool follow_guile;

+/* True if all read identifiers are to be casefolded, i.e. essentially mapped
+   to lower case.  */
+static bool casefold;
+
 /* A token consists of a sequence of characters.  */
 struct token
 {
@ -253,14 +305,14 @@ read_token (struct token *tp, int first)

  for (;;)
    {
-      int c = do_getc ();
+      int c = phase1_getc ();

      if (c == EOF)
        break;
      if (c == ' ' || c == '\r' || c == '\f' || c == '\t' || c == '\n'
          || c == '"' || c == '(' || c == ')' || c == ';')
        {
-          do_ungetc (c);
+          phase1_ungetc (c);
          break;
        }
      grow_token (tp);
@ -708,7 +760,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
              _("too deeply nested objects"));
  for (;;)
    {
-      int ch = do_getc ();
+      int ch = phase1_getc ();
      bool seen_underscore_prefix = false;

      switch (ch)
@ -736,7 +788,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
            comment_start ();
            for (;;)
              {
-                int c = do_getc ();
+                int c = phase1_getc ();
                if (c == EOF || c == '\n')
                  break;
                if (c != ';')
@ -803,6 +855,29 @@ read_object (struct object *op, flag_region_ty *outer_region)
                        if (inner.type == t_symbol)
                          {
                            char *symbol_name = string_of_object (&inner);
+                            if (casefold)
+                              {
+                                char *symbol_name_converted =
+                                  from_current_source_encoding (symbol_name,
+                                                                lc_outside,
+                                                                logical_file_name,
+                                                                line_number);
+                                size_t symbol_name_casefolded_len;
+                                char *symbol_name_casefolded =
+                                  (char *)
+                                  u8_casefold ((uint8_t *) symbol_name_converted,
+                                               strlen (symbol_name_converted) + 1,
+                                               NULL, UNINORM_NFC,
+                                               NULL, &symbol_name_casefolded_len);
+                                if (symbol_name_converted != symbol_name)
+                                  free (symbol_name_converted);
+                                if (symbol_name_casefolded != NULL)
+                                  {
+                                    free (symbol_name);
+                                    symbol_name = symbol_name_casefolded;
+                                  }
+                              }
+
                            void *keyword_value;

                            if (hash_find_entry (&keywords,
@ -864,11 +939,11 @@ read_object (struct object *op, flag_region_ty *outer_region)

        case ',':
          {
-            int c = do_getc ();
+            int c = phase1_getc ();
            /* The ,@ handling inside lists is wrong anyway, because
               ,@form expands to an unknown number of elements.  */
            if (c != EOF && c != '@')
-              do_ungetc (c);
+              phase1_ungetc (c);
          }
          FALLTHROUGH;
        case '\'':
@ -892,7 +967,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
        case '#':
          /* Dispatch macro handling.  */
          {
-            int dmc = do_getc ();
+            int dmc = phase1_getc ();
            if (dmc == EOF)
              /* Invalid input.  Be tolerant, no error message.  */
              {
@ -903,7 +978,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
            switch (dmc)
              {
              case '(': /* Vector */
-                do_ungetc (dmc);
+                phase1_ungetc (dmc);
                {
                  struct object inner;
                  ++nesting_depth;
@ -934,7 +1009,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
              case 'y':
                {
                  struct token token;
-                  do_ungetc (dmc);
+                  phase1_ungetc (dmc);
                  read_token (&token, '#');
                  if ((token.charcount == 2
                       && (token.chars[1] == 'a' || token.chars[1] == 'c'
@ -960,9 +1035,9 @@ read_object (struct object *op, flag_region_ty *outer_region)
                                  && token.chars[2] == 'u'
                                  && token.chars[3] == '8'))))
                    {
-                      int c = do_getc ();
+                      int c = phase1_getc ();
                      if (c != EOF)
-                        do_ungetc (c);
+                        phase1_ungetc (c);
                      if (c == '(')
                        {
                          /* Homogenous vector syntax:
@ -1015,7 +1090,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
              case 'I': case 'i':
                {
                  struct token token;
-                  do_ungetc (dmc);
+                  phase1_ungetc (dmc);
                  read_token (&token, '#');
                  if (is_number (&token))
                    {
@ -1030,9 +1105,9 @@ read_object (struct object *op, flag_region_ty *outer_region)
                      if (token.charcount == 2
                          && (token.chars[1] == 'e' || token.chars[1] == 'i'))
                        {
-                          int c = do_getc ();
+                          int c = phase1_getc ();
                          if (c != EOF)
-                            do_ungetc (c);
+                            phase1_ungetc (c);
                          if (c == '(')
                            {
                              /* Homogenous vector syntax:
@ -1080,50 +1155,124 @@ read_object (struct object *op, flag_region_ty *outer_region)
                }

              case '!':
-                /* Block comment '#! ... !#'.  See
-                   <https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>.  */
+                /* Directive or block comment.  */
                {
-                  int c;
-
-                  comment_start ();
-                  c = do_getc ();
-                  for (;;)
+                  const char * const directives[] =
                    {
-                      if (c == EOF)
-                        break;
-                      if (c == '!')
+                      "r6rs",
+                      "fold-case",
+                      "no-fold-case",
+                      "curly-infix",
+                      "curly-infix-and-bracket-lists"
+                    };
+                  int num_directives = SIZEOF (directives);
+                  enum { max_directive_len = 29 };
+                  bool seen_directive = false;
+                  int d;
+                  for (d = 0; d < num_directives; d++)
+                    {
+                      const char *directive = directives[d];
+                      int directive_len = strlen (directive);
+                      int c[max_directive_len];
+                      int i;
+                      for (i = 0; i < directive_len; i++)
                        {
-                          c = do_getc ();
-                          if (c == EOF)
-                            break;
-                          if (c == '#')
+                          c[i] = phase1_getc ();
+                          if (c[i] != directive[i])
                            {
-                              comment_line_end (0);
+                              phase1_ungetc (c[i]);
                              break;
                            }
-                          else
-                            comment_add ('!');
                        }
-                      else
+                      if (i == directive_len)
                        {
-                          /* We skip all leading white space.  */
-                          if (!(buflen == 0 && (c == ' ' || c == '\t')))
-                            comment_add (c);
-                          if (c == '\n')
+                          int e = phase1_getc ();
+                          /* Like in read_token.  */
+                          if (e == ' '
+                              || e == '\r' || e == '\f' || e == '\t' || e == '\n'
+                              || e == '"' || e == '(' || e == ')' || e == ';')
                            {
-                              comment_line_end (1);
-                              comment_start ();
+                              /* Seen the directive.  */
+                              phase1_ungetc (e);
+                              seen_directive = true;
+                              switch (d)
+                                {
+                                case 0: /* #!r6rs */
+                                  follow_guile = false;
+                                  break;
+                                case 1: /* #!fold-case */
+                                  casefold = true;
+                                  break;
+                                case 2: /* #!no-fold-case */
+                                  casefold = false;
+                                  break;
+                                case 3: /* #!curly-infix */
+                                case 4: /* #!curly-infix-and-bracket-lists */
+                                  if_error (IF_SEVERITY_WARNING,
+                                            logical_file_name, line_number, (size_t)(-1),
+                                            false,
+                                            _("Unsupported Guile directive \"%s\"."),
+                                            directive);
+                                  break;
+                                default:
+                                  abort ();
+                                }
+                              break;
                            }
-                          c = do_getc ();
+                          phase1_ungetc (e);
+                        }
+                      while (i > 0)
+                        {
+                          i--;
+                          phase1_ungetc (c[i]);
                        }
                    }
-                  if (c == EOF)
+                  if (!seen_directive)
+                    /* Block comment '#! ... !#'.  See
+                       <https://www.gnu.org/software/guile/manual/html_node/Block-Comments.html>.  */
                    {
-                      /* EOF not allowed here.  But be tolerant.  */
-                      op->type = t_eof;
-                      return;
+                      int c;
+
+                      comment_start ();
+                      c = phase1_getc ();
+                      for (;;)
+                        {
+                          if (c == EOF)
+                            break;
+                          if (c == '!')
+                            {
+                              c = phase1_getc ();
+                              if (c == EOF)
+                                break;
+                              if (c == '#')
+                                {
+                                  comment_line_end (0);
+                                  break;
+                                }
+                              else
+                                comment_add ('!');
+                            }
+                          else
+                            {
+                              /* We skip all leading white space.  */
+                              if (!(buflen == 0 && (c == ' ' || c == '\t')))
+                                comment_add (c);
+                              if (c == '\n')
+                                {
+                                  comment_line_end (1);
+                                  comment_start ();
+                                }
+                              c = phase1_getc ();
+                            }
+                        }
+                      if (c == EOF)
+                        {
+                          /* EOF not allowed here.  But be tolerant.  */
+                          op->type = t_eof;
+                          return;
+                        }
+                      last_comment_line = line_number;
                    }
-                  last_comment_line = line_number;
                  continue;
                }

@ -1136,14 +1285,14 @@ read_object (struct object *op, flag_region_ty *outer_region)
                  int c;

                  comment_start ();
-                  c = do_getc ();
+                  c = phase1_getc ();
                  for (;;)
                    {
                      if (c == EOF)
                        break;
                      if (c == '|')
                        {
-                          c = do_getc ();
+                          c = phase1_getc ();
                          if (c == EOF)
                            break;
                          if (c == '#')
@ -1156,14 +1305,14 @@ read_object (struct object *op, flag_region_ty *outer_region)
                              depth--;
                              comment_add ('|');
                              comment_add ('#');
-                              c = do_getc ();
+                              c = phase1_getc ();
                            }
                          else
                            comment_add ('|');
                        }
                      else if (c == '#')
                        {
-                          c = do_getc ();
+                          c = phase1_getc ();
                          if (c == EOF)
                            break;
                          comment_add ('#');
@ -1171,7 +1320,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
                            {
                              depth++;
                              comment_add ('|');
-                              c = do_getc ();
+                              c = phase1_getc ();
                            }
                        }
                      else
@ -1184,7 +1333,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
                              comment_line_end (1);
                              comment_start ();
                            }
-                          c = do_getc ();
+                          c = phase1_getc ();
                        }
                    }
                  if (c == EOF)
@ -1219,23 +1368,23 @@ read_object (struct object *op, flag_region_ty *outer_region)

                  for (;;)
                    {
-                      int c = do_getc ();
+                      int c = phase1_getc ();

                      if (c == EOF)
                        break;
                      if (c == '\\')
                        {
-                          c = do_getc ();
+                          c = phase1_getc ();
                          if (c == EOF)
                            break;
                        }
                      else if (c == '}')
                        {
-                          c = do_getc ();
+                          c = phase1_getc ();
                          if (c == '#')
                            break;
                          if (c != EOF)
-                            do_ungetc (c);
+                            phase1_ungetc (c);
                          c = '}';
                        }
                      grow_token (op->token);
@ -1251,7 +1400,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
                /* Character.  */
                {
                  struct token token;
-                  int c = do_getc ();
+                  int c = phase1_getc ();
                  if (c != EOF)
                    {
                      read_token (&token, c);
@ -1285,7 +1434,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
                {
                  int c;
                  do
-                    c = do_getc ();
+                    c = phase1_getc ();
                  while (c >= '0' && c <= '9');
                  /* c should be one of {'a'|'b'|'c'|'e'|'i'|'s'|'u'}.
                     But be tolerant.  */
@ -1321,7 +1470,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
          /* GIMP script-fu extension: '_' before a string literal is
             considered a gettext call on the string.  */
          {
-            int c = do_getc ();
+            int c = phase1_getc ();
            if (c == EOF)
              /* Invalid input.  Be tolerant, no error message.  */
              {
@ -1330,7 +1479,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
              }
            if (c != '"')
              {
-                do_ungetc (c);
+                phase1_ungetc (c);

                /* If '_' is not followed by a string literal,
                   consider it a part of symbol.  */
@ -1351,7 +1500,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
            op->line_number_at_start = line_number;
            for (;;)
              {
-                int c = do_getc ();
+                int c = phase1_getc ();
                if (c == EOF)
                  /* Invalid input.  Be tolerant, no error message.  */
                  break;
@ -1359,7 +1508,7 @@ read_object (struct object *op, flag_region_ty *outer_region)
                  break;
                if (c == '\\')
                  {
-                    c = do_getc ();
+                    c = phase1_getc ();
                    if (c == EOF)
                      /* Invalid input.  Be tolerant, no error message.  */
                      break;
@ -1464,6 +1613,10 @@ extract_whole_file (FILE *f,
  logical_file_name = xstrdup (logical_filename);
  line_number = 1;

+  phase1_pushback_length = 0;
+
+  casefold = false;
+
  last_comment_line = -1;
  last_non_comment_line = -1;

--- a/gettext-tools/tests/Makefile.am
+++ b/gettext-tools/tests/Makefile.am
@ -153,7 +153,7 @@ TESTS = gettext-1 gettext-2 \
 	xgettext-python-stackovfl-3 xgettext-python-stackovfl-4 \
 	xgettext-ruby-1 \
 	xgettext-scheme-1 xgettext-scheme-2 xgettext-scheme-3 \
-	xgettext-scheme-4 xgettext-scheme-5 \
+	xgettext-scheme-4 xgettext-scheme-5 xgettext-scheme-6 \
 	xgettext-scheme-format-1 xgettext-scheme-format-2 \
 	xgettext-scheme-stackovfl-1 xgettext-scheme-stackovfl-2 \
 	xgettext-sh-1 xgettext-sh-2 xgettext-sh-3 xgettext-sh-4 xgettext-sh-5 \
--- a/gettext-tools/tests/xgettext-scheme-6
+++ b/gettext-tools/tests/xgettext-scheme-6
@ -0,0 +1,49 @@
+#!/bin/sh
+. "${srcdir=.}/init.sh"; path_prepend_ . ../src
+
+# Test Scheme support: #!fold-case and #!no-fold-case directives.
+
+cat <<\EOF > xg-sc-7.scm
+(display (gettext "orange"))
+(Display (GetText "mango"))
+#!fold-case
+(Display (GetText "apple"))
+(DISPLAY (GETTEXT "banana"))
+#!fold-case
+(Display (GetText "pear"))
+(FORMAT T (GETTEXT "Got ~D dollars" n))
+#!no-fold-case
+(display (gettext "cherry"))
+(Display (GetText "plum"))
+(FORMAT T (GETTEXT "Got ~D euros" n))
+EOF
+
+: ${XGETTEXT=xgettext}
+${XGETTEXT} --omit-header --no-location -d xg-sc-7 xg-sc-7.scm || Exit 1
+
+cat <<\EOF > xg-sc-7.ok
+msgid "orange"
+msgstr ""
+
+msgid "apple"
+msgstr ""
+
+msgid "banana"
+msgstr ""
+
+msgid "pear"
+msgstr ""
+
+#, scheme-format
+msgid "Got ~D dollars"
+msgstr ""
+
+msgid "cherry"
+msgstr ""
+EOF
+
+: ${DIFF=diff}
+${DIFF} xg-sc-7.ok xg-sc-7.po
+result=$?
+
+exit $result