cldr-plurals: Fix parsing of plurals.xml from CLDR 38 or newer.

Reported by Michele Locati at <https://savannah.gnu.org/bugs/?66378>.

* gettext-tools/src/cldr-plural-exp.c (eval_relation): Accept operands 'c', 'e'.
* gettext-tools/src/cldr-plural.y (yylex): Likewise.
* gettext-tools/src/cldr-plurals.c (extract_rules): Test for type='cardinal'.
(get_XcY_end, force_spaces, remove_XcY, remove_empty_examples): New functions.
(main): Invoke force_spaces, remove_XcY, remove_empty_examples.
* gettext-tools/tests/cldr-plurals-1: Add more tests.
* AUTHORS: Add Michele Locati.
2026-01-25 23:27:58 +00:00 · 2026-01-13 11:23:14 +01:00 · 2026-01-13 11:23:14 +01:00 · 77441e0283
commit 77441e0283
parent c558619bb3
5 changed files with 205 additions and 18 deletions
--- a/3
+++ b/3
@ -91,3 +91,6 @@ Assigns past and future changes.
 GETTEXT         John Darrington
 Assigns past and future changes.
 (No contributions so far.)
+
+GETTEXT         Michele Locati
+Assigns past and future changes.
--- a/gettext-tools/src/cldr-plural-exp.c
+++ b/gettext-tools/src/cldr-plural-exp.c
@ -1,5 +1,5 @@
 /* Unicode CLDR plural rule parser and converter.
-   Copyright (C) 2015-2025 Free Software Foundation, Inc.
+   Copyright (C) 2015-2026 Free Software Foundation, Inc.

   This file was written by Daiki Ueno <ueno@gnu.org>, 2015.

@ -133,10 +133,11 @@ eval_relation (struct cldr_plural_relation_ty *relation)
      break;
    case 'f': case 't':
    case 'v': case 'w':
+    case 'c': case 'e':
      {
        /* Since plural expression in gettext only supports unsigned
           integer, turn relations whose operand is either 'f', 't',
-           'v', or 'w' into a constant truth value.  */
+           'v', 'w', 'c', or 'e' into a constant truth value.  */
        /* FIXME: check mod?  */
        for (size_t i = 0; i < relation->ranges->nitems; i++)
          {
--- a/gettext-tools/src/cldr-plural.y
+++ b/gettext-tools/src/cldr-plural.y
@ -1,5 +1,5 @@
 /* Unicode CLDR plural rule parser and converter.
-   Copyright (C) 2015-2025 Free Software Foundation, Inc.
+   Copyright (C) 2015-2026 Free Software Foundation, Inc.

   This file was written by Daiki Ueno <ueno@gnu.org>, 2015.

@ -428,7 +428,8 @@ yylex (YYSTYPE *lval, struct cldr_plural_parse_args *arg)
          {
            switch (ident[0])
              {
-              case 'n': case 'i': case 'f': case 't': case 'v': case 'w':
+              // See https://unicode.org/reports/tr35/tr35-numbers.html#table-plural-operand-meanings
+              case 'n': case 'i': case 'f': case 't': case 'v': case 'w': case 'c': case 'e':
                arg->cp = exp;
                lval->ival = ident[0];
                sb_free (&buffer);
--- a/gettext-tools/src/cldr-plurals.c
+++ b/gettext-tools/src/cldr-plurals.c
@ -1,5 +1,5 @@
 /* Unicode CLDR plural rule parser and converter
-   Copyright (C) 2015-2025 Free Software Foundation, Inc.
+   Copyright (C) 2015-2026 Free Software Foundation, Inc.

   This file was written by Daiki Ueno <ueno@gnu.org>, 2015.

@ -39,6 +39,11 @@
 #define _(s) gettext(s)


+/**
+ * Extract the rules from a CLDR plurals.xml file
+ * @return NULL in case of errors, the CLDR rules otherwise
+ * @example "one: i = 1 and v = 0 @integer 1; other: @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, \u2026 @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, \u2026"
+ */
 static char *
 extract_rules (FILE *fp,
               const char *real_filename, const char *logical_filename,
@ -71,8 +76,15 @@ extract_rules (FILE *fp,
    for (n = node->children; n; n = n->next)
      {
        if (n->type == XML_ELEMENT_NODE
-            && xmlStrEqual (n->name, BAD_CAST "plurals"))
-          break;
+            && xmlStrEqual (n->name, BAD_CAST "plurals")
+            && xmlHasProp (n, BAD_CAST "type"))
+          {
+            xmlChar *type = xmlGetProp (n, BAD_CAST "type");
+            bool is_cardinal = xmlStrEqual (type, BAD_CAST "cardinal") != 0;
+            xmlFree (type);
+            if (is_cardinal)
+               break;
+          }
      }
    if (!n)
      {
@ -156,6 +168,157 @@ extract_rules (FILE *fp,
  return sb_xdupfree_c (&buffer);
 }

+/**
+ * Find the position after a sample value in format XcY (eg "1c9" or
+ * "1.0001c3").
+ * @param str the possible starting position of the XcY string
+ * @return NULL if str does not start with an XcY string; otherwise the
+ *         position in str after the XcY string, after any spaces that
+ *         follow it, and after one optional comma together with the
+ *         spaces that follow that comma
+ */
+static const char *
+get_XcY_end (const char *str)
+{
+  const char *cursor = str;
+  bool seen_c = false;
+
+  /* An XcY string must start with a digit.  */
+  if (!(cursor[0] >= '0' && cursor[0] <= '9'))
+    return NULL;
+
+  /* Scan the run of digits and dots, allowing exactly one 'c' that is
+     immediately followed by a digit.  */
+  for (cursor++; cursor[0] != '\0'; cursor++)
+    {
+      char ch = cursor[0];
+      if (ch == 'c')
+        {
+          if (seen_c || !(cursor[1] >= '0' && cursor[1] <= '9'))
+            return NULL;
+          seen_c = true;
+        }
+      else if (!((ch >= '0' && ch <= '9') || ch == '.'))
+        break;
+    }
+
+  /* A plain number without a 'c' is not an XcY string.  */
+  if (!seen_c)
+    return NULL;
+
+  /* Skip the separator: spaces, then at most one comma and more spaces.  */
+  while (cursor[0] == ' ')
+    cursor++;
+  if (cursor[0] == ',')
+    for (cursor++; cursor[0] == ' '; cursor++)
+      ;
+  return cursor;
+}
+
+/* Replace, in place, every character of INPUT that c_isspace classifies
+   as whitespace with a plain space character.  */
+static void
+force_spaces (char *input)
+{
+  for (char *cursor = input; *cursor != '\0'; cursor++)
+    if (c_isspace (*cursor))
+      *cursor = ' ';
+}
+
+/* Append to BUFFER a comma followed by NUM_SPACES spaces.  */
+static void
+append_comma_and_spaces (struct string_buffer *buffer, int num_spaces)
+{
+  sb_append1 (buffer, ',');
+  for (; num_spaces > 0; num_spaces--)
+    sb_append1 (buffer, ' ');
+}
+
+/**
+ * Remove the sample values in format XcY (eg "1c9") from the sample lists
+ * that follow "@integer " and "@decimal " in the rules.
+ * @param input the rules; only plain ' ' characters are treated as spaces
+ * @return the result of sb_dupfree_c on the filtered text: a copy of input
+ *         without the XcY sample values (the caller checks it for NULL)
+ * @example "@integer 100, 1c3, 2c3, \u2026" becomes "@integer 100, \u2026"
+ */
+static char *
+remove_XcY (const char *input)
+{
+  const char *p = input;
+  struct string_buffer buffer;
+  sb_init (&buffer);
+  for (;;)
+    {
+      /* -1 means that no separator is pending.  A value >= 0 means that a
+         comma followed by that many spaces has been read from the input
+         but not yet written to the buffer.  */
+      int pending_spaces = -1;
+      const char *next_integer = strstr (p, "@integer ");
+      const char *next_decimal = strstr (p, "@decimal ");
+      const char *samples;
+      if (next_integer == NULL && next_decimal == NULL)
+        {
+          /* No more sample lists: copy the remainder verbatim.  */
+          sb_append_c (&buffer, p);
+          break;
+        }
+      /* Continue with whichever keyword comes first.  */
+      if (next_integer != NULL
+          && (next_decimal == NULL || next_integer < next_decimal))
+        samples = next_integer + strlen ("@integer ");
+      else
+        samples = next_decimal + strlen ("@decimal ");
+      /* Copy everything up to and including the keyword, and the spaces
+         that follow it.  */
+      while (p < samples)
+        sb_append1 (&buffer, *p++);
+      while (p[0] == ' ')
+        sb_append1 (&buffer, *p++);
+      /* Walk through the comma-separated sample values.  */
+      for (;;)
+        {
+          const char *XcY_end;
+          if (p[0] < '0' || p[0] > '9')
+            break;
+          XcY_end = get_XcY_end (p);
+          if (XcY_end != NULL)
+            {
+              /* Drop the XcY value together with its trailing separator.
+                 A separator that is already pending stays pending for the
+                 next value that is kept.  */
+              p = XcY_end;
+              continue;
+            }
+          if (pending_spaces >= 0)
+            {
+              append_comma_and_spaces (&buffer, pending_spaces);
+              pending_spaces = -1;
+            }
+          while ((p[0] >= '0' && p[0] <= '9') || p[0] == '.' || p[0] == '~')
+            sb_append1 (&buffer, *p++);
+          if (p[0] != ',')
+            break;
+          /* Remember the separator instead of writing it, because the
+             value after it may turn out to be an XcY value.  */
+          p++;
+          pending_spaces = 0;
+          for (; p[0] == ' '; p++)
+            pending_spaces++;
+        }
+      /* Write the pending separator if the list ends with an ellipsis
+         (U+2026 in UTF-8, or "...").  This also covers a comma that was
+         followed by no space at all.  */
+      if (pending_spaces >= 0
+          && ((p[0] == '\xE2' && p[1] == '\x80' && p[2] == '\xA6')
+              || (p[0] == '.' && p[1] == '.' && p[2] == '.')))
+        append_comma_and_spaces (&buffer, pending_spaces);
+    }
+  return sb_dupfree_c (&buffer);
+}
+
+/**
+ * Remove, in place, the sample lists that consist only of an ellipsis:
+ * " @integer \u2026", " @decimal \u2026" and their "..." spellings.
+ * The spaces that follow a removed list are removed as well, except that
+ * one space is kept in front of a following '@'.  This keeps the next
+ * sample list separated from the preceding text, so that it can in turn
+ * be recognized as empty and removed.
+ * @param input the rules, modified in place
+ */
+static void
+remove_empty_examples (char *input)
+{
+  static const char *const prefixes[] =
+    {
+      " @integer \xE2\x80\xA6", " @integer ...",
+      " @decimal \xE2\x80\xA6", " @decimal ..."
+    };
+  const size_t num_prefixes = sizeof (prefixes) / sizeof (prefixes[0]);
+
+  for (size_t i = 0; i < num_prefixes; i++)
+    {
+      const char *prefix = prefixes[i];
+      size_t prefix_length = strlen (prefix);
+      char *p = input;
+      while ((p = strstr (p, prefix)) != NULL)
+        {
+          const char *rest = p + prefix_length;
+          const char *keep = rest;
+          while (keep[0] == ' ')
+            keep++;
+          /* Do not glue a following "@..." keyword to the previous token.  */
+          if (keep[0] == '@' && keep > rest)
+            keep--;
+          /* One move closes the gap; the regions may overlap.  */
+          memmove (p, keep, strlen (keep) + 1);
+        }
+    }
+}
+
 /* Display usage information and exit.  */
 static void
 usage (int status)
@ -306,6 +469,16 @@ There is NO WARRANTY, to the extent permitted by law.\n\
        printf ("%s\n", extracted_rules);
      else
        {
+          force_spaces (extracted_rules);
+          {
+            char *tmp = remove_XcY (extracted_rules);
+            if (tmp != NULL)
+              {
+                free (extracted_rules);
+                extracted_rules = tmp;
+                remove_empty_examples (extracted_rules);
+              }
+          }
          struct cldr_plural_rule_list_ty *result =
            cldr_plural_parse (extracted_rules);
          if (result == NULL)
--- a/gettext-tools/tests/cldr-plurals-1
+++ b/gettext-tools/tests/cldr-plurals-1
@ -3,7 +3,8 @@

 : ${DIFF=diff}

-# Test conversion from CLDR to gettext, for Arabic and Russian
+
+# Test if we have XML support

 LC_ALL=C "$top_builddir/src/cldr-plurals" ru /dev/null 2>&1 | grep 'extraction is not supported' > /dev/null 2>&1
 test $? = 0 && {
@ -11,6 +12,9 @@ test $? = 0 && {
  Exit 77
 }

+
+# Test conversion from CLDR to gettext, for Arabic
+
 cat > ar.ok <<\EOF
 nplurals=6; plural=(n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5);
 EOF
@ -22,6 +26,9 @@ LC_ALL=C tr -d '\r' < ar.tmp > ar.out || Exit 1

 ${DIFF} ar.ok ar.out || Exit 1

+
+# Test conversion from CLDR to gettext, for Russian
+
 cat > ru.ok <<\EOF
 nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<12 || n%100>14) ? 1 : 2);
 EOF
@ -33,31 +40,33 @@ LC_ALL=C tr -d '\r' < ru.tmp > ru.out || Exit 1

 ${DIFF} ru.ok ru.out || Exit 1

+
 # Test extraction from CLDR

 cat > foo.in <<\EOF
 <supplementalData>
  <plurals type="cardinal">
    <pluralRules locales="foo">
-      <pluralRule count="one">i = 1 and v = 0 @integer 1</pluralRule>
-      <pluralRule count="other"> @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, … @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, …</pluralRule>
+      <pluralRule count="one">i = 0,1 @integer 0, 1 @decimal 0.0~1.5</pluralRule>
+      <pluralRule count="many">e = 0 and i != 0 and i % 1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, 1c6, 2c6, 3c6, 4c6, 5c6, 6c6, … @decimal 1.0000001c6, 1.1c6, 2.0000001c6, 2.1c6, 3.0000001c6, 3.1c6, …</pluralRule>
+      <pluralRule count="other"> @integer 2~17, 100, 1000, 10000, 100000, 1c3, 2c3, 3c3, 4c3, 5c3, 6c3, … @decimal 2.0~3.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 1.0001c3, 1.1c3, 2.0001c3, 2.1c3, 3.0001c3, 3.1c3, …</pluralRule>
    </pluralRules>
  </plurals>
 </supplementalData>
 EOF

-"$top_builddir/src/cldr-plurals" foo foo.in > foo.tmp
-cat > foo.ok <<\EOF
-nplurals=2; plural=(n != 1);
-EOF
-LC_ALL=C tr -d '\r' < foo.tmp > foo.out || Exit 1
-${DIFF} foo.ok foo.out || Exit 1
-
 "$top_builddir/src/cldr-plurals" -c foo foo.in > foo.cldr.tmp
 cat > foo.cldr.ok <<\EOF
-one: i = 1 and v = 0 @integer 1; other:  @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, … @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, …
+one: i = 0,1 @integer 0, 1 @decimal 0.0~1.5; many: e = 0 and i != 0 and i % 1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, 1c6, 2c6, 3c6, 4c6, 5c6, 6c6, … @decimal 1.0000001c6, 1.1c6, 2.0000001c6, 2.1c6, 3.0000001c6, 3.1c6, …; other:  @integer 2~17, 100, 1000, 10000, 100000, 1c3, 2c3, 3c3, 4c3, 5c3, 6c3, … @decimal 2.0~3.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 1.0001c3, 1.1c3, 2.0001c3, 2.1c3, 3.0001c3, 3.1c3, …
 EOF
 LC_ALL=C tr -d '\r' < foo.cldr.tmp > foo.cldr.out || Exit 1
 ${DIFF} foo.cldr.ok foo.cldr.out || Exit 1

+"$top_builddir/src/cldr-plurals" foo foo.in > foo.tmp
+cat > foo.ok <<\EOF
+nplurals=3; plural=(n==0 || n==1 ? 0 : n!=0 && n%1000000==0 ? 1 : 2);
+EOF
+LC_ALL=C tr -d '\r' < foo.tmp > foo.out || Exit 1
+${DIFF} foo.ok foo.out || Exit 1
+
 Exit 0