cldr-plurals: Fix parsing of plurals.xml from CLDR 38 or newer.

Reported by Michele Locati at <https://savannah.gnu.org/bugs/?66378>.

* gettext-tools/src/cldr-plural-exp.c (eval_relation): Accept operands 'c', 'e'.
* gettext-tools/src/cldr-plural.y (yylex): Likewise.
* gettext-tools/src/cldr-plurals.c (extract_rules): Test for type='cardinal'.
(get_XcY_end, force_spaces, remove_XcY, remove_empty_examples): New functions.
(main): Invoke force_spaces, remove_XcY, remove_empty_examples.
* gettext-tools/tests/cldr-plurals-1: Add more tests.
* AUTHORS: Add Michele Locati.
This commit is contained in:
Michele Locati 2026-01-13 11:23:14 +01:00 committed by Bruno Haible
parent c558619bb3
commit 77441e0283
5 changed files with 205 additions and 18 deletions

View File

@ -91,3 +91,6 @@ Assigns past and future changes.
GETTEXT John Darrington
Assigns past and future changes.
(No contributions so far.)
GETTEXT Michele Locati
Assigns past and future changes.

View File

@ -1,5 +1,5 @@
/* Unicode CLDR plural rule parser and converter.
Copyright (C) 2015-2025 Free Software Foundation, Inc.
Copyright (C) 2015-2026 Free Software Foundation, Inc.
This file was written by Daiki Ueno <ueno@gnu.org>, 2015.
@ -133,10 +133,11 @@ eval_relation (struct cldr_plural_relation_ty *relation)
break;
case 'f': case 't':
case 'v': case 'w':
case 'c': case 'e':
{
/* Since plural expression in gettext only supports unsigned
integer, turn relations whose operand is either 'f', 't',
'v', or 'w' into a constant truth value. */
'v', 'w', 'c', or 'e' into a constant truth value. */
/* FIXME: check mod? */
for (size_t i = 0; i < relation->ranges->nitems; i++)
{

View File

@ -1,5 +1,5 @@
/* Unicode CLDR plural rule parser and converter.
Copyright (C) 2015-2025 Free Software Foundation, Inc.
Copyright (C) 2015-2026 Free Software Foundation, Inc.
This file was written by Daiki Ueno <ueno@gnu.org>, 2015.
@ -428,7 +428,8 @@ yylex (YYSTYPE *lval, struct cldr_plural_parse_args *arg)
{
switch (ident[0])
{
case 'n': case 'i': case 'f': case 't': case 'v': case 'w':
// See https://unicode.org/reports/tr35/tr35-numbers.html#table-plural-operand-meanings
case 'n': case 'i': case 'f': case 't': case 'v': case 'w': case 'c': case 'e':
arg->cp = exp;
lval->ival = ident[0];
sb_free (&buffer);

View File

@ -1,5 +1,5 @@
/* Unicode CLDR plural rule parser and converter
Copyright (C) 2015-2025 Free Software Foundation, Inc.
Copyright (C) 2015-2026 Free Software Foundation, Inc.
This file was written by Daiki Ueno <ueno@gnu.org>, 2015.
@ -39,6 +39,11 @@
#define _(s) gettext(s)
/**
* Extract the rules from a CLDR plurals.xml file
* @return NULL in case of errors, the CLDR rules otherwise
* @example "one: i = 1 and v = 0 @integer 1; other: @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, \u2026 @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, \u2026"
*/
static char *
extract_rules (FILE *fp,
const char *real_filename, const char *logical_filename,
@ -71,8 +76,15 @@ extract_rules (FILE *fp,
for (n = node->children; n; n = n->next)
{
if (n->type == XML_ELEMENT_NODE
&& xmlStrEqual (n->name, BAD_CAST "plurals"))
break;
&& xmlStrEqual (n->name, BAD_CAST "plurals")
&& xmlHasProp (n, BAD_CAST "type"))
{
xmlChar *type = xmlGetProp (n, BAD_CAST "type");
bool is_cardinal = xmlStrEqual (type, BAD_CAST "cardinal") != 0;
xmlFree (type);
if (is_cardinal)
break;
}
}
if (!n)
{
@ -156,6 +168,157 @@ extract_rules (FILE *fp,
return sb_xdupfree_c (&buffer);
}
/**
 * Recognize a sample value in compact-exponent notation XcY (eg "1c9",
 * "1.0000001c6") at the beginning of a string
 * @param str the position where the XcY value may start
 * @return NULL if str does not start with an XcY value; otherwise the
 *         position that follows the value, the spaces after it and, if
 *         present, one comma together with the spaces after that comma
 */
static const char *
get_XcY_end (const char *str)
{
  const char *cursor = str;
  bool seen_c = false;

  /* The value has to begin with a digit.  */
  if (!(cursor[0] >= '0' && cursor[0] <= '9'))
    return NULL;

  /* Walk over the digits, the dots and the single 'c' of the value.  */
  for (cursor++; cursor[0] != '\0'; cursor++)
    {
      if (cursor[0] == 'c')
        {
          /* A second 'c', or a 'c' without a digit right after it, means
             that this is not an XcY value.  */
          if (seen_c || !(cursor[1] >= '0' && cursor[1] <= '9'))
            return NULL;
          seen_c = true;
        }
      else if (cursor[0] != '.' && !(cursor[0] >= '0' && cursor[0] <= '9'))
        break;
    }

  /* A plain number such as "100" or "1.5" is not an XcY value.  */
  if (!seen_c)
    return NULL;

  /* Swallow the separator that follows the value: spaces, then optionally
     a comma and more spaces.  */
  cursor += strspn (cursor, " ");
  if (cursor[0] == ',')
    cursor += 1 + strspn (cursor + 1, " ");

  return cursor;
}
/* Overwrite, in place, every character of INPUT that c_isspace classifies
   as white space with a plain space character.  */
static void
force_spaces (char *input)
{
  for (char *cursor = input; *cursor != '\0'; cursor++)
    if (c_isspace (*cursor))
      *cursor = ' ';
}
/**
 * Drop the sample values in XcY notation (eg "1c6", "1.1c3") from the
 * "@integer " and "@decimal " sample lists of a rules string
 * @param input the rules string; it is only read
 * @return the rebuilt string, as returned by sb_dupfree_c
 */
static char *
remove_XcY (const char *input)
{
  const char *cursor = input;
  struct string_buffer buffer;

  sb_init (&buffer);
  for (;;)
    {
      /* State of a comma that was read but not written yet: -1 if no comma
         has been read in this sample list, otherwise the number of spaces
         after the comma that still have to be written.  */
      int pending_spaces = -1;
      const char *integer_tag = strstr (cursor, "@integer ");
      const char *decimal_tag = strstr (cursor, "@decimal ");
      const char *nearest_tag;
      const char *list_start;

      if (integer_tag == NULL && decimal_tag == NULL)
        {
          /* No further sample list: copy the remainder unchanged.  */
          sb_append_c (&buffer, cursor);
          break;
        }

      /* Continue with whichever of the two tags comes first.  */
      if (decimal_tag == NULL
          || (integer_tag != NULL && integer_tag < decimal_tag))
        nearest_tag = integer_tag;
      else
        nearest_tag = decimal_tag;
      /* Both tags have the same length.  */
      list_start = nearest_tag + (sizeof ("@integer ") - 1);

      /* Copy everything up to and including the tag, and then the spaces
         that precede the first sample.  */
      for (; cursor < list_start; cursor++)
        sb_append1 (&buffer, *cursor);
      for (; cursor[0] == ' '; cursor++)
        sb_append1 (&buffer, *cursor);

      /* Process the comma separated samples, one per iteration.  */
      while (cursor[0] >= '0' && cursor[0] <= '9')
        {
          const char *after_XcY = get_XcY_end (cursor);

          if (after_XcY != NULL)
            {
              /* Skip the XcY sample together with its separator.  */
              cursor = after_XcY;
              continue;
            }

          /* An ordinary sample is kept: first write the separator that was
             held back, then the sample itself.  */
          if (pending_spaces >= 0)
            {
              sb_append1 (&buffer, ',');
              for (; pending_spaces > 0; pending_spaces--)
                sb_append1 (&buffer, ' ');
            }
          for (;
               (cursor[0] >= '0' && cursor[0] <= '9')
               || cursor[0] == '.' || cursor[0] == '~';
               cursor++)
            sb_append1 (&buffer, cursor[0]);

          if (cursor[0] != ',')
            break;

          /* Hold back the comma and the spaces after it until it is known
             whether a kept sample follows.  */
          cursor++;
          for (pending_spaces = 0; cursor[0] == ' '; cursor++)
            pending_spaces++;
        }

      /* The held back separator is also written when the list goes on with
         an ellipsis (U+2026 in UTF-8, or three dots).  */
      if (pending_spaces > 0
          && (strncmp (cursor, "\xE2\x80\xA6", 3) == 0
              || strncmp (cursor, "...", 3) == 0))
        {
          sb_append1 (&buffer, ',');
          for (; pending_spaces > 0; pending_spaces--)
            sb_append1 (&buffer, ' ');
        }
    }

  return sb_dupfree_c (&buffer);
}
/**
 * Remove, in place, the sample lists that consist only of an ellipsis
 * (" @integer …", " @integer ...", " @decimal …", " @decimal ...")
 * @param input the rules string to be edited; it can only get shorter
 *
 * The spaces that follow a removed list are removed as well, unless they
 * introduce another '@' sample list: that list still needs its leading
 * space, both to stay separated from the preceding token and to be found
 * here when it is empty too.
 */
static void
remove_empty_examples (char *input)
{
  static const char *const prefixes[] =
    {
      " @integer \xE2\x80\xA6", " @integer ...",
      " @decimal \xE2\x80\xA6", " @decimal ..."
    };
  const size_t num_prefixes = sizeof (prefixes) / sizeof (prefixes[0]);

  for (size_t i = 0; i < num_prefixes; i++)
    {
      const char *prefix = prefixes[i];
      size_t prefix_length = strlen (prefix);
      char *p = input;

      while ((p = strstr (p, prefix)) != NULL)
        {
          const char *rest = p + prefix_length;
          size_t num_spaces = strspn (rest, " ");

          /* Drop the spaces after the removed list, except when they lead
             to the next sample list.  */
          if (rest[num_spaces] != '@')
            rest += num_spaces;

          /* Close the gap; the regions overlap, hence memmove.  */
          memmove (p, rest, strlen (rest) + 1);
        }
    }
}
/* Display usage information and exit. */
static void
usage (int status)
@ -306,6 +469,16 @@ There is NO WARRANTY, to the extent permitted by law.\n\
printf ("%s\n", extracted_rules);
else
{
force_spaces (extracted_rules);
{
char *tmp = remove_XcY (extracted_rules);
if (tmp != NULL)
{
free (extracted_rules);
extracted_rules = tmp;
remove_empty_examples (extracted_rules);
}
}
struct cldr_plural_rule_list_ty *result =
cldr_plural_parse (extracted_rules);
if (result == NULL)

View File

@ -3,7 +3,8 @@
: ${DIFF=diff}
# Test conversion from CLDR to gettext, for Arabic and Russian
# Test if we have XML support
LC_ALL=C "$top_builddir/src/cldr-plurals" ru /dev/null 2>&1 | grep 'extraction is not supported' > /dev/null 2>&1
test $? = 0 && {
@ -11,6 +12,9 @@ test $? = 0 && {
Exit 77
}
# Test conversion from CLDR to gettext, for Arabic
cat > ar.ok <<\EOF
nplurals=6; plural=(n==0 ? 0 : n==1 ? 1 : n==2 ? 2 : n%100>=3 && n%100<=10 ? 3 : n%100>=11 && n%100<=99 ? 4 : 5);
EOF
@ -22,6 +26,9 @@ LC_ALL=C tr -d '\r' < ar.tmp > ar.out || Exit 1
${DIFF} ar.ok ar.out || Exit 1
# Test conversion from CLDR to gettext, for Russian
cat > ru.ok <<\EOF
nplurals=3; plural=(n%10==1 && n%100!=11 ? 0 : n%10>=2 && n%10<=4 && (n%100<12 || n%100>14) ? 1 : 2);
EOF
@ -33,31 +40,33 @@ LC_ALL=C tr -d '\r' < ru.tmp > ru.out || Exit 1
${DIFF} ru.ok ru.out || Exit 1
# Test extraction from CLDR
cat > foo.in <<\EOF
<supplementalData>
<plurals type="cardinal">
<pluralRules locales="foo">
<pluralRule count="one">i = 1 and v = 0 @integer 1</pluralRule>
<pluralRule count="other"> @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, … @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, …</pluralRule>
<pluralRule count="one">i = 0,1 @integer 0, 1 @decimal 0.0~1.5</pluralRule>
<pluralRule count="many">e = 0 and i != 0 and i % 1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, 1c6, 2c6, 3c6, 4c6, 5c6, 6c6, … @decimal 1.0000001c6, 1.1c6, 2.0000001c6, 2.1c6, 3.0000001c6, 3.1c6, …</pluralRule>
<pluralRule count="other"> @integer 2~17, 100, 1000, 10000, 100000, 1c3, 2c3, 3c3, 4c3, 5c3, 6c3, … @decimal 2.0~3.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 1.0001c3, 1.1c3, 2.0001c3, 2.1c3, 3.0001c3, 3.1c3, …</pluralRule>
</pluralRules>
</plurals>
</supplementalData>
EOF
"$top_builddir/src/cldr-plurals" foo foo.in > foo.tmp
cat > foo.ok <<\EOF
nplurals=2; plural=(n != 1);
EOF
LC_ALL=C tr -d '\r' < foo.tmp > foo.out || Exit 1
${DIFF} foo.ok foo.out || Exit 1
"$top_builddir/src/cldr-plurals" -c foo foo.in > foo.cldr.tmp
cat > foo.cldr.ok <<\EOF
one: i = 1 and v = 0 @integer 1; other: @integer 0, 2~16, 100, 1000, 10000, 100000, 1000000, … @decimal 0.0~1.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, …
one: i = 0,1 @integer 0, 1 @decimal 0.0~1.5; many: e = 0 and i != 0 and i % 1000000 = 0 and v = 0 or e != 0..5 @integer 1000000, 1c6, 2c6, 3c6, 4c6, 5c6, 6c6, … @decimal 1.0000001c6, 1.1c6, 2.0000001c6, 2.1c6, 3.0000001c6, 3.1c6, …; other: @integer 2~17, 100, 1000, 10000, 100000, 1c3, 2c3, 3c3, 4c3, 5c3, 6c3, … @decimal 2.0~3.5, 10.0, 100.0, 1000.0, 10000.0, 100000.0, 1000000.0, 1.0001c3, 1.1c3, 2.0001c3, 2.1c3, 3.0001c3, 3.1c3, …
EOF
LC_ALL=C tr -d '\r' < foo.cldr.tmp > foo.cldr.out || Exit 1
${DIFF} foo.cldr.ok foo.cldr.out || Exit 1
"$top_builddir/src/cldr-plurals" foo foo.in > foo.tmp
cat > foo.ok <<\EOF
nplurals=3; plural=(n==0 || n==1 ? 0 : n!=0 && n%1000000==0 ? 1 : 2);
EOF
LC_ALL=C tr -d '\r' < foo.tmp > foo.out || Exit 1
${DIFF} foo.ok foo.out || Exit 1
Exit 0