mirror of
https://https.git.savannah.gnu.org/git/gettext.git
synced 2026-01-26 15:39:11 +00:00
xgettext: In language Tcl, support \x, \u, \U escapes as specified in Tcl 8.6.
* gettext-tools/src/x-tcl.c: Update comments. (phase1_pushback): Increase size to 5. (do_getc_escaped): For \x, parse only up to 2 hexadecimal characters. Handle '\U'. (do_getc_escaped_low_surrogate): New function. (accumulate_word): After reading a high surrogate, see if it is followed by a low surrogate. * gettext-tools/tests/xgettext-tcl-4: Change expected outcome for \x. Add test cases for \u with surrogates and for \U. * gettext-tools/tests/xgettext-tcl-5: Add more test cases. * NEWS: Mention the change.
This commit is contained in:
parent
fa2fbdbcbf
commit
0458d3efda
2
NEWS
2
NEWS
@ -15,6 +15,8 @@ Version 0.21.2 - February 2023
|
||||
- C, C++: xgettext now supports gettext-like functions that take wide strings
|
||||
(of type 'const wchar_t *', 'const char16_t *', or 'const char32_t *') as
|
||||
arguments.
|
||||
- Tcl: xgettext now supports the \x, \u, and \U escapes as defined in
|
||||
Tcl 8.6.
|
||||
|
||||
* xgettext:
|
||||
- The xgettext option '--sorted-output' is now deprecated.
|
||||
|
||||
@ -55,7 +55,8 @@
|
||||
#define SIZEOF(a) (sizeof(a) / sizeof(a[0]))
|
||||
|
||||
|
||||
/* The Tcl syntax is defined in the Tcl.n manual page.
|
||||
/* The Tcl syntax is defined in the Tcl.n manual page, see
|
||||
https://www.tcl-lang.org/man/tcl8.6/TclCmd/Tcl.htm .
|
||||
Summary of Tcl syntax:
|
||||
Like sh syntax, except that `...` is replaced with [...]. In detail:
|
||||
- In a preprocessing pass, backslash-newline-anywhitespace is replaced
|
||||
@ -69,7 +70,7 @@
|
||||
- The list of resulting words is split into commands by semicolon and
|
||||
newline.
|
||||
- '#' at the beginning of a command introduces a comment until end of line.
|
||||
The parser is implemented in tcl8.3.3/generic/tclParse.c. */
|
||||
The parser is implemented in tcl8.6/generic/tclParse.c. */
|
||||
|
||||
|
||||
/* ====================== Keyword set customization. ====================== */
|
||||
@ -174,7 +175,7 @@ do_ungetc (int c)
|
||||
/* An int that becomes a space when casted to 'unsigned char'. */
|
||||
#define BS_NL (UCHAR_MAX + 1 + ' ')
|
||||
|
||||
static int phase1_pushback[1];
|
||||
static int phase1_pushback[5];
|
||||
static int phase1_pushback_length;
|
||||
|
||||
static int
|
||||
@ -472,7 +473,7 @@ static int brace_nesting_depth;
|
||||
|
||||
|
||||
/* Read an escape sequence. The value is an ISO-8859-1 character (in the
|
||||
range 0x00..0xff) or a Unicode character (in the range 0x0000..0xffff). */
|
||||
range 0x00..0xff) or a Unicode character (in the range 0x0000..0x10FFFF). */
|
||||
static int
|
||||
do_getc_escaped ()
|
||||
{
|
||||
@ -499,14 +500,17 @@ do_getc_escaped ()
|
||||
return '\v';
|
||||
case 'x':
|
||||
{
|
||||
int n = 0;
|
||||
unsigned int n = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0;; i++)
|
||||
for (i = 0; i < 2; i++)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF || !c_isxdigit ((unsigned char) c))
|
||||
break;
|
||||
{
|
||||
phase1_ungetc (c);
|
||||
break;
|
||||
}
|
||||
|
||||
if (c >= '0' && c <= '9')
|
||||
n = (n << 4) + (c - '0');
|
||||
@ -515,12 +519,11 @@ do_getc_escaped ()
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
n = (n << 4) + (c - 'a' + 10);
|
||||
}
|
||||
phase1_ungetc (c);
|
||||
return (i > 0 ? (unsigned char) n : 'x');
|
||||
}
|
||||
case 'u':
|
||||
{
|
||||
int n = 0;
|
||||
unsigned int n = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
@ -532,6 +535,29 @@ do_getc_escaped ()
|
||||
break;
|
||||
}
|
||||
|
||||
if (c >= '0' && c <= '9')
|
||||
n = (n << 4) + (c - '0');
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
n = (n << 4) + (c - 'A' + 10);
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
n = (n << 4) + (c - 'a' + 10);
|
||||
}
|
||||
return (i > 0 ? n : 'u');
|
||||
}
|
||||
case 'U':
|
||||
{
|
||||
unsigned int n = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < 8; i++)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF || !c_isxdigit ((unsigned char) c) || n >= 0x11000)
|
||||
{
|
||||
phase1_ungetc (c);
|
||||
break;
|
||||
}
|
||||
|
||||
if (c >= '0' && c <= '9')
|
||||
n = (n << 4) + (c - '0');
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
@ -572,6 +598,58 @@ do_getc_escaped ()
|
||||
}
|
||||
}
|
||||
|
||||
/* Read an escape sequence for a low surrogate Unicode character.
|
||||
The value is in the range 0xDC00..0xDFFF.
|
||||
Return -1 when none was seen. */
|
||||
static int
|
||||
do_getc_escaped_low_surrogate ()
|
||||
{
|
||||
int c;
|
||||
|
||||
c = phase1_getc ();
|
||||
switch (c)
|
||||
{
|
||||
case 'u':
|
||||
{
|
||||
unsigned char buf[4];
|
||||
unsigned int n = 0;
|
||||
unsigned int i;
|
||||
|
||||
for (i = 0; i < 4; i++)
|
||||
{
|
||||
c = phase1_getc ();
|
||||
if (c == EOF || !c_isxdigit ((unsigned char) c))
|
||||
{
|
||||
phase1_ungetc (c);
|
||||
while (i > 0)
|
||||
phase1_ungetc (buf[--i]);
|
||||
phase1_ungetc ('u');
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (c >= '0' && c <= '9')
|
||||
n = (n << 4) + (c - '0');
|
||||
else if (c >= 'A' && c <= 'F')
|
||||
n = (n << 4) + (c - 'A' + 10);
|
||||
else if (c >= 'a' && c <= 'f')
|
||||
n = (n << 4) + (c - 'a' + 10);
|
||||
}
|
||||
if (n >= 0xdc00 && n <= 0xdfff)
|
||||
return n;
|
||||
else
|
||||
{
|
||||
while (i > 0)
|
||||
phase1_ungetc (buf[--i]);
|
||||
phase1_ungetc ('u');
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
default:
|
||||
phase1_ungetc (c);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
enum terminator
|
||||
{
|
||||
@ -699,31 +777,48 @@ accumulate_word (struct word *wp, enum terminator looking_for,
|
||||
}
|
||||
else if (c == '\\')
|
||||
{
|
||||
unsigned int uc;
|
||||
unsigned char utf8buf[6];
|
||||
int count;
|
||||
int i;
|
||||
|
||||
uc = do_getc_escaped ();
|
||||
assert (uc < 0x10000);
|
||||
count = u8_uctomb (utf8buf, uc, 6);
|
||||
if (count < 0)
|
||||
unsigned int uc = do_getc_escaped ();
|
||||
assert (uc < 0x110000);
|
||||
if (uc >= 0xd800 && uc <= 0xdfff)
|
||||
{
|
||||
if (uc < 0xdc00)
|
||||
{
|
||||
/* Saw a high surrogate Unicode character.
|
||||
Is it followed by a low surrogate Unicode character? */
|
||||
c = phase2_getc ();
|
||||
if (c == '\\')
|
||||
{
|
||||
int uc2 = do_getc_escaped_low_surrogate ();
|
||||
if (uc2 >= 0)
|
||||
{
|
||||
/* Saw a low surrogate Unicode character. */
|
||||
assert (uc2 >= 0xdc00 && uc2 <= 0xdfff);
|
||||
uc = 0x10000 + ((uc - 0xd800) << 10) + (uc2 - 0xdc00);
|
||||
goto saw_unicode_escape;
|
||||
}
|
||||
}
|
||||
phase2_ungetc (c);
|
||||
}
|
||||
error_with_progname = false;
|
||||
error (0, 0, _("%s:%d: warning: invalid Unicode character"),
|
||||
logical_file_name, line_number);
|
||||
error_with_progname = true;
|
||||
goto done_escape;
|
||||
}
|
||||
else
|
||||
{
|
||||
assert (count > 0);
|
||||
if (wp->type == t_string)
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
grow_token (wp->token);
|
||||
wp->token->chars[wp->token->charcount++] = utf8buf[i];
|
||||
}
|
||||
}
|
||||
saw_unicode_escape:
|
||||
{
|
||||
unsigned char utf8buf[6];
|
||||
int count = u8_uctomb (utf8buf, uc, 6);
|
||||
int i;
|
||||
assert (count > 0);
|
||||
if (wp->type == t_string)
|
||||
for (i = 0; i < count; i++)
|
||||
{
|
||||
grow_token (wp->token);
|
||||
wp->token->chars[wp->token->charcount++] = utf8buf[i];
|
||||
}
|
||||
}
|
||||
done_escape: ;
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@ -7,11 +7,17 @@ cat <<\EOF > xg-t-4.tcl
|
||||
puts [_ "Hello\u200e\u201cWorld\u201d"]
|
||||
puts [_ "x\u20y\x20z"]
|
||||
puts [_ "\xFF20"]
|
||||
puts [_ "\UFF20"]
|
||||
puts [_ "\uD83D\udc1c"]
|
||||
# Does not work yet in Tcl 8.6:
|
||||
# puts [_ "\U0001F41C"]
|
||||
EOF
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
${XGETTEXT} --add-comments --no-location -k_ -o xg-t-4.tmp xg-t-4.tcl 2>xg-t-4.err
|
||||
test $? = 0 || { cat xg-t-4.err; Exit 1; }
|
||||
result=$?
|
||||
cat xg-t-4.err
|
||||
test $result = 0 || Exit 1
|
||||
func_filter_POT_Creation_Date xg-t-4.tmp xg-t-4.pot
|
||||
|
||||
cat <<\EOF > xg-t-4.ok
|
||||
@ -39,7 +45,13 @@ msgstr ""
|
||||
msgid "x y z"
|
||||
msgstr ""
|
||||
|
||||
msgid " "
|
||||
msgid "ÿ20"
|
||||
msgstr ""
|
||||
|
||||
msgid "@"
|
||||
msgstr ""
|
||||
|
||||
msgid "🐜"
|
||||
msgstr ""
|
||||
EOF
|
||||
|
||||
|
||||
@ -11,6 +11,26 @@ cat <<\EOF > xg-t-5b.tcl
|
||||
puts [_ "\udc1c"]
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-t-5c.tcl
|
||||
puts [_ "\uD83D\n"]
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-t-5d.tcl
|
||||
puts [_ "\uD83D\u"]
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-t-5e.tcl
|
||||
puts [_ "\uD83D\u9843"]
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-t-5f.tcl
|
||||
puts [_ "\uD83D\ud913"]
|
||||
EOF
|
||||
|
||||
cat <<\EOF > xg-t-5g.tcl
|
||||
puts [_ "\udc1c\ud83d"]
|
||||
EOF
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5a.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
@ -23,4 +43,34 @@ result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5c.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5d.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5e.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5f.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
: ${XGETTEXT=xgettext}
|
||||
LANGUAGE= LC_ALL=C ${XGETTEXT} --no-location -k_ -d xg-t-5.tmp xg-t-5g.tcl 2>xg-t-5.err
|
||||
result=$?
|
||||
cat xg-t-5.err
|
||||
test $result = 0 || Exit 1
|
||||
|
||||
exit 0
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user