mirror of
https://codeberg.org/landley/toybox.git
synced 2026-01-26 14:13:25 +00:00
Tweak utf8towc() to return -1 earlier sometimes (instead of -2), and add test
program to compare against libc output.
This commit is contained in:
parent
6e76693639
commit
b3e70932b6
@ -345,20 +345,17 @@ int utf8towc(wchar_t *wc, char *str, unsigned len)
|
||||
if (len && *str<128) return !!(*wc = *str);
|
||||
|
||||
result = first = *(s = str++);
|
||||
if (result<0xc2 || result>0xf4) return -1;
|
||||
for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
|
||||
if (mask>21) return -1;
|
||||
if (!--len) return -2;
|
||||
c = *(str++);
|
||||
if ((c&0xc0) != 0x80) return -1;
|
||||
if (((c = *(str++))&0xc0) != 0x80) return -1;
|
||||
result = (result<<6)|(c&0x3f);
|
||||
}
|
||||
result &= (1<<mask)-1;
|
||||
c = str-s;
|
||||
if (mask==6) return -1;
|
||||
|
||||
// Avoid overlong encodings
|
||||
if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[c-2])
|
||||
return -1;
|
||||
if (result<(unsigned []){0x80,0x800,0x10000}[c-2]) return -1;
|
||||
|
||||
// Limit unicode so it can't encode anything UTF-16 can't.
|
||||
if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;
|
||||
|
||||
42
toys/example/test_utf8towc.c
Normal file
42
toys/example/test_utf8towc.c
Normal file
@ -0,0 +1,42 @@
|
||||
/* test_utf8towc() against libc mbrtowc()
|
||||
*
|
||||
* Copyright 2017 Rob Landley <rob@landley.net>
|
||||
|
||||
USE_TEST_UTF8TOWC(NEWTOY(test_utf8towc, 0, TOYFLAG_USR|TOYFLAG_BIN))
|
||||
|
||||
config TEST_UTF8TOWC
|
||||
bool "test_utf8towc"
|
||||
default n
|
||||
help
|
||||
usage: test_utf8towc
|
||||
|
||||
Print differences between toybox's utf8 conversion routines vs libc du jour.
|
||||
*/
|
||||
|
||||
#include "toys.h"
|
||||
|
||||
// Compare toybox's utf8towc() against libc's mbrtowc() for every nonzero
// 32 bit value: the value's bytes (most significant first, leading zero
// bytes skipped) are handed to both decoders, and one line is printed for
// each input where the returned length or the decoded character differ.
// Output columns: input (hex), libc length, libc wchar (hex),
// toybox length, toybox wchar (hex).
void test_utf8towc_main(void)
{
  mbstate_t mb;
  int len1, len2;
  unsigned u, h;
  wchar_t wc1, wc2;

  // mbrtowc() decodes according to the current locale, so if this locale is
  // unavailable the comparison below is not against a UTF-8 decoder. Warn on
  // stderr (stdout stays reserved for the difference report) and carry on.
  if (!setlocale(LC_ALL, "en_US.UTF-8"))
    fprintf(stderr, "test_utf8towc: setlocale(en_US.UTF-8) failed, "
      "results may not reflect UTF-8 decoding\n");

  memset(&mb, 0, sizeof(mb));
  for (u=1; u; u++) {
    char *str = (void *)&h;

    wc1 = wc2 = 0;
    len2 = 4;
    // Network byte order stores the most significant byte first in memory.
    h = htonl(u);
    // Drop leading zero bytes; u is never 0 here so this stops inside h.
    while (!*str) str++, len2--;

    len1 = mbrtowc(&wc1, str, len2, &mb);
    // mbrtowc()'s (size_t)-1 / (size_t)-2 error returns show up as negative
    // once stored in an int; reset the state so the next input starts clean.
    if (len1<0) memset(&mb, 0, sizeof(mb));
    len2 = utf8towc(&wc2, str, len2);
    // %x expects unsigned int and wchar_t's underlying type varies, so cast.
    if (len1 != len2 || wc1 != wc2)
      printf("%x %d %x %d %x\n", u, len1, (unsigned)wc1, len2, (unsigned)wc2);
  }
}
|
||||
Loading…
x
Reference in New Issue
Block a user