Tweak utf8towc() to return -1 earlier sometimes (instead of -2), and add test

program to compare against libc output.
This commit is contained in:
Rob Landley 2017-09-05 02:36:24 -05:00
parent 6e76693639
commit b3e70932b6
2 changed files with 45 additions and 6 deletions

View File

@ -345,20 +345,17 @@ int utf8towc(wchar_t *wc, char *str, unsigned len)
if (len && *str<128) return !!(*wc = *str);
result = first = *(s = str++);
if (result<0xc2 || result>0xf4) return -1;
for (mask = 6; (first&0xc0)==0xc0; mask += 5, first <<= 1) {
if (mask>21) return -1;
if (!--len) return -2;
c = *(str++);
if ((c&0xc0) != 0x80) return -1;
if (((c = *(str++))&0xc0) != 0x80) return -1;
result = (result<<6)|(c&0x3f);
}
result &= (1<<mask)-1;
c = str-s;
if (mask==6) return -1;
// Avoid overlong encodings
if (mask==6 || mask>21 || result<(unsigned []){0x80,0x800,0x10000}[c-2])
return -1;
if (result<(unsigned []){0x80,0x800,0x10000}[c-2]) return -1;
// Limit unicode so it can't encode anything UTF-16 can't.
if (result>0x10ffff || (result>=0xd800 && result<=0xdfff)) return -1;

View File

@ -0,0 +1,42 @@
/* Test toybox utf8towc() against libc mbrtowc()
*
* Copyright 2017 Rob Landley <rob@landley.net>
USE_TEST_UTF8TOWC(NEWTOY(test_utf8towc, 0, TOYFLAG_USR|TOYFLAG_BIN))
config TEST_UTF8TOWC
bool "test_utf8towc"
default n
help
usage: test_utf8towc
Print differences between toybox's utf8 conversion routines vs libc du jour.
*/
#include "toys.h"
// Compare utf8towc() against libc's mbrtowc() over every nonzero unsigned
// value: the bytes produced by htonl(u), with leading zero bytes skipped, are
// handed to both decoders, and a line is printed whenever the returned length
// or the decoded character differs.
//
// Output columns: input (hex), mbrtowc length, mbrtowc char (hex),
// utf8towc length, utf8towc char (hex).
void test_utf8towc_main(void)
{
  mbstate_t mb;
  int len1, len2;
  unsigned u, h;
  wchar_t wc1, wc2;

  // mbrtowc() decodes according to the current locale, so ask for a UTF-8 one.
  // NOTE(review): the return value is not checked; if this locale is not
  // available the comparison runs in whatever locale was already in effect.
  setlocale(LC_ALL, "en_US.UTF-8");

  memset(&mb, 0, sizeof(mb));

  // u wraps around to 0 after its maximum value, which terminates the loop.
  for (u = 1; u; u++) {
    char *str = (void *)&h;

    wc1 = wc2 = 0;
    len2 = 4;

    // Store u most-significant-byte first, then step over leading zero bytes
    // so the sequence starts at the first nonzero byte (u is never 0 here).
    h = htonl(u);
    while (!*str) {
      str++;
      len2--;
    }

    // mbrtowc() returns size_t; the cast makes the narrowing explicit so its
    // (size_t)-1 and (size_t)-2 error returns are seen as negative ints below.
    len1 = (int)mbrtowc(&wc1, str, len2, &mb);
    // Start from a clean conversion state after any failed/incomplete decode.
    if (len1 < 0) memset(&mb, 0, sizeof(mb));

    len2 = utf8towc(&wc2, str, len2);

    // %x requires unsigned int, so convert the wchar_t values explicitly.
    if (len1 != len2 || wc1 != wc2)
      printf("%x %d %x %d %x\n", u, len1, (unsigned)wc1, len2, (unsigned)wc2);
  }
}