handle non-BMP characters

issue #1376
This commit is contained in:
tildearrow 2023-08-19 16:49:58 -05:00
parent 14d3baae56
commit c8222fd491
2 changed files with 66 additions and 21 deletions

View File

@ -1114,11 +1114,20 @@ static int u8to16s(wchar_t* dest, const char* src, size_t limit) {
int ch, p;
char chs;
p=0;
while (src[p]!=0 && ret<limit) {
while (src[p]!=0 && ret<limit-1) {
ch=decodeUTF8s((const unsigned char*)&src[p],&chs);
dest[ret++]=(unsigned short)ch;
// surrogates
if (ch>=0x10000) {
ch-=0x10000;
if (ret+1>=limit-1) break;
dest[ret++]=(unsigned short)(0xd800|((ch>>10)&0x3ff));
dest[ret++]=(unsigned short)(0xdc00|(ch&0x3ff));
} else if (ch<0xd800 || ch>0xdfff) {
dest[ret++]=(unsigned short)ch;
}
p+=chs;
}
dest[ret]=0;
return ret;
}
@ -1176,19 +1185,34 @@ dirent_mbstowcs_s(
static int u16to8s(char* dest, const wchar_t* src, size_t limit) {
size_t ret=0;
unsigned int next=0;
for (; (*src)!=0; src++) {
if ((*src)<0x80) {
if (ret+1>=limit-1) break;
dest[ret++]=(*src);
} else if ((*src)<0x800) {
if (ret+2>=limit-1) break;
dest[ret++]=(0xc0+(((*src)>>6)&31));
dest[ret++]=(0x80+((*src)&63));
if ((*src)>=0xd800 && (*src)<0xdc00) {
next=0x10000+(((*src)&0x3ff)<<10);
continue;
} else if ((*src)>=0xdc00 && (*src)<0xe000) {
next|=(*src)&0x3ff;
} else {
next=(*src);
}
if (next<0x80) {
if (ret+1>=limit-1) break;
dest[ret++]=next;
} else if (next<0x800) {
if (ret+2>=limit-1) break;
dest[ret++]=(0xc0+((next>>6)&31));
dest[ret++]=(0x80+(next&63));
} else if (next<0x10000) {
if (ret+3>=limit-1) break;
dest[ret++]=(0xe0+(((*src)>>12)&15));
dest[ret++]=(0x80+(((*src)>>6)&63));
dest[ret++]=(0x80+((*src)&63));
dest[ret++]=(0xe0+((next>>12)&15));
dest[ret++]=(0x80+((next>>6)&63));
dest[ret++]=(0x80+(next&63));
} else {
if (ret+4>=limit-1) break;
dest[ret++]=(0xf0+((next>>18)&7));
dest[ret++]=(0x80+((next>>12)&63));
dest[ret++]=(0x80+((next>>6)&63));
dest[ret++]=(0x80+(next&63));
}
}
dest[ret]=0;

View File

@ -88,7 +88,14 @@ WString utf8To16(const char* s) {
p=0;
while (s[p]!=0) {
ch=decodeUTF8((const unsigned char*)&s[p],chs);
ret+=(unsigned short)ch;
// surrogates
if (ch>=0x10000) {
ch-=0x10000;
ret+=(unsigned short)(0xd800|((ch>>10)&0x3ff));
ret+=(unsigned short)(0xdc00|(ch&0x3ff));
} else if (ch<0xd800 || ch>0xdfff) {
ret+=(unsigned short)ch;
}
p+=chs;
}
return ret;
@ -96,16 +103,30 @@ WString utf8To16(const char* s) {
String utf16To8(const wchar_t* s) {
String ret;
unsigned int next=0;
for (size_t i=0; i<wcslen(s); i++) {
if (s[i]<0x80) {
ret+=s[i];
} else if (s[i]<0x800) {
ret+=(0xc0+((s[i]>>6)&31));
ret+=(0x80+((s[i])&63));
if (s[i]>=0xd800 && s[i]<0xdc00) {
next=0x10000+((s[i]&0x3ff)<<10);
continue;
} else if (s[i]>=0xdc00 && s[i]<0xe000) {
next|=s[i]&0x3ff;
} else {
ret+=(0xe0+((s[i]>>12)&15));
ret+=(0x80+((s[i]>>6)&63));
ret+=(0x80+((s[i])&63));
next=s[i];
}
if (next<0x80) {
ret+=next;
} else if (next<0x800) {
ret+=(0xc0+((next>>6)&31));
ret+=(0x80+((next)&63));
} else if (next<0x10000) {
ret+=(0xe0+((next>>12)&15));
ret+=(0x80+((next>>6)&63));
ret+=(0x80+((next)&63));
} else {
ret+=(0xf0+((next>>18)&7));
ret+=(0x80+((next>>12)&63));
ret+=(0x80+((next>>6)&63));
ret+=(0x80+((next)&63));
}
}
return ret;