From c8222fd491ca56fd73ac563d5ca50efb4c05a951 Mon Sep 17 00:00:00 2001 From: tildearrow Date: Sat, 19 Aug 2023 16:49:58 -0500 Subject: [PATCH] handle non-BMP characters issue #1376 --- extern/igfd/dirent/dirent.h | 48 +++++++++++++++++++++++++++---------- src/utfutils.cpp | 39 +++++++++++++++++++++++------- 2 files changed, 66 insertions(+), 21 deletions(-) diff --git a/extern/igfd/dirent/dirent.h b/extern/igfd/dirent/dirent.h index 7d5c8abe..dccfe5e8 100644 --- a/extern/igfd/dirent/dirent.h +++ b/extern/igfd/dirent/dirent.h @@ -1114,11 +1114,20 @@ static int u8to16s(wchar_t* dest, const char* src, size_t limit) { int ch, p; char chs; p=0; - while (src[p]!=0 && ret=0x10000) { + ch-=0x10000; + if (ret+1>=limit-1) break; + dest[ret++]=(unsigned short)(0xd800|((ch>>10)&0x3ff)); + dest[ret++]=(unsigned short)(0xdc00|(ch&0x3ff)); + } else if (ch<0xd800 || ch>0xdfff) { + dest[ret++]=(unsigned short)ch; + } p+=chs; } + dest[ret]=0; return ret; } @@ -1176,19 +1185,34 @@ dirent_mbstowcs_s( static int u16to8s(char* dest, const wchar_t* src, size_t limit) { size_t ret=0; + unsigned int next=0; for (; (*src)!=0; src++) { - if ((*src)<0x80) { - if (ret+1>=limit-1) break; - dest[ret++]=(*src); - } else if ((*src)<0x800) { - if (ret+2>=limit-1) break; - dest[ret++]=(0xc0+(((*src)>>6)&31)); - dest[ret++]=(0x80+((*src)&63)); + if ((*src)>=0xd800 && (*src)<0xdc00) { + next=0x10000+(((*src)&0x3ff)<<10); + continue; + } else if ((*src)>=0xdc00 && (*src)<0xe000) { + next|=(*src)&0x3ff; } else { + next=(*src); + } + if (next<0x80) { + if (ret+1>=limit-1) break; + dest[ret++]=next; + } else if (next<0x800) { + if (ret+2>=limit-1) break; + dest[ret++]=(0xc0+((next>>6)&31)); + dest[ret++]=(0x80+(next&63)); + } else if (next<0x10000) { if (ret+3>=limit-1) break; - dest[ret++]=(0xe0+(((*src)>>12)&15)); - dest[ret++]=(0x80+(((*src)>>6)&63)); - dest[ret++]=(0x80+((*src)&63)); + dest[ret++]=(0xe0+((next>>12)&15)); + dest[ret++]=(0x80+((next>>6)&63)); + dest[ret++]=(0x80+(next&63)); + } else { + if (ret+4>=limit-1) break; + dest[ret++]=(0xf0+((next>>18)&7)); + dest[ret++]=(0x80+((next>>12)&63)); + dest[ret++]=(0x80+((next>>6)&63)); + dest[ret++]=(0x80+(next&63)); } } dest[ret]=0; diff --git a/src/utfutils.cpp b/src/utfutils.cpp index 889b9a9b..408a5e71 100644 --- a/src/utfutils.cpp +++ b/src/utfutils.cpp @@ -88,7 +88,14 @@ WString utf8To16(const char* s) { p=0; while (s[p]!=0) { ch=decodeUTF8((const unsigned char*)&s[p],chs); - ret+=(unsigned short)ch; + // surrogates + if (ch>=0x10000) { + ch-=0x10000; + ret+=(unsigned short)(0xd800|((ch>>10)&0x3ff)); + ret+=(unsigned short)(0xdc00|(ch&0x3ff)); + } else if (ch<0xd800 || ch>0xdfff) { + ret+=(unsigned short)ch; + } p+=chs; } return ret; @@ -96,16 +103,30 @@ WString utf8To16(const char* s) { String utf16To8(const wchar_t* s) { String ret; + unsigned int next=0; for (size_t i=0; i>6)&31)); - ret+=(0x80+((s[i])&63)); + if (s[i]>=0xd800 && s[i]<0xdc00) { + next=0x10000+((s[i]&0x3ff)<<10); + continue; + } else if (s[i]>=0xdc00 && s[i]<0xe000) { + next|=s[i]&0x3ff; } else { - ret+=(0xe0+((s[i]>>12)&15)); - ret+=(0x80+((s[i]>>6)&63)); - ret+=(0x80+((s[i])&63)); + next=s[i]; + } + if (next<0x80) { + ret+=next; + } else if (next<0x800) { + ret+=(0xc0+((next>>6)&31)); + ret+=(0x80+((next)&63)); + } else if (next<0x10000) { + ret+=(0xe0+((next>>12)&15)); + ret+=(0x80+((next>>6)&63)); + ret+=(0x80+((next)&63)); + } else { + ret+=(0xf0+((next>>18)&7)); + ret+=(0x80+((next>>12)&63)); + ret+=(0x80+((next>>6)&63)); + ret+=(0x80+((next)&63)); } } return ret;