// Scintilla source code edit control /** @file UniConversion.cxx ** Functions to handle UTF-8 and UTF-16 strings. **/ // Copyright 1998-2001 by Neil Hodgson // The License.txt file describes the conditions under which this software may be distributed. #include #include "UniConversion.h" enum { SURROGATE_LEAD_FIRST = 0xD800 }; enum { SURROGATE_TRAIL_FIRST = 0xDC00 }; enum { SURROGATE_TRAIL_LAST = 0xDFFF }; unsigned int UTF8Length(const wchar_t *uptr, unsigned int tlen) { unsigned int len = 0; for (unsigned int i = 0; i < tlen && uptr[i];) { unsigned int uch = uptr[i]; if (uch < 0x80) { len++; } else if (uch < 0x800) { len += 2; } else if ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_TRAIL_LAST)) { len += 4; i++; } else { len += 3; } i++; } return len; } void UTF8FromUTF16(const wchar_t *uptr, unsigned int tlen, char *putf, unsigned int len) { int k = 0; for (unsigned int i = 0; i < tlen && uptr[i];) { unsigned int uch = uptr[i]; if (uch < 0x80) { putf[k++] = static_cast(uch); } else if (uch < 0x800) { putf[k++] = static_cast(0xC0 | (uch >> 6)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } else if ((uch >= SURROGATE_LEAD_FIRST) && (uch <= SURROGATE_TRAIL_LAST)) { // Half a surrogate pair i++; unsigned int xch = 0x10000 + ((uch & 0x3ff) << 10) + (uptr[i] & 0x3ff); putf[k++] = static_cast(0xF0 | (xch >> 18)); putf[k++] = static_cast(0x80 | ((xch >> 12) & 0x3f)); putf[k++] = static_cast(0x80 | ((xch >> 6) & 0x3f)); putf[k++] = static_cast(0x80 | (xch & 0x3f)); } else { putf[k++] = static_cast(0xE0 | (uch >> 12)); putf[k++] = static_cast(0x80 | ((uch >> 6) & 0x3f)); putf[k++] = static_cast(0x80 | (uch & 0x3f)); } i++; } putf[len] = '\0'; } unsigned int UTF8CharLength(unsigned char ch) { if (ch < 0x80) { return 1; } else if (ch < 0x80 + 0x40 + 0x20) { return 2; } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { return 3; } else { return 4; } } unsigned int UTF16Length(const char *s, unsigned int len) { unsigned int ulen = 0; unsigned int charLen; for (unsigned int i=0; i(s[i]); if (ch < 0x80) { charLen = 1; } else if (ch < 0x80 + 0x40 + 0x20) { charLen = 2; } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { charLen = 3; } else { charLen = 4; ulen++; } i += charLen; ulen++; } return ulen; } unsigned int UTF16FromUTF8(const char *s, unsigned int len, wchar_t *tbuf, unsigned int tlen) { unsigned int ui=0; const unsigned char *us = reinterpret_cast(s); unsigned int i=0; while ((i((ch & 0x1F) << 6); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + (ch & 0x7F)); } else if (ch < 0x80 + 0x40 + 0x20 + 0x10) { tbuf[ui] = static_cast((ch & 0xF) << 12); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + ((ch & 0x7F) << 6)); ch = us[i++]; tbuf[ui] = static_cast(tbuf[ui] + (ch & 0x7F)); } else { // Outside the BMP so need two surrogates int val = (ch & 0x7) << 18; ch = us[i++]; val += (ch & 0x3F) << 12; ch = us[i++]; val += (ch & 0x3F) << 6; ch = us[i++]; val += (ch & 0x3F); tbuf[ui] = static_cast(((val - 0x10000) >> 10) + SURROGATE_LEAD_FIRST); ui++; tbuf[ui] = static_cast((val & 0x3ff) + SURROGATE_TRAIL_FIRST); } ui++; } return ui; } int UTF8BytesOfLead[256]; static bool initialisedBytesOfLead = false; static int BytesFromLead(int leadByte) { if (leadByte < 0xC2) { // Single byte or invalid return 1; } else if (leadByte < 0xE0) { return 2; } else if (leadByte < 0xF0) { return 3; } else if (leadByte < 0xF5) { return 4; } else { // Characters longer than 4 bytes not possible in current UTF-8 return 1; } } void UTF8BytesOfLeadInitialise() { if (!initialisedBytesOfLead) { for (int i=0;i<256;i++) { UTF8BytesOfLead[i] = BytesFromLead(i); } initialisedBytesOfLead = true; } } // Return both the width of the first character in the string and a status // saying whether it is valid or invalid. // Most invalid sequences return a width of 1 so are treated as isolated bytes but // the non-characters *FFFE, *FFFF and FDD0 .. FDEF return 3 or 4 as they can be // reasonably treated as code points in some circumstances. They will, however, // not have associated glyphs. int UTF8Classify(const unsigned char *us, int len) { // For the rules: http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8 if (*us < 0x80) { // Single bytes easy return 1; } else if (*us > 0xf4) { // Characters longer than 4 bytes not possible in current UTF-8 return UTF8MaskInvalid | 1; } else if (*us >= 0xf0) { // 4 bytes if (len < 4) return UTF8MaskInvalid | 1; if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2]) && UTF8IsTrailByte(us[3])) { if (((us[1] & 0xf) == 0xf) && (us[2] == 0xbf) && ((us[3] == 0xbe) || (us[3] == 0xbf))) { // *FFFE or *FFFF non-character return UTF8MaskInvalid | 4; } if (*us == 0xf4) { // Check if encoding a value beyond the last Unicode character 10FFFF if (us[1] > 0x8f) { return UTF8MaskInvalid | 1; } else if (us[1] == 0x8f) { if (us[2] > 0xbf) { return UTF8MaskInvalid | 1; } else if (us[2] == 0xbf) { if (us[3] > 0xbf) { return UTF8MaskInvalid | 1; } } } } else if ((*us == 0xf0) && ((us[1] & 0xf0) == 0x80)) { // Overlong return UTF8MaskInvalid | 1; } return 4; } else { return UTF8MaskInvalid | 1; } } else if (*us >= 0xe0) { // 3 bytes if (len < 3) return UTF8MaskInvalid | 1; if (UTF8IsTrailByte(us[1]) && UTF8IsTrailByte(us[2])) { if ((*us == 0xe0) && ((us[1] & 0xe0) == 0x80)) { // Overlong return UTF8MaskInvalid | 1; } if ((*us == 0xed) && ((us[1] & 0xe0) == 0xa0)) { // Surrogate return UTF8MaskInvalid | 1; } if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbe)) { // U+FFFE non-character - 3 bytes long return UTF8MaskInvalid | 3; } if ((*us == 0xef) && (us[1] == 0xbf) && (us[2] == 0xbf)) { // U+FFFF non-character - 3 bytes long return UTF8MaskInvalid | 3; } if ((*us == 0xef) && (us[1] == 0xb7) && (((us[2] & 0xf0) == 0x90) || ((us[2] & 0xf0) == 0xa0))) { // U+FDD0 .. U+FDEF return UTF8MaskInvalid | 3; } return 3; } else { return UTF8MaskInvalid | 1; } } else if (*us >= 0xc2) { // 2 bytes if (len < 2) return UTF8MaskInvalid | 1; if (UTF8IsTrailByte(us[1])) { return 2; } else { return UTF8MaskInvalid | 1; } } else { // 0xc0 .. 0xc1 is overlong encoding // 0x80 .. 0xbf is trail byte return UTF8MaskInvalid | 1; } }