#include "uniString.h" #include #include #include using namespace std; using namespace uniString; /*****************************************************************/ //////////////// various helping funcs ///////////////////////// // helper function that throws exceptions static void throwBadUTF8Code(utf8::value_type v, int position) throw(badUnicodeData) { ostringstream o; o << "Bad UTF-8 code (" << hex << (int)v << ") at position " << position; throw uniString::badUnicodeData(o.str()); } static void throwBadUTF16Code(utf16::value_type v, int position) throw(badUnicodeData) { ostringstream o; o << "Bad UTF-16 code (" << hex << (int)v << ") at position " << position; throw uniString::badUnicodeData(o.str()); } bool utf8::isValid(bool allowIncompleteEndingSequence) const throw() { int position = 0; for (utf8::const_iterator i = begin(); i != end(); ++i, ++position) { utf8::value_type v8 = (*i); if (v8 & 0x80) { // count number of follow up bytes utf8::value_type follow_up_mask = 0xc0; utf8::value_type leading_value_mask = 0x3f; int follow_up_bytes = 0; while ((v8 & follow_up_mask) == follow_up_mask) { if (follow_up_mask == 0xff) { return false; } ++follow_up_bytes; follow_up_mask = (follow_up_mask >> 1) | 0x80; leading_value_mask = leading_value_mask >> 1; } // we should always have follow up bytes since 0x80 is illegal if (!follow_up_bytes) { return false; } utf32::value_type v = v8 & leading_value_mask; while(follow_up_bytes--) { ++i; ++position; if (i == end()) { if (allowIncompleteEndingSequence) { break; } else { return false; } } v8 = *i; if ((v8 & 0xc0) != 0x80) // follow ups must begin with 10xxxxxx { return false; } v = (v << 6) | (v8 & 0x3f); } } } return true; } // convert utf8 to utf32 /* u32 - utf32 string to set ibegin,iend - template iterators for the beginning and end of the UTF-8 bitstream allowIncompleteEndingSequence - if true then we just ignore missing values at the very end of the bitstream. Otherwise we throw an exception. */ template static void Utf8ToUtf32(utf32 &u32, ITER ibegin, ITER iend, bool allowIncompleteEndingSequence) throw(badUnicodeData) { // use temp so an exception leaves this string in tact utf32 newValue; int position = 0; for (ITER i = ibegin; i != iend; ++i,++position) { utf8::value_type v8 = (*i); if (!(v8 & 0x80)) { newValue.push_back(v8); } else { // count number of follow up bytes utf8::value_type follow_up_mask = 0xc0; utf8::value_type leading_value_mask = 0x3f; int follow_up_bytes = 0; while ((v8 & follow_up_mask) == follow_up_mask) { if (follow_up_mask == 0xff) { throwBadUTF8Code(v8, position); } ++follow_up_bytes; follow_up_mask = (follow_up_mask >> 1) | 0x80; leading_value_mask = leading_value_mask >> 1; } // we should always have follow up bytes since 0x80 is illegal if (!follow_up_bytes) { throwBadUTF8Code(v8, position); } utf32::value_type v = v8 & leading_value_mask; while (follow_up_bytes--) { ++i; ++position; if (i == iend) { if (allowIncompleteEndingSequence) { break; } else { throw badUnicodeData("Bad UTF-8 data. Ending sequence is incomplete"); } } v8 = *i; if ((v8 & 0xc0) != 0x80) // follow ups must begin with 10xxxxxx { throwBadUTF8Code(v8, position); } v = (v << 6) | (v8 & 0x3f); } if (v != Utf16BOM) { newValue.push_back(v); } } } u32.clear(); u32 = newValue; } ///////////////////// byte swap stuff for UTF-16 ////////////////////////////////////////////////////// // endian swap of 16 bit value static inline utf16::value_type byteSwap(utf16::value_type nValue) throw() { return (((nValue>> 8)) | (nValue << 8)); } #if __BYTE_ORDER == __LITTLE_ENDIAN // push a value into a UTF-16 encoding string based on the endian of the value and what // the machine natively stores. On entry "v" is in the native format of the machine since // it was just converted from the 32 bit code point. template static void Utf16EndianPush(T &u16, utf16::value_type v, bool littleEndian) throw() { if (littleEndian) { u16.push_back(v); } else { u16.push_back(byteSwap(v)); } } // convert a UTF-16 value to machine native value static inline utf16::value_type unswap(utf16::value_type v, bool littleEndianData) throw() { return (littleEndianData ? v : byteSwap(v)); } #else template static void Utf16EndianPush(T &u16, utf16::value_type v, bool littleEndian) throw() { if (!littleEndian) { u16.push_back(v); } else { u16.push_back(byteSwap(v)); } } static inline utf16::value_type unswap(utf16::value_type v,bool littleEndianData) throw() { return (!littleEndianData ? v : byteSwap(v)); } #endif /////////////////////////////////// // assign UTF-32 from UTF-16 encoding /* u32 - utf32 string to set ibegin,iend - iterators for UTF-16 encoding source assumeLittleEndian - assume the UTF-16 encoding is little endian unless a BOM is detected allowIncompleteEndingSequence - if true ignore final code point if UTF-16 sequence is incomplete, otherwise throw an exception */ template static void Utf16ToUtf32(utf32 &u32, ITER ibegin, ITER iend, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { bool littleEndianData = assumeLittleEndian; utf32 newValue; int position = 0; for (ITER i = ibegin; i != iend; ++i,++position) { utf16::value_type w1 = (utf16::value_type)(*i); // yes, use utf16 value even for wstring since we know it's good if (w1 == 0xfeff) { littleEndianData = leSystem; continue; } else if (w1 == 0xfffe) { littleEndianData = !leSystem; continue; } w1 = unswap(w1,littleEndianData); if (w1 < 0xd800 || w1 > 0xdfff) { newValue.push_back(w1); } else if (w1 > 0xdbff) { throwBadUTF16Code(w1,position); } else { ++i; ++position; if (i == iend) { if (allowIncompleteEndingSequence) { break; } else { throw badUnicodeData("Bad UTF-16 data. Ending sequence is incomplete"); } } utf16::value_type w2 = (*i); w2 = unswap(w2, littleEndianData); if (w2 < 0xdc00 || w2 > 0xdfff) { throwBadUTF16Code(w2, position); } utf32::value_type v1 = w1 & 0x03ff; utf32::value_type v2 = w2 & 0x03ff; newValue.push_back((v1 << 10) | v2); } } u32.clear(); u32 = newValue; } template static void Utf32CodeToUtf8(utf32::value_type v, U8 &u8) throw() { if (v < 0x00000080) { // only allow \t, \r, \n if in 0-31 ranage // otherwise expat in the DNAS will refuse if (v <= 31) { if (v == 9 || v == 10 || v == 13) { u8.push_back((utf8::value_type)v); } } else { u8.push_back((utf8::value_type)v); } } else { utf32::value_type maxTopValue = 0x0000003f; utf8::value_type topValueBitPattern = 0x80; vector buf; // filter out the extended control characters just incase if (v >= 0x80 && v <= 0x9F) { u8.push_back((utf8::value_type)0x3F); return; } // and also filter this so we don't insert BOMs else if (v == 0xFFFE || v == 0xFFFF) { return; } while (v > maxTopValue) { buf.push_back(0x00000080 | (v & 0x0000003f)); v = v >> 6; maxTopValue = maxTopValue >> 1; topValueBitPattern = ((topValueBitPattern >> 1) | 0x80); } buf.push_back(topValueBitPattern | v); u8.insert(u8.end(), buf.rbegin(), buf.rend()); } } template static void Utf32ToUtf8(const utf32 &u32, U8 &u8, bool leadingBOM) throw() { u8.clear(); if (leadingBOM) { // we rarely want a BOM in utf-8. But I've run into template bugs where // the utf8 constructor is accidentally getting called with true for the BOM // this compile time flag checks for that Utf32CodeToUtf8(Utf16BOM,u8); } for (utf32::const_iterator i = u32.begin(); i != u32.end(); ++i) { Utf32CodeToUtf8(*i, u8); } } // create UTF-16 from unicode (according to rfc2781) template static void Utf32ToUtf16(const utf32 &u32, U16 &u16, bool leadingBOM, bool littleEndian) throw() { u16.clear(); if (leadingBOM) { Utf16EndianPush(u16, Utf16BOM, littleEndian); } for (utf32::const_iterator i = u32.begin(); i != u32.end(); ++i) { utf32::value_type v = *i; if (v < 0x00010000) { Utf16EndianPush(u16, v, littleEndian); } else { utf32::value_type vp = v - 0x00010000; utf16::value_type w1 = 0xd800; utf16::value_type w2 = 0xdc00; w1 = w1 | ((vp & 0x000ffc00) >> 10); w2 = w2 | (vp & 0x000003ff); Utf16EndianPush(u16,w1,littleEndian); Utf16EndianPush(u16,w2,littleEndian); } } } utf32::utf32(const __int8 *s, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s, s + strlen((const char *)s), allowIncompleteEndingSequence); } utf32::utf32(const __int8 *s, size_t len, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s, s + len, allowIncompleteEndingSequence); } utf32::utf32(const std::string &s, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s.begin(), s.end(), allowIncompleteEndingSequence); } utf32::utf32(const utf8 &u8, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8.begin(), u8.end(), allowIncompleteEndingSequence); } utf32::utf32(const utf8::value_type *u8, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8, u8 + strlen((const char *)u8), allowIncompleteEndingSequence); } utf32::utf32(const utf8::value_type *u8,size_t len, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8, u8 + len, allowIncompleteEndingSequence); } utf32::utf32(const utf16 &u16,bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16.begin(), u16.end(), assumeLittleEndian, allowIncompleteEndingSequence); } utf32::utf32(const utf16::value_type *u16, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { utf16::size_type len = 0; const utf16::value_type *tmp = u16; if (tmp && *tmp) { while(*(tmp++)) { ++len; } } Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } utf32::utf32(const utf16::value_type *u16, size_t len, bool assumeLittleEndian ,bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } #ifdef _WIN32 utf32::utf32(const std::wstring &w, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, w.begin(), w.end(), assumeLittleEndian, allowIncompleteEndingSequence); } utf32::utf32(const wchar_t *u16, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { size_t len = 0; const wchar_t *tmp = u16; if (tmp && *tmp) { while(*(tmp++)) { ++len; } } Utf16ToUtf32(*this,u16,u16+len,assumeLittleEndian,allowIncompleteEndingSequence); } utf32::utf32(const wchar_t *u16, size_t len, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } #endif void utf32::assign(const __int8 *s, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s, s + strlen((const char *)s), allowIncompleteEndingSequence); } void utf32::assign(const __int8 *s, size_t len, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s, s + len, allowIncompleteEndingSequence); } void utf32::assign(const std::string &s, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, s.begin(), s.end(), allowIncompleteEndingSequence); } void utf32::assign(const utf8 &u8, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8.begin(), u8.end(), allowIncompleteEndingSequence); } void utf32::assign(const utf8::value_type *u8, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8, u8 + strlen((const char *)u8), allowIncompleteEndingSequence); } void utf32::assignAsHighBitANSI(const utf8::value_type *u8) throw() { if (u8) { while (*u8) { push_back(*u8); ++u8; } } } void utf32::assign(const utf8::value_type *u8, size_t len, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf8ToUtf32(*this, u8, u8 + len, allowIncompleteEndingSequence); } void utf32::assign(const utf16 &u16, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16.begin(), u16.end(), assumeLittleEndian, allowIncompleteEndingSequence); } void utf32::assign(const utf16::value_type *u16, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { utf16::size_type len = 0; const utf16::value_type *tmp = u16; if (tmp && *tmp) { while(*(tmp++)) { ++len; } } Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } void utf32::assign(const utf16::value_type *u16, size_t len, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } #ifdef _WIN32 void utf32::assign(const std::wstring &w, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, w.begin(), w.end(), assumeLittleEndian, allowIncompleteEndingSequence); } void utf32::assign(const wchar_t *u16, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { size_t len = 0; const wchar_t *tmp = u16; if (tmp && *tmp) { while(*(tmp++)) { ++len; } } Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } void utf32::assign(const wchar_t *u16, size_t len, bool assumeLittleEndian, bool allowIncompleteEndingSequence) throw(badUnicodeData) { Utf16ToUtf32(*this, u16, u16 + len, assumeLittleEndian, allowIncompleteEndingSequence); } #endif void utf32::assignFromLatinExtended(const std::string &s) throw() { clear(); for (string::const_iterator i = s.begin(); i != s.end(); ++i) { push_back((utf32::value_type)(*i)); } } void utf32::assignFromLatinExtended(const __uint8 *s) throw() { clear(); if (s) { while (*s) { push_back(*(s++)); } } } void utf32::assignFromLatinExtended(const __uint8 *s, size_t len) throw() { clear(); if (s && len > 0) { while (len--) { push_back(*(s++)); } } } utf8 utf32::toUtf8(bool leadingBOM) const throw() { utf8 u8; Utf32ToUtf8(*this, u8, leadingBOM); return u8; } void utf32::toUtf8(utf8 &u8, bool leadingBOM) const throw() { Utf32ToUtf8(*this, u8, leadingBOM); } void utf32::toUtf8(std::string &s, bool leadingBOM) const throw() { Utf32ToUtf8(*this, s, leadingBOM); } utf16 utf32::toUtf16(bool leadingBOM, bool littleEndian) const throw() { utf16 u16; Utf32ToUtf16(*this, u16, leadingBOM, littleEndian); return u16; } void utf32::toUtf16(utf16 &u16, bool leadingBOM, bool littleEndian) const throw() { Utf32ToUtf16(*this, u16, leadingBOM, littleEndian); } #ifdef _WIN32 void utf32::toUtf16(std::wstring &w, bool leadingBOM, bool littleEndian) const throw() { Utf32ToUtf16(*this, w, leadingBOM, littleEndian); } #endif namespace uniString { template class xmlEscapes: public map { public: xmlEscapes() { static const typename T::value_type lessthan[] = {(typename T::value_type)'&',(typename T::value_type)'l',(typename T::value_type)'t',(typename T::value_type)';',(typename T::value_type)0}; static const typename T::value_type greaterthan[] = {(typename T::value_type)'&',(typename T::value_type)'g',(typename T::value_type)'t',(typename T::value_type)';',(typename T::value_type)0}; static const typename T::value_type ampersand[] = {(typename T::value_type)'&',(typename T::value_type)'a',(typename T::value_type)'m',(typename T::value_type)'p',(typename T::value_type)';',(typename T::value_type)0}; static const typename T::value_type apostrophe[] = {(typename T::value_type)'&',(typename T::value_type)'a',(typename T::value_type)'p',(typename T::value_type)'o',(typename T::value_type)'s',(typename T::value_type)';',(typename T::value_type)0}; static const typename T::value_type quote[] = {(typename T::value_type)'&',(typename T::value_type)'q',(typename T::value_type)'u',(typename T::value_type)'o',(typename T::value_type)'t',(typename T::value_type)';',(typename T::value_type)0}; (*this)['<'] = T(lessthan); (*this)['>'] = T(greaterthan); (*this)['&'] = T(ampersand); (*this)['\''] = T(apostrophe); (*this)['"'] = T(quote); } }; } static const uniString::xmlEscapes gUtf32XmlEscapes; static const uniString::xmlEscapes gUtf16XmlEscapes; static const uniString::xmlEscapes gUtf8XmlEscapes; template static T xml_escape(const T &t, const uniString::xmlEscapes &m) throw() { T result; for (typename T::const_iterator i = t.begin(); i != t.end(); ++i) { typename uniString::xmlEscapes::const_iterator e = m.find(*i); if (e != m.end()) { result.insert(result.end(),(*e).second.begin(),(*e).second.end()); } else { result.push_back(*i); } } return result; } utf32 utf32::escapeXML() const throw() { return xml_escape(*this, gUtf32XmlEscapes); } utf16 utf16::escapeXML() const throw() { return xml_escape(*this, gUtf16XmlEscapes); } utf8 utf8::escapeXML() const throw() { return xml_escape(*this, gUtf8XmlEscapes); } /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// void utf8::assign(const utf32 &u32, bool leadingBOM) throw() { u32.toUtf8(*this, leadingBOM); } void utf16::assign(const utf32 &u32, bool leadingBOM, bool littleEndian) throw() { u32.toUtf16(*this, leadingBOM, littleEndian); } //////////////////////////////////////////////////////////////////////////////////////////// string utf8::toANSI(bool allowHighBitCodePoints) const throw() { string result; bool utf32MethodWorked = false; try { // convert to utf32 so we can easily remove code points const utf32 u32(*this); if (allowHighBitCodePoints) { for (utf32::const_iterator i = u32.begin(); i != u32.end(); ++i) { result.push_back((char)*i); } } else { for (utf32::const_iterator i = u32.begin(); i != u32.end(); ++i) { if ((*i) <= 0x7f) { result.push_back((char)*i); } else { result.push_back((char)'?'); } } } utf32MethodWorked = true; } catch(...) { } // if the string actually has high bit ANSI values (for instance, from a badly // formed playlist), we should still do something sensible. if (!utf32MethodWorked) { result.clear(); if (allowHighBitCodePoints) { for (utf8::const_iterator i = begin(); i != end(); ++i) { result.push_back((char)*i); } } else { for (utf8::const_iterator i = begin(); i != end(); ++i) { if ((*i) <= 0x7f) { result.push_back((char)*i); } else { result.push_back((char)'?'); } } } } return result; } #ifdef _WIN32 wstring utf8::toWString() const throw(badUnicodeData) { utf32 u32(*this); wstring result; u32.toUtf16(result); return result; } #endif int utf8::toInt() const throw() { return ::atoi((*this).hideAsString().c_str()); }