diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index e3320951..97309913 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -24,52 +24,81 @@ ***************************************************************************/ #include "tstring.h" -#include "unicode.h" #include "tdebug.h" #include "tstringlist.h" -#include - #include -namespace TagLib { +// Determine if the compiler supports codecvt. + +#if (defined(_MSC_VER) && _MSC_VER >= 1600) // VC++2010 or later +# define TAGLIB_USE_CODECVT +#endif + +#ifdef TAGLIB_USE_CODECVT +# include + typedef std::codecvt_utf8_utf16 utf8_utf16_t; +#else +# include "unicode.h" +#endif + +using namespace TagLib; + +namespace { inline unsigned short byteSwap(unsigned short x) { +#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later + + return _byteswap_ushort(x); + +#else + return (((x) >> 8) & 0xff) | (((x) & 0xff) << 8); + +#endif } inline unsigned short combine(unsigned char c1, unsigned char c2) { return (c1 << 8) | c2; } + + String::Type wcharByteOrder() + { + // Detect CPU endian. + union { + TagLib::ushort w; + char c; + } x = { 0x1234 }; + + if(x.c == 0x34) + return String::UTF16LE; + else + return String::UTF16BE; + } } -using namespace TagLib; class String::StringPrivate : public RefCounter { public: StringPrivate(const wstring &s) : RefCounter(), - data(s), - CString(0) {} + data(s) {} StringPrivate() : - RefCounter(), - CString(0) {} - - ~StringPrivate() { - delete [] CString; - } - - wstring data; + RefCounter() {} /*! - * This is only used to hold the a pointer to the most recent value of - * toCString. + * Stores string in UTF-16. The byte order depends on the CPU endian. */ - char *CString; + TagLib::wstring data; + + /*! + * This is only used to hold the the most recent value of toCString(). + */ + std::string cstring; }; String String::null; @@ -90,74 +119,72 @@ String::String(const std::string &s, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { + if(t == Latin1) + copyFromLatin1(&s[0], s.length()); + else if(t == String::UTF8) + copyFromUTF8(&s[0], s.length()); + else { debug("String::String() -- A std::string should not contain UTF16."); - return; } - - d->data.resize(s.length()); - wstring::iterator targetIt = d->data.begin(); - - for(std::string::const_iterator it = s.begin(); it != s.end(); it++) { - *targetIt = uchar(*it); - ++targetIt; - } - - prepare(t); } String::String(const wstring &s, Type t) { - d = new StringPrivate(s); - prepare(t); + d = new StringPrivate; + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(s.c_str(), s.length(), t); + else { + debug("String::String() -- A TagLib::wstring should not contain Latin1 or UTF-8."); + } } String::String(const wchar_t *s, Type t) { - d = new StringPrivate(s); - prepare(t); + d = new StringPrivate; + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(s, ::wcslen(s), t); + else { + debug("String::String() -- A const wchar_t * should not contain Latin1 or UTF-8."); + } } String::String(const char *s, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { + if(t == Latin1) + copyFromLatin1(s, ::strlen(s)); + else if(t == String::UTF8) + copyFromUTF8(s, ::strlen(s)); + else { debug("String::String() -- A const char * should not contain UTF16."); - return; } - - const size_t length = ::strlen(s); - d->data.resize(length); - - wstring::iterator targetIt = d->data.begin(); - - for(size_t i = 0; i < length; i++) { - *targetIt = uchar(s[i]); - ++targetIt; - } - - prepare(t); } String::String(wchar_t c, Type t) { d = new StringPrivate; - d->data += c; - prepare(t); + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(&c, 1, t); + else { + debug("String::String() -- A const wchar_t should not contain Latin1 or UTF-8."); + } } String::String(char c, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { - debug("String::String() -- A std::string should not contain UTF16."); - return; + if(t == Latin1 || t == UTF8) { + d->data.resize(1); + d->data[0] = static_cast(c); + } + else { + debug("String::String() -- A char should not contain UTF16."); } - - d->data += uchar(c); - prepare(t); } String::String(const ByteVector &v, Type t) @@ -167,31 +194,12 @@ String::String(const ByteVector &v, Type t) if(v.isEmpty()) return; - if(t == Latin1 || t == UTF8) { - - int length = 0; - d->data.resize(v.size()); - wstring::iterator targetIt = d->data.begin(); - for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) { - *targetIt = uchar(*it); - ++targetIt; - ++length; - } - d->data.resize(length); - } - else { - d->data.resize(v.size() / 2); - wstring::iterator targetIt = d->data.begin(); - - for(ByteVector::ConstIterator it = v.begin(); - it != v.end() && it + 1 != v.end() && combine(*it, *(it + 1)); - it += 2) - { - *targetIt = combine(*it, *(it + 1)); - ++targetIt; - } - } - prepare(t); + if(t == Latin1) + copyFromLatin1(v.data(), v.size()); + else if(t == UTF8) + copyFromUTF8(v.data(), v.size()); + else + copyFromUTF16(v.data(), v.size(), t); } //////////////////////////////////////////////////////////////////////////////// @@ -205,72 +213,62 @@ String::~String() std::string String::to8Bit(bool unicode) const { std::string s; - s.resize(d->data.size()); if(!unicode) { + s.resize(d->data.size()); + std::string::iterator targetIt = s.begin(); for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) { - *targetIt = char(*it); + *targetIt = static_cast(*it); ++targetIt; } - return s; } + else { + s.resize(d->data.size() * 4 + 1); - const size_t outputBufferSize = d->data.size() * 3 + 1; +#ifdef TAGLIB_USE_CODECVT - Unicode::UTF16 *sourceBuffer = new Unicode::UTF16[d->data.size() + 1]; - Unicode::UTF8 *targetBuffer = new Unicode::UTF8[outputBufferSize]; + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + std::codecvt_base::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target); - for(size_t i = 0; i < d->data.size(); i++) - sourceBuffer[i] = Unicode::UTF16(d->data[i]); + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } - const Unicode::UTF16 *source = sourceBuffer; - Unicode::UTF8 *target = targetBuffer; +#else - Unicode::ConversionResult result = - Unicode::ConvertUTF16toUTF8(&source, sourceBuffer + d->data.size(), - &target, targetBuffer + outputBufferSize, - Unicode::lenientConversion); + const Unicode::UTF16 *source = &d->data[0]; + Unicode::UTF8 *target = reinterpret_cast(&s[0]); - if(result != Unicode::conversionOK) { - debug("String::to8Bit() - Unicode conversion error."); + Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( + &source, source + d->data.size(), + &target, target + s.size(), + Unicode::lenientConversion); + + if(result != Unicode::conversionOK) { + debug("String::to8Bit() - Unicode conversion error."); + } + +#endif + + s.resize(::strlen(s.c_str())); } - const size_t newSize = target - targetBuffer; - s.resize(newSize); - targetBuffer[newSize] = 0; - - s = (char *) targetBuffer; - - delete [] sourceBuffer; - delete [] targetBuffer; - return s; } -TagLib::wstring String::toWString() const +const TagLib::wstring &String::toWString() const { return d->data; } const char *String::toCString(bool unicode) const { - delete [] d->CString; - - std::string buffer = to8Bit(unicode); - d->CString = new char[buffer.size() + 1]; - -#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later - - strcpy_s(d->CString, buffer.size() + 1, buffer.c_str()); - -#else - - strcpy(d->CString, buffer.c_str()); - -#endif - - return d->CString; + d->cstring = to8Bit(unicode); + return d->cstring.c_str(); } String::Iterator String::begin() @@ -359,9 +357,10 @@ String &String::append(const String &s) String String::upper() const { - String s; + static const int shift = 'A' - 'a'; - static int shift = 'A' - 'a'; + String s; + s.d->data.reserve(d->data.size()); for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); ++it) { if(*it >= 'a' && *it <= 'z') @@ -407,8 +406,38 @@ ByteVector String::data(Type t) const } case UTF8: { - std::string s = to8Bit(true); - v.setData(s.c_str(), static_cast(s.length())); + v.resize(d->data.size() * 4 + 1); + +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + std::codecvt_base::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + + const Unicode::UTF16 *source = &d->data[0]; + Unicode::UTF8 *target = reinterpret_cast(v.data()); + + Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( + &source, source + d->data.size(), + &target, target + v.size(), + Unicode::lenientConversion); + + if(result != Unicode::conversionOK) { + debug("String::to8Bit() - Unicode conversion error."); + } + +#endif + + v.resize(::strlen(v.data())); + break; } case UTF16: @@ -552,21 +581,38 @@ String String::number(int n) // static return s; } -TagLib::wchar &String::operator[](int i) +TagLib::wchar &String::operator[](size_t i) { detach(); return d->data[i]; } -const TagLib::wchar &String::operator[](int i) const +const TagLib::wchar &String::operator[](size_t i) const { return d->data[i]; } bool String::operator==(const String &s) const { - return d == s.d || d->data == s.d->data; + return (d == s.d || d->data == s.d->data); +} + +bool String::operator==(const char *s) const +{ + for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) { + if(*it != static_cast(*s)) + return false; + + s++; + } + + return true; +} + +bool String::operator==(const wchar_t *s) const +{ + return (d->data == s); } bool String::operator!=(const String &s) const @@ -633,14 +679,7 @@ String &String::operator=(const std::string &s) delete d; d = new StringPrivate; - - d->data.resize(s.size()); - - wstring::iterator targetIt = d->data.begin(); - for(std::string::const_iterator it = s.begin(); it != s.end(); it++) { - *targetIt = uchar(*it); - ++targetIt; - } + copyFromLatin1(s.c_str(), s.length()); return *this; } @@ -649,7 +688,9 @@ String &String::operator=(const wstring &s) { if(d->deref()) delete d; + d = new StringPrivate(s); + return *this; } @@ -657,7 +698,10 @@ String &String::operator=(const wchar_t *s) { if(d->deref()) delete d; - d = new StringPrivate(s); + + d = new StringPrivate; + copyFromUTF16(s, ::wcslen(s), WCharByteOrder); + return *this; } @@ -665,8 +709,11 @@ String &String::operator=(char c) { if(d->deref()) delete d; + d = new StringPrivate; - d->data += uchar(c); + d->data.resize(1); + d->data[0] = static_cast(c); + return *this; } @@ -674,8 +721,11 @@ String &String::operator=(wchar_t c) { if(d->deref()) delete d; + d = new StringPrivate; - d->data += c; + d->data.resize(1); + d->data[0] = c; + return *this; } @@ -685,15 +735,7 @@ String &String::operator=(const char *s) delete d; d = new StringPrivate; - - const size_t length = ::strlen(s); - d->data.resize(length); - - wstring::iterator targetIt = d->data.begin(); - for(size_t i = 0; i < length; i++) { - *targetIt = uchar(s[i]); - ++targetIt; - } + copyFromLatin1(s, ::strlen(s)); return *this; } @@ -704,20 +746,10 @@ String &String::operator=(const ByteVector &v) delete d; d = new StringPrivate; - d->data.resize(v.size()); - wstring::iterator targetIt = d->data.begin(); - - uint i = 0; - - for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) { - *targetIt = uchar(*it); - ++targetIt; - ++i; - } + copyFromLatin1(v.data(), v.size()); // If we hit a null in the ByteVector, shrink the string again. - - d->data.resize(i); + d->data.resize(::wcslen(d->data.c_str())); return *this; } @@ -743,70 +775,111 @@ void String::detach() // private members //////////////////////////////////////////////////////////////////////////////// -void String::prepare(Type t) +void String::copyFromLatin1(const char *s, size_t length) { - switch(t) { - case UTF16: - { - if(d->data.size() >= 1 && (d->data[0] == 0xfeff || d->data[0] == 0xfffe)) { - bool swap = d->data[0] != 0xfeff; - d->data.erase(d->data.begin(), d->data.begin() + 1); - if(swap) { - for(uint i = 0; i < d->data.size(); i++) - d->data[i] = byteSwap((unsigned short)d->data[i]); - } - } + d->data.resize(length); + + for(size_t i = 0; i < length; ++i) + d->data[i] = static_cast(s[i]); +} + +void String::copyFromUTF8(const char *s, size_t length) +{ + d->data.resize(length); + +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const char *source; + wchar_t *target; + std::codecvt_base::result result = utf8_utf16_t().in( + st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + + const Unicode::UTF8 *source = reinterpret_cast(s); + Unicode::UTF16 *target = &d->data[0]; + + Unicode::ConversionResult result = Unicode::ConvertUTF8toUTF16( + &source, source + length, + &target, target + length, + Unicode::lenientConversion); + + if(result != Unicode::conversionOK) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#endif + + d->data.resize(::wcslen(d->data.c_str())); +} + +void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) +{ + bool swap; + if(t == UTF16) { + if(length >= 1 && s[0] == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(length >= 1 && s[0] == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. else { - debug("String::prepare() - Invalid UTF16 string."); - d->data.erase(d->data.begin(), d->data.end()); - } - break; - } - case UTF8: - { - const size_t bufferSize = d->data.size() + 1; - Unicode::UTF8 *sourceBuffer = new Unicode::UTF8[bufferSize]; - Unicode::UTF16 *targetBuffer = new Unicode::UTF16[bufferSize]; - - unsigned int i = 0; - for(; i < d->data.size(); i++) - sourceBuffer[i] = Unicode::UTF8(d->data[i]); - sourceBuffer[i] = 0; - - const Unicode::UTF8 *source = sourceBuffer; - Unicode::UTF16 *target = targetBuffer; - - Unicode::ConversionResult result = - Unicode::ConvertUTF8toUTF16(&source, sourceBuffer + bufferSize, - &target, targetBuffer + bufferSize, - Unicode::lenientConversion); - - if(result != Unicode::conversionOK) { - debug("String::prepare() - Unicode conversion error."); + debug("String::copyFromUTF16() - Invalid UTF16 string."); + return; } - const size_t newSize = target != targetBuffer ? target - targetBuffer - 1 : 0; - d->data.resize(newSize); - - for(size_t i = 0; i < newSize; i++) - d->data[i] = targetBuffer[i]; - - delete [] sourceBuffer; - delete [] targetBuffer; - - break; + s++; + length--; } - case UTF16LE: - { - for(uint i = 0; i < d->data.size(); i++) - d->data[i] = byteSwap((unsigned short)d->data[i]); - break; - } - default: - break; + else + swap = (t != WCharByteOrder); + + d->data.resize(length); + memcpy(&d->data[0], s, length * sizeof(wchar_t)); + + if(swap) { + for(size_t i = 0; i < length; ++i) + d->data[i] = byteSwap(static_cast(s[i])); } } +void String::copyFromUTF16(const char *s, size_t length, Type t) +{ + if(sizeof(wchar_t) == 2) + copyFromUTF16(reinterpret_cast(s), length / 2, t); + else + { + bool swap; + if(t == UTF16) { + if(length >= 2 && *reinterpret_cast(s) == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(length >= 2 && *reinterpret_cast(s) == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. + else { + debug("String::copyFromUTF16() - Invalid UTF16 string."); + return; + } + + s += 2; + length -= 2; + } + else + swap = (t != WCharByteOrder); + + d->data.resize(length / 2); + for(size_t i = 0; i < length / 2; ++i) { + d->data[i] = swap ? combine(*s, *(s + 1)) : combine(*(s + 1), *s); + s += 2; + } + } +} + +String::Type String::WCharByteOrder = wcharByteOrder(); + + //////////////////////////////////////////////////////////////////////////////// // related functions //////////////////////////////////////////////////////////////////////////////// diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h index a1ba164c..c502e56c 100644 --- a/taglib/toolkit/tstring.h +++ b/taglib/toolkit/tstring.h @@ -98,8 +98,7 @@ namespace TagLib { */ UTF16 = 1, /*! - * UTF16 big endian. 16 bit characters. This is the encoding used - * internally by TagLib. + * UTF16 big endian. 16 bit characters. */ UTF16BE = 2, /*! @@ -135,12 +134,12 @@ namespace TagLib { /*! * Makes a deep copy of the data in \a s. */ - String(const wstring &s, Type t = UTF16BE); + String(const wstring &s, Type t = WCharByteOrder); /*! * Makes a deep copy of the data in \a s. */ - String(const wchar_t *s, Type t = UTF16BE); + String(const wchar_t *s, Type t = WCharByteOrder); /*! * Makes a deep copy of the data in \a c. @@ -187,7 +186,7 @@ namespace TagLib { /*! * Returns a wstring version of the TagLib string as a wide string. */ - wstring toWString() const; + const TagLib::wstring &toWString() const; /*! * Creates and returns a C-String based on the data. This string is still @@ -335,18 +334,30 @@ namespace TagLib { /*! * Returns a reference to the character at position \a i. */ - wchar &operator[](int i); + wchar &operator[](size_t i); /*! * Returns a const reference to the character at position \a i. */ - const wchar &operator[](int i) const; + const wchar &operator[](size_t i) const; + + /*! + * Compares each character of the String with each character in \a s and + * returns true if the strings match. + */ + bool operator==(const String &s) const; + + /*! + * Compares each character of the String with each character in \a s and + * returns true if the strings match. + */ + bool operator==(const char *s) const; /*! * Compares each character of the String with each character of \a s and * returns true if the strings match. */ - bool operator==(const String &s) const; + bool operator==(const wchar_t *s) const; /*! * Compares each character of the String with each character of \a s and @@ -442,12 +453,35 @@ namespace TagLib { private: /*! - * This checks to see if the string is in \e UTF-16 (with BOM) or \e UTF-8 - * format and if so converts it to \e UTF-16BE for internal use. \e Latin1 - * does not require conversion since it is a subset of \e UTF-16BE and - * \e UTF16-BE requires no conversion since it is used internally. + * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order) + * and copies it to the internal buffer. */ - void prepare(Type t); + void copyFromLatin1(const char *s, size_t length); + + /*! + * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order) + * and copies it to the internal buffer. + */ + void copyFromUTF8(const char *s, size_t length); + + /*! + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. + */ + void copyFromUTF16(const wchar_t *s, size_t length, Type t); + + /*! + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. + */ + void copyFromUTF16(const char *s, size_t length, Type t); + + /*! + * Indicates which byte order of UTF-16 is used to store strings internally. + * + * \note Set to \e UTF16BE or \e UTF16LE at run time. + */ + static Type WCharByteOrder; class StringPrivate; StringPrivate *d; diff --git a/taglib/toolkit/unicode.h b/taglib/toolkit/unicode.h index cf7eb3c5..b9de0ea2 100644 --- a/taglib/toolkit/unicode.h +++ b/taglib/toolkit/unicode.h @@ -115,8 +115,8 @@ namespace Unicode { typedef unsigned long UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ +typedef wchar_t UTF16; /* TagLib assumes that wchar_t is sufficient for UTF-16. */ +typedef unsigned char UTF8; /* typically 8 bits */ typedef unsigned char Boolean; /* 0 or 1 */ typedef enum { diff --git a/tests/test_string.cpp b/tests/test_string.cpp index 1e37d7a2..c67fa41e 100644 --- a/tests/test_string.cpp +++ b/tests/test_string.cpp @@ -116,7 +116,7 @@ public: CPPUNIT_ASSERT_EQUAL(a, String(d, String::UTF16)); } - // this test is expected to print "TagLib: String::prepare() - + // this test is expected to print "TagLib: String::copyFromUTF16() - // Invalid UTF16 string." on the console 3 times void testUTF16DecodeInvalidBOM() {