From 9b19453059c3e2da9b79ab2eb56ce6ea7a3d65ab Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Sun, 17 Mar 2013 12:51:00 +0900 Subject: [PATCH 1/9] Some improvements of String --- taglib/toolkit/tstring.cpp | 390 +++++++++++++++++-------------------- taglib/toolkit/tstring.h | 46 +++-- taglib/toolkit/unicode.h | 4 +- 3 files changed, 215 insertions(+), 225 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index e3320951..02059fff 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -29,47 +29,65 @@ #include "tstringlist.h" #include - #include -namespace TagLib { +using namespace TagLib; + +namespace { inline unsigned short byteSwap(unsigned short x) { +#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later + + return _byteswap_ushort(x); + +#else + return (((x) >> 8) & 0xff) | (((x) & 0xff) << 8); + +#endif } inline unsigned short combine(unsigned char c1, unsigned char c2) { return (c1 << 8) | c2; } + + String::Type wcharByteOrder() + { + // Detect CPU endian. + union { + wchar_t w; + char c[2]; + } x = { 0xfeff }; + + if(x.c[0] == 0xfe) + return String::UTF16BE; + else + return String::UTF16LE; + } } -using namespace TagLib; class String::StringPrivate : public RefCounter { public: StringPrivate(const wstring &s) : RefCounter(), - data(s), - CString(0) {} + data(s) {} StringPrivate() : - RefCounter(), - CString(0) {} - - ~StringPrivate() { - delete [] CString; - } - - wstring data; + RefCounter() {} /*! - * This is only used to hold the a pointer to the most recent value of - * toCString. + * Stores string in UTF-16. The byte order depends on the CPU endian. */ - char *CString; + TagLib::wstring data; + + /*! + * This is only used to hold the the most recent value of toCString(). + */ + std::string cstring; }; String String::null; @@ -90,74 +108,72 @@ String::String(const std::string &s, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { + if(t == Latin1) + copyFromLatin1(&s[0], s.length()); + else if(t == String::UTF8) + copyFromUTF8(&s[0], s.length()); + else { debug("String::String() -- A std::string should not contain UTF16."); - return; } - - d->data.resize(s.length()); - wstring::iterator targetIt = d->data.begin(); - - for(std::string::const_iterator it = s.begin(); it != s.end(); it++) { - *targetIt = uchar(*it); - ++targetIt; - } - - prepare(t); } String::String(const wstring &s, Type t) { - d = new StringPrivate(s); - prepare(t); + d = new StringPrivate; + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(s.c_str(), s.length(), t); + else { + debug("String::String() -- A TagLib::wstring should not contain Latin1 or UTF-8."); + } } String::String(const wchar_t *s, Type t) { - d = new StringPrivate(s); - prepare(t); + d = new StringPrivate; + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(s, ::wcslen(s), t); + else { + debug("String::String() -- A const wchar_t * should not contain Latin1 or UTF-8."); + } } String::String(const char *s, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { + if(t == Latin1) + copyFromLatin1(s, ::strlen(s)); + else if(t == String::UTF8) + copyFromUTF8(s, ::strlen(s)); + else { debug("String::String() -- A const char * should not contain UTF16."); - return; } - - const size_t length = ::strlen(s); - d->data.resize(length); - - wstring::iterator targetIt = d->data.begin(); - - for(size_t i = 0; i < length; i++) { - *targetIt = uchar(s[i]); - ++targetIt; - } - - prepare(t); } String::String(wchar_t c, Type t) { d = new StringPrivate; - d->data += c; - prepare(t); + + if(t == UTF16 || t == UTF16BE || t == UTF16LE) + copyFromUTF16(&c, 1, t); + else { + debug("String::String() -- A const wchar_t should not contain Latin1 or UTF-8."); + } } String::String(char c, Type t) { d = new StringPrivate; - if(t == UTF16 || t == UTF16BE || t == UTF16LE) { - debug("String::String() -- A std::string should not contain UTF16."); - return; + if(t == Latin1 || t == UTF8) { + d->data.resize(1); + d->data[0] = static_cast(c); + } + else { + debug("String::String() -- A char should not contain UTF16."); } - - d->data += uchar(c); - prepare(t); } String::String(const ByteVector &v, Type t) @@ -167,31 +183,12 @@ String::String(const ByteVector &v, Type t) if(v.isEmpty()) return; - if(t == Latin1 || t == UTF8) { - - int length = 0; - d->data.resize(v.size()); - wstring::iterator targetIt = d->data.begin(); - for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) { - *targetIt = uchar(*it); - ++targetIt; - ++length; - } - d->data.resize(length); - } - else { - d->data.resize(v.size() / 2); - wstring::iterator targetIt = d->data.begin(); - - for(ByteVector::ConstIterator it = v.begin(); - it != v.end() && it + 1 != v.end() && combine(*it, *(it + 1)); - it += 2) - { - *targetIt = combine(*it, *(it + 1)); - ++targetIt; - } - } - prepare(t); + if(t == Latin1) + copyFromLatin1(v.data(), v.size()); + else if(t == UTF8) + copyFromUTF8(v.data(), v.size()); + else + copyFromUTF16(v.data(), v.size(), t); } //////////////////////////////////////////////////////////////////////////////// @@ -205,72 +202,46 @@ String::~String() std::string String::to8Bit(bool unicode) const { std::string s; - s.resize(d->data.size()); if(!unicode) { + s.resize(d->data.size()); + std::string::iterator targetIt = s.begin(); for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) { - *targetIt = char(*it); + *targetIt = static_cast(*it); ++targetIt; } - return s; } + else { + s.resize(d->data.size() * 4 + 1); - const size_t outputBufferSize = d->data.size() * 3 + 1; + const Unicode::UTF16 *source = &d->data[0]; + Unicode::UTF8 *target = reinterpret_cast(&s[0]); - Unicode::UTF16 *sourceBuffer = new Unicode::UTF16[d->data.size() + 1]; - Unicode::UTF8 *targetBuffer = new Unicode::UTF8[outputBufferSize]; + Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( + &source, source + d->data.size(), + &target, target + s.size(), + Unicode::lenientConversion); - for(size_t i = 0; i < d->data.size(); i++) - sourceBuffer[i] = Unicode::UTF16(d->data[i]); + if(result != Unicode::conversionOK) { + debug("String::to8Bit() - Unicode conversion error."); + } - const Unicode::UTF16 *source = sourceBuffer; - Unicode::UTF8 *target = targetBuffer; - - Unicode::ConversionResult result = - Unicode::ConvertUTF16toUTF8(&source, sourceBuffer + d->data.size(), - &target, targetBuffer + outputBufferSize, - Unicode::lenientConversion); - - if(result != Unicode::conversionOK) { - debug("String::to8Bit() - Unicode conversion error."); + s.resize(::strlen(s.c_str())); } - const size_t newSize = target - targetBuffer; - s.resize(newSize); - targetBuffer[newSize] = 0; - - s = (char *) targetBuffer; - - delete [] sourceBuffer; - delete [] targetBuffer; - return s; } -TagLib::wstring String::toWString() const +const TagLib::wstring &String::toWString() const { return d->data; } const char *String::toCString(bool unicode) const { - delete [] d->CString; - - std::string buffer = to8Bit(unicode); - d->CString = new char[buffer.size() + 1]; - -#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later - - strcpy_s(d->CString, buffer.size() + 1, buffer.c_str()); - -#else - - strcpy(d->CString, buffer.c_str()); - -#endif - - return d->CString; + d->cstring = to8Bit(unicode); + return d->cstring.c_str(); } String::Iterator String::begin() @@ -552,14 +523,14 @@ String String::number(int n) // static return s; } -TagLib::wchar &String::operator[](int i) +TagLib::wchar &String::operator[](size_t i) { detach(); return d->data[i]; } -const TagLib::wchar &String::operator[](int i) const +const TagLib::wchar &String::operator[](size_t i) const { return d->data[i]; } @@ -633,14 +604,7 @@ String &String::operator=(const std::string &s) delete d; d = new StringPrivate; - - d->data.resize(s.size()); - - wstring::iterator targetIt = d->data.begin(); - for(std::string::const_iterator it = s.begin(); it != s.end(); it++) { - *targetIt = uchar(*it); - ++targetIt; - } + copyFromLatin1(s.c_str(), s.length()); return *this; } @@ -649,7 +613,9 @@ String &String::operator=(const wstring &s) { if(d->deref()) delete d; + d = new StringPrivate(s); + return *this; } @@ -657,7 +623,10 @@ String &String::operator=(const wchar_t *s) { if(d->deref()) delete d; - d = new StringPrivate(s); + + d = new StringPrivate; + copyFromUTF16(s, ::wcslen(s), WCharByteOrder); + return *this; } @@ -665,8 +634,11 @@ String &String::operator=(char c) { if(d->deref()) delete d; + d = new StringPrivate; - d->data += uchar(c); + d->data.resize(1); + d->data[0] = static_cast(c); + return *this; } @@ -674,8 +646,11 @@ String &String::operator=(wchar_t c) { if(d->deref()) delete d; + d = new StringPrivate; - d->data += c; + d->data.resize(1); + d->data[0] = c; + return *this; } @@ -685,15 +660,7 @@ String &String::operator=(const char *s) delete d; d = new StringPrivate; - - const size_t length = ::strlen(s); - d->data.resize(length); - - wstring::iterator targetIt = d->data.begin(); - for(size_t i = 0; i < length; i++) { - *targetIt = uchar(s[i]); - ++targetIt; - } + copyFromLatin1(s, ::strlen(s)); return *this; } @@ -704,20 +671,10 @@ String &String::operator=(const ByteVector &v) delete d; d = new StringPrivate; - d->data.resize(v.size()); - wstring::iterator targetIt = d->data.begin(); - - uint i = 0; - - for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) { - *targetIt = uchar(*it); - ++targetIt; - ++i; - } + copyFromLatin1(v.data(), v.size()); // If we hit a null in the ByteVector, shrink the string again. - - d->data.resize(i); + d->data.resize(::wcslen(d->data.c_str())); return *this; } @@ -743,70 +700,81 @@ void String::detach() // private members //////////////////////////////////////////////////////////////////////////////// -void String::prepare(Type t) + +void String::copyFromLatin1(const char *s, size_t length) { - switch(t) { - case UTF16: - { - if(d->data.size() >= 1 && (d->data[0] == 0xfeff || d->data[0] == 0xfffe)) { - bool swap = d->data[0] != 0xfeff; - d->data.erase(d->data.begin(), d->data.begin() + 1); - if(swap) { - for(uint i = 0; i < d->data.size(); i++) - d->data[i] = byteSwap((unsigned short)d->data[i]); - } - } - else { - debug("String::prepare() - Invalid UTF16 string."); - d->data.erase(d->data.begin(), d->data.end()); - } - break; - } - case UTF8: - { - const size_t bufferSize = d->data.size() + 1; - Unicode::UTF8 *sourceBuffer = new Unicode::UTF8[bufferSize]; - Unicode::UTF16 *targetBuffer = new Unicode::UTF16[bufferSize]; + d->data.resize(length); - unsigned int i = 0; - for(; i < d->data.size(); i++) - sourceBuffer[i] = Unicode::UTF8(d->data[i]); - sourceBuffer[i] = 0; + for(size_t i = 0; i < length; ++i) + d->data[i] = static_cast(s[i]); +} - const Unicode::UTF8 *source = sourceBuffer; - Unicode::UTF16 *target = targetBuffer; +void String::copyFromUTF8(const char *s, size_t length) +{ + d->data.resize(length); - Unicode::ConversionResult result = - Unicode::ConvertUTF8toUTF16(&source, sourceBuffer + bufferSize, - &target, targetBuffer + bufferSize, - Unicode::lenientConversion); + const Unicode::UTF8 *source = reinterpret_cast(s); + Unicode::UTF16 *target = &d->data[0]; - if(result != Unicode::conversionOK) { - debug("String::prepare() - Unicode conversion error."); - } + Unicode::ConversionResult result = Unicode::ConvertUTF8toUTF16( + &source, source + length, + &target, target + length, + Unicode::lenientConversion); - const size_t newSize = target != targetBuffer ? target - targetBuffer - 1 : 0; - d->data.resize(newSize); + d->data.resize(::wcslen(d->data.c_str())); - for(size_t i = 0; i < newSize; i++) - d->data[i] = targetBuffer[i]; - - delete [] sourceBuffer; - delete [] targetBuffer; - - break; - } - case UTF16LE: - { - for(uint i = 0; i < d->data.size(); i++) - d->data[i] = byteSwap((unsigned short)d->data[i]); - break; - } - default: - break; + if(result != Unicode::conversionOK) { + debug("String::prepare() - Unicode conversion error."); } } +void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) +{ + bool swap; + if(t == UTF16) { + if(length >= 1) { + if(s[0] == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(s[0] == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. + else { + debug("String::prepare() - Invalid UTF16 string."); + return; + } + + s++; + length--; + } + } + else + swap = (t != WCharByteOrder); + + d->data.resize(length); + memcpy(&d->data[0], s, length * sizeof(wchar_t)); + + if(swap) { + for(size_t i = 0; i < length; ++i) + d->data[i] = byteSwap(static_cast(s[i])); + } +} + +void String::copyFromUTF16(const char *s, size_t length, Type t) +{ + if(sizeof(wchar_t) == 2) + copyFromUTF16(reinterpret_cast(s), length / 2, t); + else + { + std::vector sourceBuffer(length / 2); + for(size_t i = 0; i < length / 2; ++i) { + sourceBuffer[i] = combine(*s, *(s + 1)); + s += 2; + } + } +} + +String::Type String::WCharByteOrder = wcharByteOrder(); + + //////////////////////////////////////////////////////////////////////////////// // related functions //////////////////////////////////////////////////////////////////////////////// diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h index a1ba164c..ee81f999 100644 --- a/taglib/toolkit/tstring.h +++ b/taglib/toolkit/tstring.h @@ -98,8 +98,7 @@ namespace TagLib { */ UTF16 = 1, /*! - * UTF16 big endian. 16 bit characters. This is the encoding used - * internally by TagLib. + * UTF16 big endian. 16 bit characters. */ UTF16BE = 2, /*! @@ -135,12 +134,12 @@ namespace TagLib { /*! * Makes a deep copy of the data in \a s. */ - String(const wstring &s, Type t = UTF16BE); + String(const wstring &s, Type t = WCharByteOrder); /*! * Makes a deep copy of the data in \a s. */ - String(const wchar_t *s, Type t = UTF16BE); + String(const wchar_t *s, Type t = WCharByteOrder); /*! * Makes a deep copy of the data in \a c. @@ -187,7 +186,7 @@ namespace TagLib { /*! * Returns a wstring version of the TagLib string as a wide string. */ - wstring toWString() const; + const TagLib::wstring &toWString() const; /*! * Creates and returns a C-String based on the data. This string is still @@ -335,12 +334,12 @@ namespace TagLib { /*! * Returns a reference to the character at position \a i. */ - wchar &operator[](int i); + wchar &operator[](size_t i); /*! * Returns a const reference to the character at position \a i. */ - const wchar &operator[](int i) const; + const wchar &operator[](size_t i) const; /*! * Compares each character of the String with each character of \a s and @@ -442,12 +441,35 @@ namespace TagLib { private: /*! - * This checks to see if the string is in \e UTF-16 (with BOM) or \e UTF-8 - * format and if so converts it to \e UTF-16BE for internal use. \e Latin1 - * does not require conversion since it is a subset of \e UTF-16BE and - * \e UTF16-BE requires no conversion since it is used internally. + * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order) + * and copies it to the internal buffer. */ - void prepare(Type t); + void copyFromLatin1(const char *s, size_t length); + + /*! + * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order) + * and copies it to the internal buffer. + */ + void copyFromUTF8(const char *s, size_t length); + + /*! + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. + */ + void copyFromUTF16(const wchar_t *s, size_t length, Type t); + + /*! + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. + */ + void copyFromUTF16(const char *s, size_t length, Type t); + + /*! + * Indicates which byte order of UTF-16 is used to store strings internally. + * + * \note Set to \e UTF16BE or \e UTF16LE at run time. + */ + static Type WCharByteOrder; class StringPrivate; StringPrivate *d; diff --git a/taglib/toolkit/unicode.h b/taglib/toolkit/unicode.h index cf7eb3c5..b9de0ea2 100644 --- a/taglib/toolkit/unicode.h +++ b/taglib/toolkit/unicode.h @@ -115,8 +115,8 @@ namespace Unicode { typedef unsigned long UTF32; /* at least 32 bits */ -typedef unsigned short UTF16; /* at least 16 bits */ -typedef unsigned char UTF8; /* typically 8 bits */ +typedef wchar_t UTF16; /* TagLib assumes that wchar_t is sufficient for UTF-16. */ +typedef unsigned char UTF8; /* typically 8 bits */ typedef unsigned char Boolean; /* 0 or 1 */ typedef enum { From de19ad72abb247c19a6c200ff08aa70e23414def Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Sun, 17 Mar 2013 19:40:01 +0900 Subject: [PATCH 2/9] Fixed CPU endian detection --- taglib/toolkit/tstring.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index 02059fff..e7caa40d 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -57,14 +57,14 @@ namespace { { // Detect CPU endian. union { - wchar_t w; - char c[2]; - } x = { 0xfeff }; + TagLib::ushort w; + char c; + } x = { 0x1234 }; - if(x.c[0] == 0xfe) - return String::UTF16BE; - else + if(x.c == 0x34) return String::UTF16LE; + else + return String::UTF16BE; } } From 86b7cabf4464ba683bce60ed41356d1fff4ebbb2 Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Sun, 17 Mar 2013 20:00:05 +0900 Subject: [PATCH 3/9] Fix UTF-16 decoding where wchar_t is not 16-bit --- taglib/toolkit/tstring.cpp | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index e7caa40d..b8f50f3d 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -764,9 +764,28 @@ void String::copyFromUTF16(const char *s, size_t length, Type t) copyFromUTF16(reinterpret_cast(s), length / 2, t); else { - std::vector sourceBuffer(length / 2); + bool swap; + if(t == UTF16) { + if(length >= 2) { + if(*reinterpret_cast(s) == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(*reinterpret_cast(s) == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. + else { + debug("String::prepare() - Invalid UTF16 string."); + return; + } + + s += 2; + length -= 2; + } + } + else + swap = (t != WCharByteOrder); + + d->data.resize(length / 2); for(size_t i = 0; i < length / 2; ++i) { - sourceBuffer[i] = combine(*s, *(s + 1)); + d->data[i] = swap ? combine(*s, *(s + 1)) : combine(*(s + 1), *s); s += 2; } } From 0792eedd12711c070f142a5284b7b0ea001376b9 Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Sun, 17 Mar 2013 20:47:58 +0900 Subject: [PATCH 4/9] Fix UTF-16 BOM detection --- taglib/toolkit/tstring.cpp | 46 +++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index b8f50f3d..2b723100 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -724,7 +724,7 @@ void String::copyFromUTF8(const char *s, size_t length) d->data.resize(::wcslen(d->data.c_str())); if(result != Unicode::conversionOK) { - debug("String::prepare() - Unicode conversion error."); + debug("String::copyFromUTF8() - Unicode conversion error."); } } @@ -732,19 +732,17 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) { bool swap; if(t == UTF16) { - if(length >= 1) { - if(s[0] == 0xfeff) - swap = false; // Same as CPU endian. No need to swap bytes. - else if(s[0] == 0xfffe) - swap = true; // Not same as CPU endian. Need to swap bytes. - else { - debug("String::prepare() - Invalid UTF16 string."); - return; - } - - s++; - length--; + if(length >= 1 && s[0] == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(length >= 1 && s[0] == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. + else { + debug("String::copyFromUTF16() - Invalid UTF16 string."); + return; } + + s++; + length--; } else swap = (t != WCharByteOrder); @@ -766,19 +764,17 @@ void String::copyFromUTF16(const char *s, size_t length, Type t) { bool swap; if(t == UTF16) { - if(length >= 2) { - if(*reinterpret_cast(s) == 0xfeff) - swap = false; // Same as CPU endian. No need to swap bytes. - else if(*reinterpret_cast(s) == 0xfffe) - swap = true; // Not same as CPU endian. Need to swap bytes. - else { - debug("String::prepare() - Invalid UTF16 string."); - return; - } - - s += 2; - length -= 2; + if(length >= 2 && *reinterpret_cast(s) == 0xfeff) + swap = false; // Same as CPU endian. No need to swap bytes. + else if(length >= 2 && *reinterpret_cast(s) == 0xfffe) + swap = true; // Not same as CPU endian. Need to swap bytes. + else { + debug("String::copyFromUTF16() - Invalid UTF16 string."); + return; } + + s += 2; + length -= 2; } else swap = (t != WCharByteOrder); From 6e3639de9e4a6b984dbcdaa15f109b8256cf7fa6 Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Mon, 18 Mar 2013 02:51:11 +0900 Subject: [PATCH 5/9] Avoid creating new String object when comparing --- taglib/toolkit/tstring.cpp | 25 +++++++++++++++++++++---- taglib/toolkit/tstring.h | 14 +++++++++++++- tests/test_string.cpp | 2 +- 3 files changed, 35 insertions(+), 6 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index 2b723100..e88b70f7 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -330,9 +330,10 @@ String &String::append(const String &s) String String::upper() const { - String s; + static const int shift = 'A' - 'a'; - static int shift = 'A' - 'a'; + String s; + s.d->data.reserve(d->data.size()); for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); ++it) { if(*it >= 'a' && *it <= 'z') @@ -537,7 +538,24 @@ const TagLib::wchar &String::operator[](size_t i) const bool String::operator==(const String &s) const { - return d == s.d || d->data == s.d->data; + return (d == s.d || d->data == s.d->data); +} + +bool String::operator==(const char *s) const +{ + for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) { + if(*it != static_cast(*s)) + return false; + + s++; + } + + return true; +} + +bool String::operator==(const wchar_t *s) const +{ + return (d->data == s); } bool String::operator!=(const String &s) const @@ -700,7 +718,6 @@ void String::detach() // private members //////////////////////////////////////////////////////////////////////////////// - void String::copyFromLatin1(const char *s, size_t length) { d->data.resize(length); diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h index ee81f999..c502e56c 100644 --- a/taglib/toolkit/tstring.h +++ b/taglib/toolkit/tstring.h @@ -342,11 +342,23 @@ namespace TagLib { const wchar &operator[](size_t i) const; /*! - * Compares each character of the String with each character of \a s and + * Compares each character of the String with each character in \a s and * returns true if the strings match. */ bool operator==(const String &s) const; + /*! + * Compares each character of the String with each character in \a s and + * returns true if the strings match. + */ + bool operator==(const char *s) const; + + /*! + * Compares each character of the String with each character of \a s and + * returns true if the strings match. + */ + bool operator==(const wchar_t *s) const; + /*! * Compares each character of the String with each character of \a s and * returns false if the strings match. diff --git a/tests/test_string.cpp b/tests/test_string.cpp index 1e37d7a2..c67fa41e 100644 --- a/tests/test_string.cpp +++ b/tests/test_string.cpp @@ -116,7 +116,7 @@ public: CPPUNIT_ASSERT_EQUAL(a, String(d, String::UTF16)); } - // this test is expected to print "TagLib: String::prepare() - + // this test is expected to print "TagLib: String::copyFromUTF16() - // Invalid UTF16 string." on the console 3 times void testUTF16DecodeInvalidBOM() { From 19ce4d0dfa8c13255f56342f8c179c6a601d9b2d Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Mon, 18 Mar 2013 05:56:48 +0900 Subject: [PATCH 6/9] Use the standard library to convert between UTF-8 and UTF-16 where possible --- taglib/toolkit/tstring.cpp | 88 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 84 insertions(+), 4 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index e88b70f7..530a9c6f 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -31,6 +31,24 @@ #include #include +// Determine if the compiler supports codecvt. + +#ifndef __has_include +# define __has_include(x) 0 +#endif + +#if (((defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)) /* GCC with -std=c++0x option */ \ + || (defined(_MSC_VER) && _MSC_VER >= 1600))) /* VC++2010 or later */ \ + || (defined(__has_include) && __has_include()) /* Clang has */ + +# define TAGLIB_USE_CODECVT +#endif + +#ifdef TAGLIB_USE_CODECVT +# include + typedef std::codecvt_utf8_utf16 utf8_utf16_t; +#endif + using namespace TagLib; namespace { @@ -215,6 +233,20 @@ std::string String::to8Bit(bool unicode) const else { s.resize(d->data.size() * 4 + 1); +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + utf8_utf16_t::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + const Unicode::UTF16 *source = &d->data[0]; Unicode::UTF8 *target = reinterpret_cast(&s[0]); @@ -227,6 +259,8 @@ std::string String::to8Bit(bool unicode) const debug("String::to8Bit() - Unicode conversion error."); } +#endif + s.resize(::strlen(s.c_str())); } @@ -379,8 +413,38 @@ ByteVector String::data(Type t) const } case UTF8: { - std::string s = to8Bit(true); - v.setData(s.c_str(), static_cast(s.length())); + v.resize(d->data.size() * 4 + 1); + +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + utf8_utf16_t::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + + const Unicode::UTF16 *source = &d->data[0]; + Unicode::UTF8 *target = reinterpret_cast(v.data()); + + Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( + &source, source + d->data.size(), + &target, target + v.size(), + Unicode::lenientConversion); + + if(result != Unicode::conversionOK) { + debug("String::to8Bit() - Unicode conversion error."); + } + +#endif + + v.resize(::strlen(v.data()) + 1); + break; } case UTF16: @@ -730,6 +794,20 @@ void String::copyFromUTF8(const char *s, size_t length) { d->data.resize(length); +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const char *source; + wchar_t *target; + utf8_utf16_t::result result = utf8_utf16_t().in( + st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + const Unicode::UTF8 *source = reinterpret_cast(s); Unicode::UTF16 *target = &d->data[0]; @@ -738,11 +816,13 @@ void String::copyFromUTF8(const char *s, size_t length) &target, target + length, Unicode::lenientConversion); - d->data.resize(::wcslen(d->data.c_str())); - if(result != Unicode::conversionOK) { debug("String::copyFromUTF8() - Unicode conversion error."); } + +#endif + + d->data.resize(::wcslen(d->data.c_str())); } void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) From a842220fe6f3211d046b7718c8aba7e0b0f23acf Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Mon, 18 Mar 2013 06:08:05 +0900 Subject: [PATCH 7/9] Revert "Use the standard library to convert between UTF-8 and UTF-16 where possible" This reverts commit 19ce4d0dfa8c13255f56342f8c179c6a601d9b2d. --- taglib/toolkit/tstring.cpp | 88 ++------------------------------------ 1 file changed, 4 insertions(+), 84 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index 530a9c6f..e88b70f7 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -31,24 +31,6 @@ #include #include -// Determine if the compiler supports codecvt. - -#ifndef __has_include -# define __has_include(x) 0 -#endif - -#if (((defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__)) /* GCC with -std=c++0x option */ \ - || (defined(_MSC_VER) && _MSC_VER >= 1600))) /* VC++2010 or later */ \ - || (defined(__has_include) && __has_include()) /* Clang has */ - -# define TAGLIB_USE_CODECVT -#endif - -#ifdef TAGLIB_USE_CODECVT -# include - typedef std::codecvt_utf8_utf16 utf8_utf16_t; -#endif - using namespace TagLib; namespace { @@ -233,20 +215,6 @@ std::string String::to8Bit(bool unicode) const else { s.resize(d->data.size() * 4 + 1); -#ifdef TAGLIB_USE_CODECVT - - std::mbstate_t st = 0; - const wchar_t *source; - char *target; - utf8_utf16_t::result result = utf8_utf16_t().out( - st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target); - - if(result != utf8_utf16_t::ok) { - debug("String::copyFromUTF8() - Unicode conversion error."); - } - -#else - const Unicode::UTF16 *source = &d->data[0]; Unicode::UTF8 *target = reinterpret_cast(&s[0]); @@ -259,8 +227,6 @@ std::string String::to8Bit(bool unicode) const debug("String::to8Bit() - Unicode conversion error."); } -#endif - s.resize(::strlen(s.c_str())); } @@ -413,38 +379,8 @@ ByteVector String::data(Type t) const } case UTF8: { - v.resize(d->data.size() * 4 + 1); - -#ifdef TAGLIB_USE_CODECVT - - std::mbstate_t st = 0; - const wchar_t *source; - char *target; - utf8_utf16_t::result result = utf8_utf16_t().out( - st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target); - - if(result != utf8_utf16_t::ok) { - debug("String::copyFromUTF8() - Unicode conversion error."); - } - -#else - - const Unicode::UTF16 *source = &d->data[0]; - Unicode::UTF8 *target = reinterpret_cast(v.data()); - - Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( - &source, source + d->data.size(), - &target, target + v.size(), - Unicode::lenientConversion); - - if(result != Unicode::conversionOK) { - debug("String::to8Bit() - Unicode conversion error."); - } - -#endif - - v.resize(::strlen(v.data()) + 1); - + std::string s = to8Bit(true); + v.setData(s.c_str(), static_cast(s.length())); break; } case UTF16: @@ -794,20 +730,6 @@ void String::copyFromUTF8(const char *s, size_t length) { d->data.resize(length); -#ifdef TAGLIB_USE_CODECVT - - std::mbstate_t st = 0; - const char *source; - wchar_t *target; - utf8_utf16_t::result result = utf8_utf16_t().in( - st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target); - - if(result != utf8_utf16_t::ok) { - debug("String::copyFromUTF8() - Unicode conversion error."); - } - -#else - const Unicode::UTF8 *source = reinterpret_cast(s); Unicode::UTF16 *target = &d->data[0]; @@ -816,13 +738,11 @@ void String::copyFromUTF8(const char *s, size_t length) &target, target + length, Unicode::lenientConversion); + d->data.resize(::wcslen(d->data.c_str())); + if(result != Unicode::conversionOK) { debug("String::copyFromUTF8() - Unicode conversion error."); } - -#endif - - d->data.resize(::wcslen(d->data.c_str())); } void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) From c86ea7bdffc6da24faa5c6917265a416b09016b3 Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Mon, 18 Mar 2013 06:18:50 +0900 Subject: [PATCH 8/9] Use the standard library to convert between UTF-8 and UTF-16 where possible --- taglib/toolkit/tstring.cpp | 85 +++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index e88b70f7..3ed88fd2 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -24,13 +24,24 @@ ***************************************************************************/ #include "tstring.h" -#include "unicode.h" #include "tdebug.h" #include "tstringlist.h" -#include #include +// Determine if the compiler supports codecvt. + +#if (defined(_MSC_VER) && _MSC_VER >= 1600) // VC++2010 or later +# define TAGLIB_USE_CODECVT +#endif + +#ifdef TAGLIB_USE_CODECVT +# include + typedef std::codecvt_utf8_utf16 utf8_utf16_t; +#else +# include "unicode.h" +#endif + using namespace TagLib; namespace { @@ -215,6 +226,20 @@ std::string String::to8Bit(bool unicode) const else { s.resize(d->data.size() * 4 + 1); +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + std::codecvt_base::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + const Unicode::UTF16 *source = &d->data[0]; Unicode::UTF8 *target = reinterpret_cast(&s[0]); @@ -227,6 +252,8 @@ std::string String::to8Bit(bool unicode) const debug("String::to8Bit() - Unicode conversion error."); } +#endif + s.resize(::strlen(s.c_str())); } @@ -379,8 +406,38 @@ ByteVector String::data(Type t) const } case UTF8: { - std::string s = to8Bit(true); - v.setData(s.c_str(), static_cast(s.length())); + v.resize(d->data.size() * 4 + 1); + +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const wchar_t *source; + char *target; + std::codecvt_base::result result = utf8_utf16_t().out( + st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + + const Unicode::UTF16 *source = &d->data[0]; + Unicode::UTF8 *target = reinterpret_cast(v.data()); + + Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8( + &source, source + d->data.size(), + &target, target + v.size(), + Unicode::lenientConversion); + + if(result != Unicode::conversionOK) { + debug("String::to8Bit() - Unicode conversion error."); + } + +#endif + + v.resize(::strlen(v.data()) + 1); + break; } case UTF16: @@ -730,6 +787,20 @@ void String::copyFromUTF8(const char *s, size_t length) { d->data.resize(length); +#ifdef TAGLIB_USE_CODECVT + + std::mbstate_t st = 0; + const char *source; + wchar_t *target; + std::codecvt_base::result result = utf8_utf16_t().in( + st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target); + + if(result != utf8_utf16_t::ok) { + debug("String::copyFromUTF8() - Unicode conversion error."); + } + +#else + const Unicode::UTF8 *source = reinterpret_cast(s); Unicode::UTF16 *target = &d->data[0]; @@ -738,11 +809,13 @@ void String::copyFromUTF8(const char *s, size_t length) &target, target + length, Unicode::lenientConversion); - d->data.resize(::wcslen(d->data.c_str())); - if(result != Unicode::conversionOK) { debug("String::copyFromUTF8() - Unicode conversion error."); } + +#endif + + d->data.resize(::wcslen(d->data.c_str())); } void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) From 4e05923479ff80234418ecacf0b512f3920080cb Mon Sep 17 00:00:00 2001 From: Tsuda Kageyu Date: Mon, 18 Mar 2013 13:55:49 +0900 Subject: [PATCH 9/9] Removed null termination from return value of String::data() --- taglib/toolkit/tstring.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index 3ed88fd2..97309913 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -436,7 +436,7 @@ ByteVector String::data(Type t) const #endif - v.resize(::strlen(v.data()) + 1); + v.resize(::strlen(v.data())); break; }