diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp index fb6e947a..b6148e69 100644 --- a/taglib/toolkit/tstring.cpp +++ b/taglib/toolkit/tstring.cpp @@ -45,7 +45,7 @@ # include "unicode.h" #endif -namespace +namespace { inline unsigned short combine(unsigned char c1, unsigned char c2) @@ -142,7 +142,7 @@ namespace debug("String::copyFromUTF8() - Unicode conversion error."); } -#endif +#endif } } @@ -151,25 +151,25 @@ namespace TagLib { class String::StringPrivate : public RefCounter { public: - StringPrivate() - : RefCounter() + StringPrivate() + : RefCounter() { } - StringPrivate(const wstring &s) + StringPrivate(const wstring &s) : RefCounter() - , data(s) + , data(s) { } - - StringPrivate(uint n, wchar_t c) + + StringPrivate(uint n, wchar_t c) : RefCounter() - , data(static_cast(n), c) + , data(static_cast(n), c) { } /*! - * Stores string in UTF-16. The byte order depends on the CPU endian. + * Stores string in UTF-16. The byte order depends on the CPU endian. */ TagLib::wstring data; @@ -183,12 +183,12 @@ String String::null; //////////////////////////////////////////////////////////////////////////////// -String::String() +String::String() : d(new StringPrivate()) { } -String::String(const String &s) +String::String(const String &s) : d(s.d) { d->ref(); @@ -210,7 +210,7 @@ String::String(const wstring &s, Type t) : d(new StringPrivate()) { if(t == UTF16 || t == UTF16BE || t == UTF16LE) { - // This looks ugly but needed for the compatibility with TagLib1.8. + // This looks ugly but needed for the compatibility with TagLib1.8. // Should be removed in TabLib2.0. if (t == UTF16BE) t = WCharByteOrder; @@ -228,7 +228,7 @@ String::String(const wchar_t *s, Type t) : d(new StringPrivate()) { if(t == UTF16 || t == UTF16BE || t == UTF16LE) { - // This looks ugly but needed for the compatibility with TagLib1.8. + // This looks ugly but needed for the compatibility with TagLib1.8. // Should be removed in TabLib2.0. if (t == UTF16BE) t = WCharByteOrder; @@ -278,11 +278,11 @@ String::String(const ByteVector &v, Type t) if(v.isEmpty()) return; - if(t == Latin1) + if(t == Latin1) copyFromLatin1(v.data(), v.size()); - else if(t == UTF8) + else if(t == UTF8) copyFromUTF8(v.data(), v.size()); - else + else copyFromUTF16(v.data(), v.size(), t); // If we hit a null in the ByteVector, shrink the string again. @@ -299,25 +299,8 @@ String::~String() std::string String::to8Bit(bool unicode) const { - std::string s; - - if(!unicode) { - s.resize(d->data.size()); - - std::string::iterator targetIt = s.begin(); - for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) { - *targetIt = static_cast(*it); - ++targetIt; - } - } - else { - s.resize(d->data.size() * 4 + 1); - - UTF16toUTF8(&d->data[0], d->data.size(), &s[0], s.size()); - s.resize(::strlen(s.c_str())); - } - - return s; + const ByteVector v = data(unicode ? UTF8 : Latin1); + return std::string(v.data(), v.size()); } TagLib::wstring String::toWString() const @@ -444,7 +427,7 @@ bool String::isNull() const ByteVector String::data(Type t) const { - switch(t) + switch(t) { case Latin1: { @@ -457,14 +440,18 @@ ByteVector String::data(Type t) const return v; } case UTF8: + if(!d->data.empty()) { ByteVector v(size() * 4 + 1, 0); - UTF16toUTF8(&d->data[0], d->data.size(), v.data(), v.size()); + UTF16toUTF8(d->data.c_str(), d->data.size(), v.data(), v.size()); v.resize(::strlen(v.data())); return v; } + else { + return ByteVector::null; + } case UTF16: { ByteVector v(2 + size() * 2, 0); @@ -510,7 +497,7 @@ ByteVector String::data(Type t) const default: { debug("String::data() - Invalid Type value."); - return ByteVector(); + return ByteVector::null; } } } @@ -799,9 +786,9 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) { bool swap; if(t == UTF16) { - if(length >= 1 && s[0] == 0xfeff) + if(length >= 1 && s[0] == 0xfeff) swap = false; // Same as CPU endian. No need to swap bytes. - else if(length >= 1 && s[0] == 0xfffe) + else if(length >= 1 && s[0] == 0xfffe) swap = true; // Not same as CPU endian. Need to swap bytes. else { debug("String::copyFromUTF16() - Invalid UTF16 string."); @@ -811,7 +798,7 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t) s++; length--; } - else + else swap = (t != WCharByteOrder); d->data.resize(length); @@ -836,9 +823,9 @@ void String::copyFromUTF16(const char *s, size_t length, Type t) ushort bom; ::memcpy(&bom, s, 2); - if(bom == 0xfeff) + if(bom == 0xfeff) swap = false; // Same as CPU endian. No need to swap bytes. - else if(bom == 0xfffe) + else if(bom == 0xfffe) swap = true; // Not same as CPU endian. Need to swap bytes. else { debug("String::copyFromUTF16() - Invalid UTF16 string."); @@ -848,7 +835,7 @@ void String::copyFromUTF16(const char *s, size_t length, Type t) s += 2; length -= 2; } - else + else swap = (t != WCharByteOrder); d->data.resize(length / 2); @@ -858,7 +845,7 @@ void String::copyFromUTF16(const char *s, size_t length, Type t) } } -const String::Type String::WCharByteOrder +const String::Type String::WCharByteOrder = (Utils::SystemByteOrder == Utils::BigEndian) ? String::UTF16BE : String::UTF16LE; } diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h index 21e8518a..2cff5a4b 100644 --- a/taglib/toolkit/tstring.h +++ b/taglib/toolkit/tstring.h @@ -135,7 +135,7 @@ namespace TagLib { /*! * Makes a deep copy of the data in \a s. * - * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless + * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless * of the CPU byte order. If UTF16BE, it will not be swapped. This behavior * will be changed in TagLib2.0. */ @@ -144,7 +144,7 @@ namespace TagLib { /*! * Makes a deep copy of the data in \a s. * - * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless + * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless * of the CPU byte order. If UTF16BE, it will not be swapped. This behavior * will be changed in TagLib2.0. */ @@ -186,7 +186,7 @@ namespace TagLib { virtual ~String(); /*! - * Returns a deep copy of this String as an std::string. The returned string + * Returns a deep copy of this String as an std::string. The returned string * is encoded in UTF8 if \a unicode is true, otherwise Latin1. * * \see toCString() @@ -194,7 +194,7 @@ namespace TagLib { std::string to8Bit(bool unicode = false) const; /*! - * Returns a deep copy of this String as a wstring. The returned string is + * Returns a deep copy of this String as a wstring. The returned string is * encoded in UTF-16 (without BOM/CPU byte order). * * \see toCWString() @@ -202,43 +202,43 @@ namespace TagLib { wstring toWString() const; /*! - * Creates and returns a standard C-style (null-terminated) version of this - * String. The returned string is encoded in UTF8 if \a unicode is true, + * Creates and returns a standard C-style (null-terminated) version of this + * String. The returned string is encoded in UTF8 if \a unicode is true, * otherwise Latin1. - * - * The returned string is still owned by this String and should not be deleted + * + * The returned string is still owned by this String and should not be deleted * by the user. * - * The returned pointer remains valid until this String instance is destroyed + * The returned pointer remains valid until this String instance is destroyed * or toCString() is called again. * * \warning This however has the side effect that the returned string will remain - * in memory in addition to other memory that is consumed by this + * in memory in addition to other memory that is consumed by this * String instance. So, this method should not be used on large strings or * where memory is critical. Consider using to8Bit() instead to avoid it. * * \see to8Bit() */ const char *toCString(bool unicode = false) const; - + /*! - * Returns a standard C-style (null-terminated) wide character version of - * this String. The returned string is encoded in UTF-16 (without BOM/CPU byte + * Returns a standard C-style (null-terminated) wide character version of + * this String. The returned string is encoded in UTF-16 (without BOM/CPU byte * order). - * - * The returned string is still owned by this String and should not be deleted + * + * The returned string is still owned by this String and should not be deleted * by the user. * - * The returned pointer remains valid until this String instance is destroyed + * The returned pointer remains valid until this String instance is destroyed * or any other method of this String is called. * - * \note This returns a pointer to the String's internal data without any + * \note This returns a pointer to the String's internal data without any * conversions. * * \see toWString() */ const wchar_t *toCWString() const; - + /*! * Returns an iterator pointing to the beginning of the string. */ @@ -333,6 +333,8 @@ namespace TagLib { * Returns a ByteVector containing the string's data. If \a t is Latin1 or * UTF8, this will return a vector of 8 bit characters, otherwise it will use * 16 bit characters. + * + * \note The returned data is not null terminated. */ ByteVector data(Type t) const; @@ -484,31 +486,31 @@ namespace TagLib { private: /*! - * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order) + * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order) * and copies it to the internal buffer. */ void copyFromLatin1(const char *s, size_t length); /*! - * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order) + * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order) * and copies it to the internal buffer. */ void copyFromUTF8(const char *s, size_t length); /*! - * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. */ void copyFromUTF16(const wchar_t *s, size_t length, Type t); /*! - * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into + * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer. */ void copyFromUTF16(const char *s, size_t length, Type t); - + /*! - * Indicates which byte order of UTF-16 is used to store strings internally. + * Indicates which byte order of UTF-16 is used to store strings internally. * * \note \e String::UTF16BE or \e String::UTF16LE */ diff --git a/tests/test_string.cpp b/tests/test_string.cpp index 9a574b39..1a20ed6f 100644 --- a/tests/test_string.cpp +++ b/tests/test_string.cpp @@ -43,6 +43,7 @@ class TestString : public CppUnit::TestFixture CPPUNIT_TEST(testToInt); CPPUNIT_TEST(testSubstr); CPPUNIT_TEST(testNewline); + CPPUNIT_TEST(testEncode); CPPUNIT_TEST_SUITE_END(); public: @@ -242,6 +243,43 @@ public: CPPUNIT_ASSERT_EQUAL(L'\x0a', String(crlf)[4]); } + void testEncode() + { + String jpn(L"\u65E5\u672C\u8A9E"); + ByteVector jpn1 = jpn.data(String::Latin1); + ByteVector jpn2 = jpn.data(String::UTF8); + ByteVector jpn3 = jpn.data(String::UTF16); + ByteVector jpn4 = jpn.data(String::UTF16LE); + ByteVector jpn5 = jpn.data(String::UTF16BE); + std::string jpn6 = jpn.to8Bit(false); + std::string jpn7 = jpn.to8Bit(true); + + CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x2C\x9E"), jpn1); + CPPUNIT_ASSERT_EQUAL(ByteVector("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn2); + CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE\xE5\x65\x2C\x67\x9E\x8A"), jpn3); + CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x65\x2C\x67\x9E\x8A"), jpn4); + CPPUNIT_ASSERT_EQUAL(ByteVector("\x65\xE5\x67\x2C\x8A\x9E"), jpn5); + CPPUNIT_ASSERT_EQUAL(std::string("\xE5\x2C\x9E"), jpn6); + CPPUNIT_ASSERT_EQUAL(std::string("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn7); + + String empty; + ByteVector empty1 = empty.data(String::Latin1); + ByteVector empty2 = empty.data(String::UTF8); + ByteVector empty3 = empty.data(String::UTF16); + ByteVector empty4 = empty.data(String::UTF16LE); + ByteVector empty5 = empty.data(String::UTF16BE); + std::string empty6 = empty.to8Bit(false); + std::string empty7 = empty.to8Bit(true); + + CPPUNIT_ASSERT(empty1.isEmpty()); + CPPUNIT_ASSERT(empty2.isEmpty()); + CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE"), empty3); + CPPUNIT_ASSERT(empty4.isEmpty()); + CPPUNIT_ASSERT(empty5.isEmpty()); + CPPUNIT_ASSERT(empty6.empty()); + CPPUNIT_ASSERT(empty7.empty()); + } + }; CPPUNIT_TEST_SUITE_REGISTRATION(TestString);