From 584bbc7c78dc93c86101ba519cce36879de09558 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Wed, 4 Dec 2013 09:07:14 +0900
Subject: [PATCH 1/3] Fixed conversion from empty String to ByteVector

---
 taglib/toolkit/tstring.cpp | 79 ++++++++++++++++----------------------
 taglib/toolkit/tstring.h   | 50 ++++++++++++------------
 tests/test_string.cpp      | 38 ++++++++++++++++++
 3 files changed, 97 insertions(+), 70 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index fb6e947a..b6148e69 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -45,7 +45,7 @@
 # include "unicode.h"
 #endif
 
-namespace 
+namespace
 {
 
   inline unsigned short combine(unsigned char c1, unsigned char c2)
@@ -142,7 +142,7 @@ namespace
       debug("String::copyFromUTF8() - Unicode conversion error.");
     }
 
-#endif 
+#endif
   }
 }
 
@@ -151,25 +151,25 @@ namespace TagLib {
 class String::StringPrivate : public RefCounter
 {
 public:
-  StringPrivate() 
-    : RefCounter() 
+  StringPrivate()
+    : RefCounter()
   {
   }
 
-  StringPrivate(const wstring &s) 
+  StringPrivate(const wstring &s)
     : RefCounter()
-    , data(s) 
+    , data(s)
   {
   }
-  
-  StringPrivate(uint n, wchar_t c) 
+
+  StringPrivate(uint n, wchar_t c)
     : RefCounter()
-    , data(static_cast<size_t>(n), c) 
+    , data(static_cast<size_t>(n), c)
   {
   }
 
   /*!
-   * Stores string in UTF-16. The byte order depends on the CPU endian. 
+   * Stores string in UTF-16. The byte order depends on the CPU endian.
    */
   TagLib::wstring data;
 
@@ -183,12 +183,12 @@ String String::null;
 
 ////////////////////////////////////////////////////////////////////////////////
 
-String::String() 
+String::String()
   : d(new StringPrivate())
 {
 }
 
-String::String(const String &s) 
+String::String(const String &s)
   : d(s.d)
 {
   d->ref();
@@ -210,7 +210,7 @@ String::String(const wstring &s, Type t)
   : d(new StringPrivate())
 {
   if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
-    // This looks ugly but needed for the compatibility with TagLib1.8. 
+    // This looks ugly but needed for the compatibility with TagLib1.8.
     // Should be removed in TabLib2.0.
     if (t == UTF16BE)
       t = WCharByteOrder;
@@ -228,7 +228,7 @@ String::String(const wchar_t *s, Type t)
   : d(new StringPrivate())
 {
   if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
-    // This looks ugly but needed for the compatibility with TagLib1.8. 
+    // This looks ugly but needed for the compatibility with TagLib1.8.
     // Should be removed in TabLib2.0.
     if (t == UTF16BE)
       t = WCharByteOrder;
@@ -278,11 +278,11 @@ String::String(const ByteVector &v, Type t)
   if(v.isEmpty())
     return;
 
-  if(t == Latin1) 
+  if(t == Latin1)
     copyFromLatin1(v.data(), v.size());
-  else if(t == UTF8) 
+  else if(t == UTF8)
     copyFromUTF8(v.data(), v.size());
-  else 
+  else
     copyFromUTF16(v.data(), v.size(), t);
 
   // If we hit a null in the ByteVector, shrink the string again.
@@ -299,25 +299,8 @@ String::~String()
 
 std::string String::to8Bit(bool unicode) const
 {
-  std::string s;
-
-  if(!unicode) {
-    s.resize(d->data.size());
-
-    std::string::iterator targetIt = s.begin();
-    for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
-      *targetIt = static_cast<char>(*it);
-      ++targetIt;
-    }
-  }
-  else {
-    s.resize(d->data.size() * 4 + 1);
-
-    UTF16toUTF8(&d->data[0], d->data.size(), &s[0], s.size());
-    s.resize(::strlen(s.c_str()));
-  }
-
-  return s;
+  const ByteVector v = data(unicode ? UTF8 : Latin1);
+  return std::string(v.data(), v.size());
 }
 
 TagLib::wstring String::toWString() const
@@ -444,7 +427,7 @@ bool String::isNull() const
 
 ByteVector String::data(Type t) const
 {
-  switch(t) 
+  switch(t)
   {
   case Latin1:
     {
@@ -457,14 +440,18 @@ ByteVector String::data(Type t) const
       return v;
     }
   case UTF8:
+    if(!d->data.empty())
     {
       ByteVector v(size() * 4 + 1, 0);
 
-      UTF16toUTF8(&d->data[0], d->data.size(), v.data(), v.size());
+      UTF16toUTF8(d->data.c_str(), d->data.size(), v.data(), v.size());
       v.resize(::strlen(v.data()));
 
       return v;
     }
+    else {
+      return ByteVector::null;
+    }
   case UTF16:
     {
       ByteVector v(2 + size() * 2, 0);
@@ -510,7 +497,7 @@ ByteVector String::data(Type t) const
   default:
     {
       debug("String::data() - Invalid Type value.");
-      return ByteVector();
+      return ByteVector::null;
     }
   }
 }
@@ -799,9 +786,9 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
 {
   bool swap;
   if(t == UTF16) {
-    if(length >= 1 && s[0] == 0xfeff) 
+    if(length >= 1 && s[0] == 0xfeff)
       swap = false; // Same as CPU endian. No need to swap bytes.
-    else if(length >= 1 && s[0] == 0xfffe) 
+    else if(length >= 1 && s[0] == 0xfffe)
       swap = true;  // Not same as CPU endian. Need to swap bytes.
     else {
       debug("String::copyFromUTF16() - Invalid UTF16 string.");
@@ -811,7 +798,7 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
     s++;
     length--;
   }
-  else 
+  else
     swap = (t != WCharByteOrder);
 
   d->data.resize(length);
@@ -836,9 +823,9 @@ void String::copyFromUTF16(const char *s, size_t length, Type t)
     ushort bom;
     ::memcpy(&bom, s, 2);
 
-    if(bom == 0xfeff) 
+    if(bom == 0xfeff)
       swap = false; // Same as CPU endian. No need to swap bytes.
-    else if(bom == 0xfffe) 
+    else if(bom == 0xfffe)
       swap = true;  // Not same as CPU endian. Need to swap bytes.
     else {
       debug("String::copyFromUTF16() - Invalid UTF16 string.");
@@ -848,7 +835,7 @@ void String::copyFromUTF16(const char *s, size_t length, Type t)
     s += 2;
     length -= 2;
   }
-  else 
+  else
     swap = (t != WCharByteOrder);
 
   d->data.resize(length / 2);
@@ -858,7 +845,7 @@ void String::copyFromUTF16(const char *s, size_t length, Type t)
   }
 }
 
-const String::Type String::WCharByteOrder 
+const String::Type String::WCharByteOrder
   = (Utils::SystemByteOrder == Utils::BigEndian) ? String::UTF16BE : String::UTF16LE;
 
 }
diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h
index 21e8518a..2cff5a4b 100644
--- a/taglib/toolkit/tstring.h
+++ b/taglib/toolkit/tstring.h
@@ -135,7 +135,7 @@ namespace TagLib {
     /*!
      * Makes a deep copy of the data in \a s.
      *
-     * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless 
+     * \note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
      * of the CPU byte order.  If UTF16BE, it will not be swapped.  This behavior
      * will be changed in TagLib2.0.
      */
@@ -144,7 +144,7 @@ namespace TagLib {
     /*!
      * Makes a deep copy of the data in \a s.
      *
-     * /note If \a t is UTF16LE, the byte order of \a s will be swapped regardless 
+     * \note If \a t is UTF16LE, the byte order of \a s will be swapped regardless
      * of the CPU byte order.  If UTF16BE, it will not be swapped.  This behavior
      * will be changed in TagLib2.0.
      */
@@ -186,7 +186,7 @@ namespace TagLib {
     virtual ~String();
 
     /*!
-     * Returns a deep copy of this String as an std::string.  The returned string 
+     * Returns a deep copy of this String as an std::string.  The returned string
      * is encoded in UTF8 if \a unicode is true, otherwise Latin1.
      *
      * \see toCString()
@@ -194,7 +194,7 @@ namespace TagLib {
     std::string to8Bit(bool unicode = false) const;
 
     /*!
-     * Returns a deep copy of this String as a wstring.  The returned string is 
+     * Returns a deep copy of this String as a wstring.  The returned string is
      * encoded in UTF-16 (without BOM/CPU byte order).
      *
      * \see toCWString()
@@ -202,43 +202,43 @@ namespace TagLib {
     wstring toWString() const;
 
     /*!
-     * Creates and returns a standard C-style (null-terminated) version of this 
-     * String.  The returned string is encoded in UTF8 if \a unicode is true, 
+     * Creates and returns a standard C-style (null-terminated) version of this
+     * String.  The returned string is encoded in UTF8 if \a unicode is true,
      * otherwise Latin1.
-     * 
-     * The returned string is still owned by this String and should not be deleted 
+     *
+     * The returned string is still owned by this String and should not be deleted
      * by the user.
      *
-     * The returned pointer remains valid until this String instance is destroyed 
+     * The returned pointer remains valid until this String instance is destroyed
      * or toCString() is called again.
      *
      * \warning This however has the side effect that the returned string will remain
-     * in memory <b>in addition to</b> other memory that is consumed by this 
+     * in memory <b>in addition to</b> other memory that is consumed by this
      * String instance.  So, this method should not be used on large strings or
      * where memory is critical.  Consider using to8Bit() instead to avoid it.
      *
      * \see to8Bit()
      */
     const char *toCString(bool unicode = false) const;
-    
+
     /*!
-     * Returns a standard C-style (null-terminated) wide character version of 
-     * this String.  The returned string is encoded in UTF-16 (without BOM/CPU byte 
+     * Returns a standard C-style (null-terminated) wide character version of
+     * this String.  The returned string is encoded in UTF-16 (without BOM/CPU byte
      * order).
-     * 
-     * The returned string is still owned by this String and should not be deleted 
+     *
+     * The returned string is still owned by this String and should not be deleted
      * by the user.
      *
-     * The returned pointer remains valid until this String instance is destroyed 
+     * The returned pointer remains valid until this String instance is destroyed
      * or any other method of this String is called.
      *
-     * \note This returns a pointer to the String's internal data without any 
+     * \note This returns a pointer to the String's internal data without any
      * conversions.
      *
      * \see toWString()
      */
     const wchar_t *toCWString() const;
-    
+
     /*!
      * Returns an iterator pointing to the beginning of the string.
      */
@@ -333,6 +333,8 @@ namespace TagLib {
      * Returns a ByteVector containing the string's data.  If \a t is Latin1 or
      * UTF8, this will return a vector of 8 bit characters, otherwise it will use
      * 16 bit characters.
+     *
+     * \note The returned data is not null terminated.
      */
     ByteVector data(Type t) const;
 
@@ -484,31 +486,31 @@ namespace TagLib {
 
   private:
     /*!
-     * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order) 
+     * Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order)
      * and copies it to the internal buffer.
      */
     void copyFromLatin1(const char *s, size_t length);
 
     /*!
-     * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order) 
+     * Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order)
      * and copies it to the internal buffer.
      */
     void copyFromUTF8(const char *s, size_t length);
 
     /*!
-     * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into 
+     * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
      * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
      */
     void copyFromUTF16(const wchar_t *s, size_t length, Type t);
 
     /*!
-     * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into 
+     * Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
      * \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
      */
     void copyFromUTF16(const char *s, size_t length, Type t);
-    
+
     /*!
-     * Indicates which byte order of UTF-16 is used to store strings internally. 
+     * Indicates which byte order of UTF-16 is used to store strings internally.
      *
      * \note \e String::UTF16BE or \e String::UTF16LE
      */
diff --git a/tests/test_string.cpp b/tests/test_string.cpp
index 9a574b39..1a20ed6f 100644
--- a/tests/test_string.cpp
+++ b/tests/test_string.cpp
@@ -43,6 +43,7 @@ class TestString : public CppUnit::TestFixture
   CPPUNIT_TEST(testToInt);
   CPPUNIT_TEST(testSubstr);
   CPPUNIT_TEST(testNewline);
+  CPPUNIT_TEST(testEncode);
   CPPUNIT_TEST_SUITE_END();
 
 public:
@@ -242,6 +243,43 @@ public:
     CPPUNIT_ASSERT_EQUAL(L'\x0a', String(crlf)[4]);
   }
 
+  void testEncode()
+  {
+    String jpn(L"\u65E5\u672C\u8A9E");
+    ByteVector jpn1 = jpn.data(String::Latin1);
+    ByteVector jpn2 = jpn.data(String::UTF8);
+    ByteVector jpn3 = jpn.data(String::UTF16);
+    ByteVector jpn4 = jpn.data(String::UTF16LE);
+    ByteVector jpn5 = jpn.data(String::UTF16BE);
+    std::string jpn6 = jpn.to8Bit(false);
+    std::string jpn7 = jpn.to8Bit(true);
+
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x2C\x9E"), jpn1);
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn2);
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE\xE5\x65\x2C\x67\x9E\x8A"), jpn3);
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\xE5\x65\x2C\x67\x9E\x8A"), jpn4);
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\x65\xE5\x67\x2C\x8A\x9E"), jpn5);
+    CPPUNIT_ASSERT_EQUAL(std::string("\xE5\x2C\x9E"), jpn6);
+    CPPUNIT_ASSERT_EQUAL(std::string("\xE6\x97\xA5\xE6\x9C\xAC\xE8\xAA\x9E"), jpn7);
+
+    String empty;
+    ByteVector empty1 = empty.data(String::Latin1);
+    ByteVector empty2 = empty.data(String::UTF8);
+    ByteVector empty3 = empty.data(String::UTF16);
+    ByteVector empty4 = empty.data(String::UTF16LE);
+    ByteVector empty5 = empty.data(String::UTF16BE);
+    std::string empty6 = empty.to8Bit(false);
+    std::string empty7 = empty.to8Bit(true);
+
+    CPPUNIT_ASSERT(empty1.isEmpty());
+    CPPUNIT_ASSERT(empty2.isEmpty());
+    CPPUNIT_ASSERT_EQUAL(ByteVector("\xFF\xFE"), empty3);
+    CPPUNIT_ASSERT(empty4.isEmpty());
+    CPPUNIT_ASSERT(empty5.isEmpty());
+    CPPUNIT_ASSERT(empty6.empty());
+    CPPUNIT_ASSERT(empty7.empty());
+  }
+
 };
 
 CPPUNIT_TEST_SUITE_REGISTRATION(TestString);

From d0f3e9b1865ae3b3bc0716b6fbdb7ce7d2f6f961 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Fri, 24 Jan 2014 09:48:39 +0900
Subject: [PATCH 2/3] Use std::string::c_str() rather than &s[0] where a const
 pointer is required.

---
 taglib/toolkit/tstring.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index b6148e69..30f70d2c 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -198,9 +198,9 @@ String::String(const std::string &s, Type t)
   : d(new StringPrivate())
 {
   if(t == Latin1)
-    copyFromLatin1(&s[0], s.length());
+    copyFromLatin1(s.c_str(), s.length());
   else if(t == String::UTF8)
-    copyFromUTF8(&s[0], s.length());
+    copyFromUTF8(s.c_str(), s.length());
   else {
     debug("String::String() -- A std::string should not contain UTF16.");
   }
@@ -778,8 +778,10 @@ void String::copyFromUTF8(const char *s, size_t length)
 {
   d->data.resize(length);
 
-  UTF8toUTF16(s, length, &d->data[0], d->data.size());
-  d->data.resize(::wcslen(d->data.c_str()));
+  if(length >  0) {
+    UTF8toUTF16(s, length, &d->data[0], d->data.size());
+    d->data.resize(::wcslen(d->data.c_str()));
+  }
 }
 
 void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)

From 167513ae57cd2625ab08ce555e4a380171cb2e3d Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Wed, 5 Feb 2014 10:35:07 +0900
Subject: [PATCH 3/3] Avoid using &d->data[0] when d->data is empty.

---
 taglib/toolkit/tstring.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 30f70d2c..774642a8 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -804,11 +804,13 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
     swap = (t != WCharByteOrder);
 
   d->data.resize(length);
-  memcpy(&d->data[0], s, length * sizeof(wchar_t));
+  if(length > 0) {
+    memcpy(&d->data[0], s, length * sizeof(wchar_t));
 
-  if(swap) {
-    for(size_t i = 0; i < length; ++i)
-      d->data[i] = Utils::byteSwap(static_cast<ushort>(s[i]));
+    if(swap) {
+      for(size_t i = 0; i < length; ++i)
+        d->data[i] = Utils::byteSwap(static_cast<ushort>(s[i]));
+    }
   }
 }