From 9b19453059c3e2da9b79ab2eb56ce6ea7a3d65ab Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Sun, 17 Mar 2013 12:51:00 +0900
Subject: [PATCH 1/9] Some improvements of String

---
 taglib/toolkit/tstring.cpp | 390 +++++++++++++++++--------------------
 taglib/toolkit/tstring.h   |  46 +++--
 taglib/toolkit/unicode.h   |   4 +-
 3 files changed, 215 insertions(+), 225 deletions(-)
diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index e3320951..02059fff 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -29,47 +29,65 @@
 #include "tstringlist.h"
 
 #include <iostream>
-
 #include <string.h>
 
-namespace TagLib {
+using namespace TagLib;
+
+namespace {
 
   inline unsigned short byteSwap(unsigned short x)
   {
+#if defined(_MSC_VER) && (_MSC_VER >= 1400)  // VC++2005 or later
+
+    return _byteswap_ushort(x);
+
+#else
+
     return (((x) >> 8) & 0xff) | (((x) & 0xff) << 8);
+
+#endif
   }
 
   inline unsigned short combine(unsigned char c1, unsigned char c2)
   {
     return (c1 << 8) | c2;
   }
+
+  String::Type wcharByteOrder() 
+  {
+    // Detect CPU endian.
+    union {
+      wchar_t w;
+      char c[2];
+    } x = { 0xfeff };
+
+    if(x.c[0] == 0xfe)
+      return String::UTF16BE;
+    else
+      return String::UTF16LE;
+  }
 }
 
-using namespace TagLib;
 
 class String::StringPrivate : public RefCounter
 {
 public:
   StringPrivate(const wstring &s) :
     RefCounter(),
-    data(s),
-    CString(0) {}
+    data(s) {}
 
   StringPrivate() :
-    RefCounter(),
-    CString(0) {}
-
-  ~StringPrivate() {
-    delete [] CString;
-  }
-
-  wstring data;
+    RefCounter() {}
 
   /*!
-   * This is only used to hold the a pointer to the most recent value of
-   * toCString.
+   * Stores the string in UTF-16. The byte order matches the CPU's endianness.
    */
-  char *CString;
+  TagLib::wstring data;
+
+  /*!
+   * This is only used to hold the most recent value of toCString().
+   */
+  std::string cstring;
 };
 
 String String::null;
@@ -90,74 +108,72 @@ String::String(const std::string &s, Type t)
 {
   d = new StringPrivate;
 
-  if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
+  if(t == Latin1)
+    copyFromLatin1(&s[0], s.length());
+  else if(t == String::UTF8)
+    copyFromUTF8(&s[0], s.length());
+  else {
     debug("String::String() -- A std::string should not contain UTF16.");
-    return;
   }
-
-  d->data.resize(s.length());
-  wstring::iterator targetIt = d->data.begin();
-
-  for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
-    *targetIt = uchar(*it);
-    ++targetIt;
-  }
-
-  prepare(t);
 }
 
 String::String(const wstring &s, Type t)
 {
-  d = new StringPrivate(s);
-  prepare(t);
+  d = new StringPrivate;
+
+  if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+    copyFromUTF16(s.c_str(), s.length(), t);
+  else {
+    debug("String::String() -- A TagLib::wstring should not contain Latin1 or UTF-8.");
+  }
 }
 
 String::String(const wchar_t *s, Type t)
 {
-  d = new StringPrivate(s);
-  prepare(t);
+  d = new StringPrivate;
+
+  if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+    copyFromUTF16(s, ::wcslen(s), t);
+  else {
+    debug("String::String() -- A const wchar_t * should not contain Latin1 or UTF-8.");
+  }
 }
 
 String::String(const char *s, Type t)
 {
   d = new StringPrivate;
 
-  if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
+  if(t == Latin1)
+    copyFromLatin1(s, ::strlen(s));
+  else if(t == String::UTF8)
+    copyFromUTF8(s, ::strlen(s));
+  else {
     debug("String::String() -- A const char * should not contain UTF16.");
-    return;
   }
-
-  const size_t length = ::strlen(s);
-  d->data.resize(length);
-
-  wstring::iterator targetIt = d->data.begin();
-
-  for(size_t i = 0; i < length; i++) {
-    *targetIt = uchar(s[i]);
-    ++targetIt;
-  }
-
-  prepare(t);
 }
 
 String::String(wchar_t c, Type t)
 {
   d = new StringPrivate;
-  d->data += c;
-  prepare(t);
+  
+  if(t == UTF16 || t == UTF16BE || t == UTF16LE)
+    copyFromUTF16(&c, 1, t);
+  else {
+    debug("String::String() -- A const wchar_t should not contain Latin1 or UTF-8.");
+  }
 }
 
 String::String(char c, Type t)
 {
   d = new StringPrivate;
 
-  if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
-    debug("String::String() -- A std::string should not contain UTF16.");
-    return;
+  if(t == Latin1 || t == UTF8) {
+    d->data.resize(1);
+    d->data[0] = static_cast<uchar>(c);
+  }
+  else {
+    debug("String::String() -- A char  should not contain UTF16.");
   }
-
-  d->data += uchar(c);
-  prepare(t);
 }
 
 String::String(const ByteVector &v, Type t)
@@ -167,31 +183,12 @@ String::String(const ByteVector &v, Type t)
   if(v.isEmpty())
     return;
 
-  if(t == Latin1 || t == UTF8) {
-
-    int length = 0;
-    d->data.resize(v.size());
-    wstring::iterator targetIt = d->data.begin();
-    for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
-      *targetIt = uchar(*it);
-      ++targetIt;
-      ++length;
-    }
-    d->data.resize(length);
-  }
-  else  {
-    d->data.resize(v.size() / 2);
-    wstring::iterator targetIt = d->data.begin();
-
-    for(ByteVector::ConstIterator it = v.begin();
-        it != v.end() && it + 1 != v.end() && combine(*it, *(it + 1));
-        it += 2)
-    {
-      *targetIt = combine(*it, *(it + 1));
-      ++targetIt;
-    }
-  }
-  prepare(t);
+  if(t == Latin1) 
+    copyFromLatin1(v.data(), v.size());
+  else if(t == UTF8) 
+    copyFromUTF8(v.data(), v.size());
+  else 
+    copyFromUTF16(v.data(), v.size(), t);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -205,72 +202,46 @@ String::~String()
 std::string String::to8Bit(bool unicode) const
 {
   std::string s;
-  s.resize(d->data.size());
 
   if(!unicode) {
+    s.resize(d->data.size());
+
     std::string::iterator targetIt = s.begin();
     for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
-      *targetIt = char(*it);
+      *targetIt = static_cast<char>(*it);
       ++targetIt;
     }
-    return s;
   }
+  else {
+    s.resize(d->data.size() * 4 + 1);
 
-  const size_t outputBufferSize = d->data.size() * 3 + 1;
+    const Unicode::UTF16 *source = &d->data[0];
+    Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
 
-  Unicode::UTF16 *sourceBuffer = new Unicode::UTF16[d->data.size() + 1];
-  Unicode::UTF8  *targetBuffer = new Unicode::UTF8[outputBufferSize];
+    Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
+      &source, source + d->data.size(),
+      &target, target + s.size(),
+      Unicode::lenientConversion);
 
-  for(size_t i = 0; i < d->data.size(); i++)
-    sourceBuffer[i] = Unicode::UTF16(d->data[i]);
+    if(result != Unicode::conversionOK) {
+      debug("String::to8Bit() - Unicode conversion error.");
+    }
 
-  const Unicode::UTF16 *source = sourceBuffer;
-  Unicode::UTF8 *target = targetBuffer;
-
-  Unicode::ConversionResult result =
-    Unicode::ConvertUTF16toUTF8(&source, sourceBuffer + d->data.size(),
-                                &target, targetBuffer + outputBufferSize,
-                                Unicode::lenientConversion);
-
-  if(result != Unicode::conversionOK) {
-    debug("String::to8Bit() - Unicode conversion error.");
+    s.resize(::strlen(s.c_str()));
   }
 
-  const size_t newSize = target - targetBuffer;
-  s.resize(newSize);
-  targetBuffer[newSize] = 0;
-
-  s = (char *) targetBuffer;
-
-  delete [] sourceBuffer;
-  delete [] targetBuffer;
-
   return s;
 }
 
-TagLib::wstring String::toWString() const
+const TagLib::wstring &String::toWString() const
 {
   return d->data;
 }
 
 const char *String::toCString(bool unicode) const
 {
-  delete [] d->CString;
-
-  std::string buffer = to8Bit(unicode);
-  d->CString = new char[buffer.size() + 1];
-
-#if defined(_MSC_VER) && (_MSC_VER >= 1400)  // VC++2005 or later
-
-  strcpy_s(d->CString, buffer.size() + 1, buffer.c_str());
-
-#else
-
-  strcpy(d->CString, buffer.c_str());
-
-#endif                                          
-
-  return d->CString;
+  d->cstring = to8Bit(unicode);
+  return d->cstring.c_str();
 }
 
 String::Iterator String::begin()
@@ -552,14 +523,14 @@ String String::number(int n) // static
   return s;
 }
 
-TagLib::wchar &String::operator[](int i)
+TagLib::wchar &String::operator[](size_t i)
 {
   detach();
 
   return d->data[i];
 }
 
-const TagLib::wchar &String::operator[](int i) const
+const TagLib::wchar &String::operator[](size_t i) const
 {
   return d->data[i];
 }
@@ -633,14 +604,7 @@ String &String::operator=(const std::string &s)
     delete d;
 
   d = new StringPrivate;
-
-  d->data.resize(s.size());
-
-  wstring::iterator targetIt = d->data.begin();
-  for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
-    *targetIt = uchar(*it);
-    ++targetIt;
-  }
+  copyFromLatin1(s.c_str(), s.length());
 
   return *this;
 }
@@ -649,7 +613,9 @@ String &String::operator=(const wstring &s)
 {
   if(d->deref())
     delete d;
+  
   d = new StringPrivate(s);
+
   return *this;
 }
 
@@ -657,7 +623,10 @@ String &String::operator=(const wchar_t *s)
 {
   if(d->deref())
     delete d;
-  d = new StringPrivate(s);
+  
+  d = new StringPrivate;
+  copyFromUTF16(s, ::wcslen(s), WCharByteOrder);
+
   return *this;
 }
 
@@ -665,8 +634,11 @@ String &String::operator=(char c)
 {
   if(d->deref())
     delete d;
+  
   d = new StringPrivate;
-  d->data += uchar(c);
+  d->data.resize(1);
+  d->data[0] = static_cast<uchar>(c);
+
   return *this;
 }
 
@@ -674,8 +646,11 @@ String &String::operator=(wchar_t c)
 {
   if(d->deref())
     delete d;
+
   d = new StringPrivate;
-  d->data += c;
+  d->data.resize(1);
+  d->data[0] = c;
+
   return *this;
 }
 
@@ -685,15 +660,7 @@ String &String::operator=(const char *s)
     delete d;
 
   d = new StringPrivate;
-
-  const size_t length = ::strlen(s);
-  d->data.resize(length);
-
-  wstring::iterator targetIt = d->data.begin();
-  for(size_t i = 0; i < length; i++) {
-    *targetIt = uchar(s[i]);
-    ++targetIt;
-  }
+  copyFromLatin1(s, ::strlen(s));
 
   return *this;
 }
@@ -704,20 +671,10 @@ String &String::operator=(const ByteVector &v)
     delete d;
 
   d = new StringPrivate;
-  d->data.resize(v.size());
-  wstring::iterator targetIt = d->data.begin();
-
-  uint i = 0;
-
-  for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
-    *targetIt = uchar(*it);
-    ++targetIt;
-    ++i;
-  }
+  copyFromLatin1(v.data(), v.size());
 
   // If we hit a null in the ByteVector, shrink the string again.
-
-  d->data.resize(i);
+  d->data.resize(::wcslen(d->data.c_str()));
 
   return *this;
 }
@@ -743,70 +700,81 @@ void String::detach()
 // private members
 ////////////////////////////////////////////////////////////////////////////////
 
-void String::prepare(Type t)
+
+void String::copyFromLatin1(const char *s, size_t length)
 {
-  switch(t) {
-  case UTF16:
-  {
-    if(d->data.size() >= 1 && (d->data[0] == 0xfeff || d->data[0] == 0xfffe)) {
-      bool swap = d->data[0] != 0xfeff;
-      d->data.erase(d->data.begin(), d->data.begin() + 1);
-      if(swap) {
-        for(uint i = 0; i < d->data.size(); i++)
-          d->data[i] = byteSwap((unsigned short)d->data[i]);
-      }
-    }
-    else {
-      debug("String::prepare() - Invalid UTF16 string.");
-      d->data.erase(d->data.begin(), d->data.end());
-    }
-    break;
-  }
-  case UTF8:
-  {
-    const size_t bufferSize = d->data.size() + 1;
-    Unicode::UTF8  *sourceBuffer = new Unicode::UTF8[bufferSize];
-    Unicode::UTF16 *targetBuffer = new Unicode::UTF16[bufferSize];
+  d->data.resize(length);
 
-    unsigned int i = 0;
-    for(; i < d->data.size(); i++)
-      sourceBuffer[i] = Unicode::UTF8(d->data[i]);
-    sourceBuffer[i] = 0;
+  for(size_t i = 0; i < length; ++i)
+    d->data[i] = static_cast<uchar>(s[i]);
+}
 
-    const Unicode::UTF8 *source = sourceBuffer;
-    Unicode::UTF16 *target = targetBuffer;
+void String::copyFromUTF8(const char *s, size_t length)
+{
+  d->data.resize(length);
 
-    Unicode::ConversionResult result =
-      Unicode::ConvertUTF8toUTF16(&source, sourceBuffer + bufferSize,
-                                  &target, targetBuffer + bufferSize,
-                                  Unicode::lenientConversion);
+  const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
+  Unicode::UTF16 *target = &d->data[0];
 
-    if(result != Unicode::conversionOK) {
-      debug("String::prepare() - Unicode conversion error.");
-    }
+  Unicode::ConversionResult result = Unicode::ConvertUTF8toUTF16(
+    &source, source + length,
+    &target, target + length,
+    Unicode::lenientConversion);
 
-    const size_t newSize = target != targetBuffer ? target - targetBuffer - 1 : 0;
-    d->data.resize(newSize);
+  d->data.resize(::wcslen(d->data.c_str()));
 
-    for(size_t i = 0; i < newSize; i++)
-      d->data[i] = targetBuffer[i];
-
-    delete [] sourceBuffer;
-    delete [] targetBuffer;
-
-    break;
-  }
-  case UTF16LE:
-  {
-    for(uint i = 0; i < d->data.size(); i++)
-      d->data[i] = byteSwap((unsigned short)d->data[i]);
-    break;
-  }
-  default:
-    break;
+  if(result != Unicode::conversionOK) {
+    debug("String::prepare() - Unicode conversion error.");
   }
 }
 
+void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
+{
+  bool swap;
+  if(t == UTF16) {
+    if(length >= 1) {
+      if(s[0] == 0xfeff) 
+        swap = false; // Same as CPU endian. No need to swap bytes.
+      else if(s[0] == 0xfffe) 
+        swap = true;  // Not same as CPU endian. Need to swap bytes.
+      else {
+        debug("String::prepare() - Invalid UTF16 string.");
+        return;
+      }
+
+      s++;
+      length--;
+    }
+  }
+  else 
+    swap = (t != WCharByteOrder);
+
+  d->data.resize(length);
+  memcpy(&d->data[0], s, length * sizeof(wchar_t));
+
+  if(swap) {
+    for(size_t i = 0; i < length; ++i)
+      d->data[i] = byteSwap(static_cast<unsigned short>(s[i]));
+  }
+}
+
+void String::copyFromUTF16(const char *s, size_t length, Type t)
+{
+  if(sizeof(wchar_t) == 2) 
+    copyFromUTF16(reinterpret_cast<const wchar_t*>(s), length / 2, t);
+  else
+  {
+    std::vector<wchar_t> sourceBuffer(length / 2);
+    for(size_t i = 0; i < length / 2; ++i) {
+      sourceBuffer[i] = combine(*s, *(s + 1));
+      s += 2;
+    }
+  }
+}
+
+String::Type String::WCharByteOrder = wcharByteOrder();
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // related functions
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h
index a1ba164c..ee81f999 100644
--- a/taglib/toolkit/tstring.h
+++ b/taglib/toolkit/tstring.h
@@ -98,8 +98,7 @@ namespace TagLib {
        */
       UTF16 = 1,
       /*!
-       * UTF16 <i>big endian</i>.  16 bit characters.  This is the encoding used
-       * internally by TagLib.
+       * UTF16 <i>big endian</i>.  16 bit characters.  
        */
       UTF16BE = 2,
       /*!
@@ -135,12 +134,12 @@ namespace TagLib {
     /*!
      * Makes a deep copy of the data in \a s.
      */
-    String(const wstring &s, Type t = UTF16BE);
+    String(const wstring &s, Type t = WCharByteOrder);
 
     /*!
      * Makes a deep copy of the data in \a s.
      */
-    String(const wchar_t *s, Type t = UTF16BE);
+    String(const wchar_t *s, Type t = WCharByteOrder);
 
     /*!
      * Makes a deep copy of the data in \a c.
@@ -187,7 +186,7 @@ namespace TagLib {
     /*!
      * Returns a wstring version of the TagLib string as a wide string.
      */
-    wstring toWString() const;
+    const TagLib::wstring &toWString() const;
 
     /*!
      * Creates and returns a C-String based on the data.  This string is still
@@ -335,12 +334,12 @@ namespace TagLib {
     /*!
      * Returns a reference to the character at position \a i.
      */
-    wchar &operator[](int i);
+    wchar &operator[](size_t i);
 
     /*!
      * Returns a const reference to the character at position \a i.
      */
-    const wchar &operator[](int i) const;
+    const wchar &operator[](size_t i) const;
 
     /*!
      * Compares each character of the String with each character of \a s and
@@ -442,12 +441,35 @@ namespace TagLib {
 
   private:
     /*!
-     * This checks to see if the string is in \e UTF-16 (with BOM) or \e UTF-8
-     * format and if so converts it to \e UTF-16BE for internal use.  \e Latin1
-     * does not require conversion since it is a subset of \e UTF-16BE and
-     * \e UTF16-BE requires no conversion since it is used internally.
+     * Converts a \e Latin-1 string into \e UTF-16 (without BOM, in CPU byte
+     * order) and copies it to the internal buffer.
      */
-    void prepare(Type t);
+    void copyFromLatin1(const char *s, size_t length);
+
+    /*!
+     * Converts a \e UTF-8 string into \e UTF-16 (without BOM, in CPU byte
+     * order) and copies it to the internal buffer.
+     */
+    void copyFromUTF8(const char *s, size_t length);
+
+    /*!
+     * Converts a \e UTF-16 (with BOM), \e UTF-16LE or \e UTF-16BE string into
+     * \e UTF-16 (without BOM, in CPU byte order) and copies it to the internal buffer.
+     */
+    void copyFromUTF16(const wchar_t *s, size_t length, Type t);
+
+    /*!
+     * Converts a \e UTF-16 (with BOM), \e UTF-16LE or \e UTF-16BE string into
+     * \e UTF-16 (without BOM, in CPU byte order) and copies it to the internal buffer.
+     */
+    void copyFromUTF16(const char *s, size_t length, Type t);
+    
+    /*!
+     * Indicates which byte order of UTF-16 is used to store strings internally. 
+     *
+     * \note Set to \e UTF16BE or \e UTF16LE at run time.
+     */
+    static Type WCharByteOrder;
 
     class StringPrivate;
     StringPrivate *d;
diff --git a/taglib/toolkit/unicode.h b/taglib/toolkit/unicode.h
index cf7eb3c5..b9de0ea2 100644
--- a/taglib/toolkit/unicode.h
+++ b/taglib/toolkit/unicode.h
@@ -115,8 +115,8 @@
 namespace Unicode {
 
 typedef unsigned long	UTF32;	/* at least 32 bits */
-typedef unsigned short	UTF16;	/* at least 16 bits */
-typedef unsigned char	UTF8;	/* typically 8 bits */
+typedef wchar_t	      UTF16;	/* TagLib assumes that wchar_t is sufficient for UTF-16. */
+typedef unsigned char	UTF8;	  /* typically 8 bits */
 typedef unsigned char	Boolean; /* 0 or 1 */
 
 typedef enum {

From de19ad72abb247c19a6c200ff08aa70e23414def Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Sun, 17 Mar 2013 19:40:01 +0900
Subject: [PATCH 2/9] Fixed CPU endian detection

---
 taglib/toolkit/tstring.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 02059fff..e7caa40d 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -57,14 +57,14 @@ namespace {
   {
     // Detect CPU endian.
     union {
-      wchar_t w;
-      char c[2];
-    } x = { 0xfeff };
+      TagLib::ushort w;
+      char c;
+    } x = { 0x1234 };
 
-    if(x.c[0] == 0xfe)
-      return String::UTF16BE;
-    else
+    if(x.c == 0x34)
       return String::UTF16LE;
+    else
+      return String::UTF16BE;
   }
 }
 

From 86b7cabf4464ba683bce60ed41356d1fff4ebbb2 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Sun, 17 Mar 2013 20:00:05 +0900
Subject: [PATCH 3/9] Fix UTF-16 decoding where wchar_t is not 16-bit

---
 taglib/toolkit/tstring.cpp | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index e7caa40d..b8f50f3d 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -764,9 +764,28 @@ void String::copyFromUTF16(const char *s, size_t length, Type t)
     copyFromUTF16(reinterpret_cast<const wchar_t*>(s), length / 2, t);
   else
   {
-    std::vector<wchar_t> sourceBuffer(length / 2);
+    bool swap;
+    if(t == UTF16) {
+      if(length >= 2) {
+        if(*reinterpret_cast<const TagLib::ushort*>(s) == 0xfeff) 
+          swap = false; // Same as CPU endian. No need to swap bytes.
+        else if(*reinterpret_cast<const TagLib::ushort*>(s) == 0xfffe) 
+          swap = true;  // Not same as CPU endian. Need to swap bytes.
+        else {
+          debug("String::prepare() - Invalid UTF16 string.");
+          return;
+        }
+
+        s += 2;
+        length -= 2;
+      }
+    }
+    else 
+      swap = (t != WCharByteOrder);
+
+    d->data.resize(length / 2);
     for(size_t i = 0; i < length / 2; ++i) {
-      sourceBuffer[i] = combine(*s, *(s + 1));
+      d->data[i] = swap ? combine(*s, *(s + 1)) : combine(*(s + 1), *s);
       s += 2;
     }
   }

From 0792eedd12711c070f142a5284b7b0ea001376b9 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Sun, 17 Mar 2013 20:47:58 +0900
Subject: [PATCH 4/9] Fix UTF-16 BOM detection

---
 taglib/toolkit/tstring.cpp | 46 +++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index b8f50f3d..2b723100 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -724,7 +724,7 @@ void String::copyFromUTF8(const char *s, size_t length)
   d->data.resize(::wcslen(d->data.c_str()));
 
   if(result != Unicode::conversionOK) {
-    debug("String::prepare() - Unicode conversion error.");
+    debug("String::copyFromUTF8() - Unicode conversion error.");
   }
 }
 
@@ -732,19 +732,17 @@ void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
 {
   bool swap;
   if(t == UTF16) {
-    if(length >= 1) {
-      if(s[0] == 0xfeff) 
-        swap = false; // Same as CPU endian. No need to swap bytes.
-      else if(s[0] == 0xfffe) 
-        swap = true;  // Not same as CPU endian. Need to swap bytes.
-      else {
-        debug("String::prepare() - Invalid UTF16 string.");
-        return;
-      }
-
-      s++;
-      length--;
+    if(length >= 1 && s[0] == 0xfeff) 
+      swap = false; // Same as CPU endian. No need to swap bytes.
+    else if(length >= 1 && s[0] == 0xfffe) 
+      swap = true;  // Not same as CPU endian. Need to swap bytes.
+    else {
+      debug("String::copyFromUTF16() - Invalid UTF16 string.");
+      return;
     }
+
+    s++;
+    length--;
   }
   else 
     swap = (t != WCharByteOrder);
@@ -766,19 +764,17 @@ void String::copyFromUTF16(const char *s, size_t length, Type t)
   {
     bool swap;
     if(t == UTF16) {
-      if(length >= 2) {
-        if(*reinterpret_cast<const TagLib::ushort*>(s) == 0xfeff) 
-          swap = false; // Same as CPU endian. No need to swap bytes.
-        else if(*reinterpret_cast<const TagLib::ushort*>(s) == 0xfffe) 
-          swap = true;  // Not same as CPU endian. Need to swap bytes.
-        else {
-          debug("String::prepare() - Invalid UTF16 string.");
-          return;
-        }
-
-        s += 2;
-        length -= 2;
+      if(length >= 2 && *reinterpret_cast<const TagLib::ushort*>(s) == 0xfeff) 
+        swap = false; // Same as CPU endian. No need to swap bytes.
+      else if(length >= 2 && *reinterpret_cast<const TagLib::ushort*>(s) == 0xfffe) 
+        swap = true;  // Not same as CPU endian. Need to swap bytes.
+      else {
+        debug("String::copyFromUTF16() - Invalid UTF16 string.");
+        return;
       }
+
+      s += 2;
+      length -= 2;
     }
     else 
       swap = (t != WCharByteOrder);

From 6e3639de9e4a6b984dbcdaa15f109b8256cf7fa6 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Mon, 18 Mar 2013 02:51:11 +0900
Subject: [PATCH 5/9] Avoid creating new String object when comparing

---
 taglib/toolkit/tstring.cpp | 25 +++++++++++++++++++++----
 taglib/toolkit/tstring.h   | 14 +++++++++++++-
 tests/test_string.cpp      |  2 +-
 3 files changed, 35 insertions(+), 6 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 2b723100..e88b70f7 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -330,9 +330,10 @@ String &String::append(const String &s)
 
 String String::upper() const
 {
-  String s;
+  static const int shift = 'A' - 'a';
 
-  static int shift = 'A' - 'a';
+  String s;
+  s.d->data.reserve(d->data.size());
 
   for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); ++it) {
     if(*it >= 'a' && *it <= 'z')
@@ -537,7 +538,24 @@ const TagLib::wchar &String::operator[](size_t i) const
 
 bool String::operator==(const String &s) const
 {
-  return d == s.d || d->data == s.d->data;
+  return (d == s.d || d->data == s.d->data);
+}
+
+bool String::operator==(const char *s) const  // true iff s (Latin-1, NUL-terminated) matches every character exactly
+{
+  for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
+    if(*s == '\0' || *it != static_cast<uchar>(*s))  // s ended early (never step past its terminator) or mismatch
+      return false;
+
+    s++;
+  }
+
+  return *s == '\0';  // equal only if s is exhausted too; the String being a mere prefix of s is not a match
+}
+
+bool String::operator==(const wchar_t *s) const
+{
+  return (d->data == s);
 }
 
 bool String::operator!=(const String &s) const
@@ -700,7 +718,6 @@ void String::detach()
 // private members
 ////////////////////////////////////////////////////////////////////////////////
 
-
 void String::copyFromLatin1(const char *s, size_t length)
 {
   d->data.resize(length);
diff --git a/taglib/toolkit/tstring.h b/taglib/toolkit/tstring.h
index ee81f999..c502e56c 100644
--- a/taglib/toolkit/tstring.h
+++ b/taglib/toolkit/tstring.h
@@ -342,11 +342,23 @@ namespace TagLib {
     const wchar &operator[](size_t i) const;
 
     /*!
-     * Compares each character of the String with each character of \a s and
+     * Compares each character of the String with each character in \a s and
      * returns true if the strings match.
      */
     bool operator==(const String &s) const;
 
+    /*!
+     * Compares each character of the String with each character in \a s and
+     * returns true if the strings match.
+     */
+    bool operator==(const char *s) const;
+
+    /*!
+     * Compares each character of the String with each character of \a s and
+     * returns true if the strings match.
+     */
+    bool operator==(const wchar_t *s) const;
+
     /*!
      * Compares each character of the String with each character of \a s and
      * returns false if the strings match.
diff --git a/tests/test_string.cpp b/tests/test_string.cpp
index 1e37d7a2..c67fa41e 100644
--- a/tests/test_string.cpp
+++ b/tests/test_string.cpp
@@ -116,7 +116,7 @@ public:
     CPPUNIT_ASSERT_EQUAL(a, String(d, String::UTF16));
   }
 
-  // this test is expected to print "TagLib: String::prepare() -
+  // this test is expected to print "TagLib: String::copyFromUTF16() -
   // Invalid UTF16 string." on the console 3 times
   void testUTF16DecodeInvalidBOM()
   {

From 19ce4d0dfa8c13255f56342f8c179c6a601d9b2d Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Mon, 18 Mar 2013 05:56:48 +0900
Subject: [PATCH 6/9] Use the standard library to convert between UTF-8 and
 UTF-16 where possible

---
 taglib/toolkit/tstring.cpp | 88 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 84 insertions(+), 4 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index e88b70f7..530a9c6f 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -31,6 +31,24 @@
 #include <iostream>
 #include <string.h>
 
+// Determine if the compiler supports codecvt.
+
+#ifndef __has_include       
+# define __has_include(x) 0 
+#endif
+
+#if (((defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__))  /* GCC with -std=c++0x option */  \
+  || (defined(_MSC_VER) && _MSC_VER >= 1600)))                    /* VC++2010 or later */           \
+  || (defined(__has_include) && __has_include(<codecvt>))         /* Clang has <codecvt> */
+
+# define TAGLIB_USE_CODECVT
+#endif
+
+#ifdef TAGLIB_USE_CODECVT
+# include <codecvt>
+  typedef std::codecvt_utf8_utf16<wchar_t> utf8_utf16_t;
+#endif
+
 using namespace TagLib;
 
 namespace {
@@ -215,6 +233,20 @@ std::string String::to8Bit(bool unicode) const
   else {
     s.resize(d->data.size() * 4 + 1);
 
+#ifdef TAGLIB_USE_CODECVT
+
+    std::mbstate_t st = 0;
+    const wchar_t *source;
+    char *target;
+    utf8_utf16_t::result result = utf8_utf16_t().out(
+      st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target);
+
+    if(result != utf8_utf16_t::ok) {
+      debug("String::copyFromUTF8() - Unicode conversion error.");
+    }
+
+#else
+
     const Unicode::UTF16 *source = &d->data[0];
     Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
 
@@ -227,6 +259,8 @@ std::string String::to8Bit(bool unicode) const
       debug("String::to8Bit() - Unicode conversion error.");
     }
 
+#endif
+
     s.resize(::strlen(s.c_str()));
   }
 
@@ -379,8 +413,38 @@ ByteVector String::data(Type t) const
   }
   case UTF8:
   {
-    std::string s = to8Bit(true);
-    v.setData(s.c_str(), static_cast<TagLib::uint>(s.length()));
+    v.resize(d->data.size() * 4 + 1);
+
+#ifdef TAGLIB_USE_CODECVT
+
+    std::mbstate_t st = 0;
+    const wchar_t *source;
+    char *target;
+    utf8_utf16_t::result result = utf8_utf16_t().out(
+      st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target);
+
+    if(result != utf8_utf16_t::ok) {
+      debug("String::copyFromUTF8() - Unicode conversion error.");
+    }
+
+#else
+
+    const Unicode::UTF16 *source = &d->data[0];
+    Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(v.data());
+
+    Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
+      &source, source + d->data.size(),
+      &target, target + v.size(),
+      Unicode::lenientConversion);
+
+    if(result != Unicode::conversionOK) {
+      debug("String::to8Bit() - Unicode conversion error.");
+    }
+
+#endif
+
+    v.resize(::strlen(v.data()) + 1);
+
     break;
   }
   case UTF16:
@@ -730,6 +794,20 @@ void String::copyFromUTF8(const char *s, size_t length)
 {
   d->data.resize(length);
 
+#ifdef TAGLIB_USE_CODECVT
+
+  std::mbstate_t st = 0;
+  const char *source;
+  wchar_t *target;
+  utf8_utf16_t::result result = utf8_utf16_t().in(
+    st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target);
+
+  if(result != utf8_utf16_t::ok) {
+    debug("String::copyFromUTF8() - Unicode conversion error.");
+  }
+
+#else
+  
   const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
   Unicode::UTF16 *target = &d->data[0];
 
@@ -738,11 +816,13 @@ void String::copyFromUTF8(const char *s, size_t length)
     &target, target + length,
     Unicode::lenientConversion);
 
-  d->data.resize(::wcslen(d->data.c_str()));
-
   if(result != Unicode::conversionOK) {
     debug("String::copyFromUTF8() - Unicode conversion error.");
   }
+
+#endif
+
+  d->data.resize(::wcslen(d->data.c_str()));
 }
 
 void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)

From a842220fe6f3211d046b7718c8aba7e0b0f23acf Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Mon, 18 Mar 2013 06:08:05 +0900
Subject: [PATCH 7/9] Revert "Use the standard library to convert between UTF-8
 and UTF-16 where possible"

This reverts commit 19ce4d0dfa8c13255f56342f8c179c6a601d9b2d.
---
 taglib/toolkit/tstring.cpp | 88 ++------------------------------------
 1 file changed, 4 insertions(+), 84 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 530a9c6f..e88b70f7 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -31,24 +31,6 @@
 #include <iostream>
 #include <string.h>
 
-// Determine if the compiler supports codecvt.
-
-#ifndef __has_include       
-# define __has_include(x) 0 
-#endif
-
-#if (((defined(__GNUC__) && defined(__GXX_EXPERIMENTAL_CXX0X__))  /* GCC with -std=c++0x option */  \
-  || (defined(_MSC_VER) && _MSC_VER >= 1600)))                    /* VC++2010 or later */           \
-  || (defined(__has_include) && __has_include(<codecvt>))         /* Clang has <codecvt> */
-
-# define TAGLIB_USE_CODECVT
-#endif
-
-#ifdef TAGLIB_USE_CODECVT
-# include <codecvt>
-  typedef std::codecvt_utf8_utf16<wchar_t> utf8_utf16_t;
-#endif
-
 using namespace TagLib;
 
 namespace {
@@ -233,20 +215,6 @@ std::string String::to8Bit(bool unicode) const
   else {
     s.resize(d->data.size() * 4 + 1);
 
-#ifdef TAGLIB_USE_CODECVT
-
-    std::mbstate_t st = 0;
-    const wchar_t *source;
-    char *target;
-    utf8_utf16_t::result result = utf8_utf16_t().out(
-      st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target);
-
-    if(result != utf8_utf16_t::ok) {
-      debug("String::copyFromUTF8() - Unicode conversion error.");
-    }
-
-#else
-
     const Unicode::UTF16 *source = &d->data[0];
     Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
 
@@ -259,8 +227,6 @@ std::string String::to8Bit(bool unicode) const
       debug("String::to8Bit() - Unicode conversion error.");
     }
 
-#endif
-
     s.resize(::strlen(s.c_str()));
   }
 
@@ -413,38 +379,8 @@ ByteVector String::data(Type t) const
   }
   case UTF8:
   {
-    v.resize(d->data.size() * 4 + 1);
-
-#ifdef TAGLIB_USE_CODECVT
-
-    std::mbstate_t st = 0;
-    const wchar_t *source;
-    char *target;
-    utf8_utf16_t::result result = utf8_utf16_t().out(
-      st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target);
-
-    if(result != utf8_utf16_t::ok) {
-      debug("String::copyFromUTF8() - Unicode conversion error.");
-    }
-
-#else
-
-    const Unicode::UTF16 *source = &d->data[0];
-    Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(v.data());
-
-    Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
-      &source, source + d->data.size(),
-      &target, target + v.size(),
-      Unicode::lenientConversion);
-
-    if(result != Unicode::conversionOK) {
-      debug("String::to8Bit() - Unicode conversion error.");
-    }
-
-#endif
-
-    v.resize(::strlen(v.data()) + 1);
-
+    std::string s = to8Bit(true);
+    v.setData(s.c_str(), static_cast<TagLib::uint>(s.length()));
     break;
   }
   case UTF16:
@@ -794,20 +730,6 @@ void String::copyFromUTF8(const char *s, size_t length)
 {
   d->data.resize(length);
 
-#ifdef TAGLIB_USE_CODECVT
-
-  std::mbstate_t st = 0;
-  const char *source;
-  wchar_t *target;
-  utf8_utf16_t::result result = utf8_utf16_t().in(
-    st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target);
-
-  if(result != utf8_utf16_t::ok) {
-    debug("String::copyFromUTF8() - Unicode conversion error.");
-  }
-
-#else
-  
   const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
   Unicode::UTF16 *target = &d->data[0];
 
@@ -816,13 +738,11 @@ void String::copyFromUTF8(const char *s, size_t length)
     &target, target + length,
     Unicode::lenientConversion);
 
+  d->data.resize(::wcslen(d->data.c_str()));
+
   if(result != Unicode::conversionOK) {
     debug("String::copyFromUTF8() - Unicode conversion error.");
   }
-
-#endif
-
-  d->data.resize(::wcslen(d->data.c_str()));
 }
 
 void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)

From c86ea7bdffc6da24faa5c6917265a416b09016b3 Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Mon, 18 Mar 2013 06:18:50 +0900
Subject: [PATCH 8/9] Use the standard library to convert between UTF-8 and
 UTF-16 where possible

---
 taglib/toolkit/tstring.cpp | 85 +++++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 6 deletions(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index e88b70f7..3ed88fd2 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -24,13 +24,24 @@
  ***************************************************************************/
 
 #include "tstring.h"
-#include "unicode.h"
 #include "tdebug.h"
 #include "tstringlist.h"
 
-#include <iostream>
 #include <string.h>
 
+// Decide whether to use <codecvt>; only VC++2010 or later is enabled below.
+
+#if (defined(_MSC_VER) && _MSC_VER >= 1600)  // VC++2010 or later 
+# define TAGLIB_USE_CODECVT
+#endif
+
+#ifdef TAGLIB_USE_CODECVT
+# include <codecvt>
+  typedef std::codecvt_utf8_utf16<wchar_t> utf8_utf16_t;
+#else
+# include "unicode.h"
+#endif
+
 using namespace TagLib;
 
 namespace {
@@ -215,6 +226,20 @@ std::string String::to8Bit(bool unicode) const
   else {
     s.resize(d->data.size() * 4 + 1);
 
+#ifdef TAGLIB_USE_CODECVT
+    // Encode the UTF-16 code units in d->data as UTF-8 bytes written into s.
+    std::mbstate_t st = std::mbstate_t();  // value-init; the type need not accept "= 0"
+    const wchar_t *source;
+    char *target;
+    std::codecvt_base::result result = utf8_utf16_t().out(
+      st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target);
+
+    if(result != utf8_utf16_t::ok) {
+      debug("String::to8Bit() - Unicode conversion error.");
+    }
+
+#else
+
     const Unicode::UTF16 *source = &d->data[0];
     Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
 
@@ -227,6 +252,8 @@ std::string String::to8Bit(bool unicode) const
       debug("String::to8Bit() - Unicode conversion error.");
     }
 
+#endif
+
     s.resize(::strlen(s.c_str()));
   }
 
@@ -379,8 +406,38 @@ ByteVector String::data(Type t) const
   }
   case UTF8:
   {
-    std::string s = to8Bit(true);
-    v.setData(s.c_str(), static_cast<TagLib::uint>(s.length()));
+    v.resize(d->data.size() * 4 + 1);
+
+#ifdef TAGLIB_USE_CODECVT
+    // Encode the UTF-16 code units in d->data as UTF-8 bytes written into v.
+    std::mbstate_t st = std::mbstate_t();  // value-init; the type need not accept "= 0"
+    const wchar_t *source;
+    char *target;
+    std::codecvt_base::result result = utf8_utf16_t().out(
+        st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target);
+
+    if(result != utf8_utf16_t::ok) {
+      debug("String::data() - Unicode conversion error.");
+    }
+
+#else
+    // Fallback path: same conversion through the bundled Unicode helpers.
+    const Unicode::UTF16 *source = &d->data[0];
+    Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(v.data());
+
+    Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
+      &source, source + d->data.size(),
+      &target, target + v.size(),
+      Unicode::lenientConversion);
+
+    if(result != Unicode::conversionOK) {
+      debug("String::data() - Unicode conversion error.");
+    }
+
+#endif
+
+    v.resize(::strlen(v.data()) + 1);
+
     break;
   }
   case UTF16:
@@ -730,6 +787,20 @@ void String::copyFromUTF8(const char *s, size_t length)
 {
   d->data.resize(length);
 
+#ifdef TAGLIB_USE_CODECVT
+  // Decode the UTF-8 bytes in [s, s + length) into UTF-16 code units in d->data.
+  std::mbstate_t st = std::mbstate_t();  // value-init; the type need not accept "= 0"
+  const char *source;
+  wchar_t *target;
+  std::codecvt_base::result result = utf8_utf16_t().in(
+    st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target);
+
+  if(result != utf8_utf16_t::ok) {
+    debug("String::copyFromUTF8() - Unicode conversion error.");
+  }
+
+#else
+  
   const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
   Unicode::UTF16 *target = &d->data[0];
 
@@ -738,11 +809,13 @@ void String::copyFromUTF8(const char *s, size_t length)
     &target, target + length,
     Unicode::lenientConversion);
 
-  d->data.resize(::wcslen(d->data.c_str()));
-
   if(result != Unicode::conversionOK) {
     debug("String::copyFromUTF8() - Unicode conversion error.");
   }
+
+#endif
+
+  d->data.resize(::wcslen(d->data.c_str()));
 }
 
 void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)

From 4e05923479ff80234418ecacf0b512f3920080cb Mon Sep 17 00:00:00 2001
From: Tsuda Kageyu <tsuda.kageyu@gmail.com>
Date: Mon, 18 Mar 2013 13:55:49 +0900
Subject: [PATCH 9/9] Removed null termination from return value of
 String::data()

---
 taglib/toolkit/tstring.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/taglib/toolkit/tstring.cpp b/taglib/toolkit/tstring.cpp
index 3ed88fd2..97309913 100644
--- a/taglib/toolkit/tstring.cpp
+++ b/taglib/toolkit/tstring.cpp
@@ -436,7 +436,7 @@ ByteVector String::data(Type t) const
 
 #endif
 
-    v.resize(::strlen(v.data()) + 1);
+    v.resize(::strlen(v.data()));
 
     break;
   }