Merge pull request #103 from TsudaKageyu/string-improvement

Some improvements of String
This commit is contained in:
Tsuda Kageyu 2013-03-19 01:12:25 -07:00
commit 42a74babb5
4 changed files with 341 additions and 234 deletions

View File

@ -24,52 +24,81 @@
***************************************************************************/
#include "tstring.h"
#include "unicode.h"
#include "tdebug.h"
#include "tstringlist.h"
#include <iostream>
#include <string.h>
namespace TagLib {
// Determine if the compiler supports codecvt.
#if (defined(_MSC_VER) && _MSC_VER >= 1600) // VC++2010 or later
# define TAGLIB_USE_CODECVT
#endif
#ifdef TAGLIB_USE_CODECVT
# include <codecvt>
typedef std::codecvt_utf8_utf16<wchar_t> utf8_utf16_t;
#else
# include "unicode.h"
#endif
using namespace TagLib;
namespace {
inline unsigned short byteSwap(unsigned short x)
{
#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later
return _byteswap_ushort(x);
#else
return (((x) >> 8) & 0xff) | (((x) & 0xff) << 8);
#endif
}
inline unsigned short combine(unsigned char c1, unsigned char c2)
{
return (c1 << 8) | c2;
}
String::Type wcharByteOrder()
{
// Detect CPU endian.
union {
TagLib::ushort w;
char c;
} x = { 0x1234 };
if(x.c == 0x34)
return String::UTF16LE;
else
return String::UTF16BE;
}
}
using namespace TagLib;
class String::StringPrivate : public RefCounter
{
public:
StringPrivate(const wstring &s) :
RefCounter(),
data(s),
CString(0) {}
data(s) {}
StringPrivate() :
RefCounter(),
CString(0) {}
~StringPrivate() {
delete [] CString;
}
wstring data;
RefCounter() {}
/*!
* This is only used to hold the a pointer to the most recent value of
* toCString.
* Stores string in UTF-16. The byte order depends on the CPU endian.
*/
char *CString;
TagLib::wstring data;
/*!
* This is only used to hold the the most recent value of toCString().
*/
std::string cstring;
};
String String::null;
@ -90,74 +119,72 @@ String::String(const std::string &s, Type t)
{
d = new StringPrivate;
if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
if(t == Latin1)
copyFromLatin1(&s[0], s.length());
else if(t == String::UTF8)
copyFromUTF8(&s[0], s.length());
else {
debug("String::String() -- A std::string should not contain UTF16.");
return;
}
d->data.resize(s.length());
wstring::iterator targetIt = d->data.begin();
for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
*targetIt = uchar(*it);
++targetIt;
}
prepare(t);
}
String::String(const wstring &s, Type t)
{
d = new StringPrivate(s);
prepare(t);
d = new StringPrivate;
if(t == UTF16 || t == UTF16BE || t == UTF16LE)
copyFromUTF16(s.c_str(), s.length(), t);
else {
debug("String::String() -- A TagLib::wstring should not contain Latin1 or UTF-8.");
}
}
String::String(const wchar_t *s, Type t)
{
d = new StringPrivate(s);
prepare(t);
d = new StringPrivate;
if(t == UTF16 || t == UTF16BE || t == UTF16LE)
copyFromUTF16(s, ::wcslen(s), t);
else {
debug("String::String() -- A const wchar_t * should not contain Latin1 or UTF-8.");
}
}
String::String(const char *s, Type t)
{
d = new StringPrivate;
if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
if(t == Latin1)
copyFromLatin1(s, ::strlen(s));
else if(t == String::UTF8)
copyFromUTF8(s, ::strlen(s));
else {
debug("String::String() -- A const char * should not contain UTF16.");
return;
}
const size_t length = ::strlen(s);
d->data.resize(length);
wstring::iterator targetIt = d->data.begin();
for(size_t i = 0; i < length; i++) {
*targetIt = uchar(s[i]);
++targetIt;
}
prepare(t);
}
String::String(wchar_t c, Type t)
{
d = new StringPrivate;
d->data += c;
prepare(t);
if(t == UTF16 || t == UTF16BE || t == UTF16LE)
copyFromUTF16(&c, 1, t);
else {
debug("String::String() -- A const wchar_t should not contain Latin1 or UTF-8.");
}
}
String::String(char c, Type t)
{
d = new StringPrivate;
if(t == UTF16 || t == UTF16BE || t == UTF16LE) {
debug("String::String() -- A std::string should not contain UTF16.");
return;
if(t == Latin1 || t == UTF8) {
d->data.resize(1);
d->data[0] = static_cast<uchar>(c);
}
else {
debug("String::String() -- A char should not contain UTF16.");
}
d->data += uchar(c);
prepare(t);
}
String::String(const ByteVector &v, Type t)
@ -167,31 +194,12 @@ String::String(const ByteVector &v, Type t)
if(v.isEmpty())
return;
if(t == Latin1 || t == UTF8) {
int length = 0;
d->data.resize(v.size());
wstring::iterator targetIt = d->data.begin();
for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
*targetIt = uchar(*it);
++targetIt;
++length;
}
d->data.resize(length);
}
else {
d->data.resize(v.size() / 2);
wstring::iterator targetIt = d->data.begin();
for(ByteVector::ConstIterator it = v.begin();
it != v.end() && it + 1 != v.end() && combine(*it, *(it + 1));
it += 2)
{
*targetIt = combine(*it, *(it + 1));
++targetIt;
}
}
prepare(t);
if(t == Latin1)
copyFromLatin1(v.data(), v.size());
else if(t == UTF8)
copyFromUTF8(v.data(), v.size());
else
copyFromUTF16(v.data(), v.size(), t);
}
////////////////////////////////////////////////////////////////////////////////
@ -205,72 +213,62 @@ String::~String()
std::string String::to8Bit(bool unicode) const
{
std::string s;
s.resize(d->data.size());
if(!unicode) {
s.resize(d->data.size());
std::string::iterator targetIt = s.begin();
for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
*targetIt = char(*it);
*targetIt = static_cast<char>(*it);
++targetIt;
}
return s;
}
else {
s.resize(d->data.size() * 4 + 1);
const size_t outputBufferSize = d->data.size() * 3 + 1;
#ifdef TAGLIB_USE_CODECVT
Unicode::UTF16 *sourceBuffer = new Unicode::UTF16[d->data.size() + 1];
Unicode::UTF8 *targetBuffer = new Unicode::UTF8[outputBufferSize];
std::mbstate_t st = 0;
const wchar_t *source;
char *target;
std::codecvt_base::result result = utf8_utf16_t().out(
st, &d->data[0], &d->data[d->data.size()], source, &s[0], &s[s.size()], target);
for(size_t i = 0; i < d->data.size(); i++)
sourceBuffer[i] = Unicode::UTF16(d->data[i]);
if(result != utf8_utf16_t::ok) {
debug("String::copyFromUTF8() - Unicode conversion error.");
}
const Unicode::UTF16 *source = sourceBuffer;
Unicode::UTF8 *target = targetBuffer;
#else
Unicode::ConversionResult result =
Unicode::ConvertUTF16toUTF8(&source, sourceBuffer + d->data.size(),
&target, targetBuffer + outputBufferSize,
Unicode::lenientConversion);
const Unicode::UTF16 *source = &d->data[0];
Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(&s[0]);
if(result != Unicode::conversionOK) {
debug("String::to8Bit() - Unicode conversion error.");
Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
&source, source + d->data.size(),
&target, target + s.size(),
Unicode::lenientConversion);
if(result != Unicode::conversionOK) {
debug("String::to8Bit() - Unicode conversion error.");
}
#endif
s.resize(::strlen(s.c_str()));
}
const size_t newSize = target - targetBuffer;
s.resize(newSize);
targetBuffer[newSize] = 0;
s = (char *) targetBuffer;
delete [] sourceBuffer;
delete [] targetBuffer;
return s;
}
TagLib::wstring String::toWString() const
const TagLib::wstring &String::toWString() const
{
return d->data;
}
const char *String::toCString(bool unicode) const
{
delete [] d->CString;
std::string buffer = to8Bit(unicode);
d->CString = new char[buffer.size() + 1];
#if defined(_MSC_VER) && (_MSC_VER >= 1400) // VC++2005 or later
strcpy_s(d->CString, buffer.size() + 1, buffer.c_str());
#else
strcpy(d->CString, buffer.c_str());
#endif
return d->CString;
d->cstring = to8Bit(unicode);
return d->cstring.c_str();
}
String::Iterator String::begin()
@ -359,9 +357,10 @@ String &String::append(const String &s)
String String::upper() const
{
String s;
static const int shift = 'A' - 'a';
static int shift = 'A' - 'a';
String s;
s.d->data.reserve(d->data.size());
for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); ++it) {
if(*it >= 'a' && *it <= 'z')
@ -407,8 +406,38 @@ ByteVector String::data(Type t) const
}
case UTF8:
{
std::string s = to8Bit(true);
v.setData(s.c_str(), static_cast<TagLib::uint>(s.length()));
v.resize(d->data.size() * 4 + 1);
#ifdef TAGLIB_USE_CODECVT
std::mbstate_t st = 0;
const wchar_t *source;
char *target;
std::codecvt_base::result result = utf8_utf16_t().out(
st, &d->data[0], &d->data[d->data.size()], source, v.data(), v.data() + v.size(), target);
if(result != utf8_utf16_t::ok) {
debug("String::copyFromUTF8() - Unicode conversion error.");
}
#else
const Unicode::UTF16 *source = &d->data[0];
Unicode::UTF8 *target = reinterpret_cast<Unicode::UTF8*>(v.data());
Unicode::ConversionResult result = Unicode::ConvertUTF16toUTF8(
&source, source + d->data.size(),
&target, target + v.size(),
Unicode::lenientConversion);
if(result != Unicode::conversionOK) {
debug("String::to8Bit() - Unicode conversion error.");
}
#endif
v.resize(::strlen(v.data()));
break;
}
case UTF16:
@ -552,21 +581,38 @@ String String::number(int n) // static
return s;
}
TagLib::wchar &String::operator[](int i)
TagLib::wchar &String::operator[](size_t i)
{
detach();
return d->data[i];
}
const TagLib::wchar &String::operator[](int i) const
const TagLib::wchar &String::operator[](size_t i) const
{
return d->data[i];
}
bool String::operator==(const String &s) const
{
return d == s.d || d->data == s.d->data;
return (d == s.d || d->data == s.d->data);
}
bool String::operator==(const char *s) const
{
for(wstring::const_iterator it = d->data.begin(); it != d->data.end(); it++) {
if(*it != static_cast<uchar>(*s))
return false;
s++;
}
return true;
}
bool String::operator==(const wchar_t *s) const
{
return (d->data == s);
}
bool String::operator!=(const String &s) const
@ -633,14 +679,7 @@ String &String::operator=(const std::string &s)
delete d;
d = new StringPrivate;
d->data.resize(s.size());
wstring::iterator targetIt = d->data.begin();
for(std::string::const_iterator it = s.begin(); it != s.end(); it++) {
*targetIt = uchar(*it);
++targetIt;
}
copyFromLatin1(s.c_str(), s.length());
return *this;
}
@ -649,7 +688,9 @@ String &String::operator=(const wstring &s)
{
if(d->deref())
delete d;
d = new StringPrivate(s);
return *this;
}
@ -657,7 +698,10 @@ String &String::operator=(const wchar_t *s)
{
if(d->deref())
delete d;
d = new StringPrivate(s);
d = new StringPrivate;
copyFromUTF16(s, ::wcslen(s), WCharByteOrder);
return *this;
}
@ -665,8 +709,11 @@ String &String::operator=(char c)
{
if(d->deref())
delete d;
d = new StringPrivate;
d->data += uchar(c);
d->data.resize(1);
d->data[0] = static_cast<uchar>(c);
return *this;
}
@ -674,8 +721,11 @@ String &String::operator=(wchar_t c)
{
if(d->deref())
delete d;
d = new StringPrivate;
d->data += c;
d->data.resize(1);
d->data[0] = c;
return *this;
}
@ -685,15 +735,7 @@ String &String::operator=(const char *s)
delete d;
d = new StringPrivate;
const size_t length = ::strlen(s);
d->data.resize(length);
wstring::iterator targetIt = d->data.begin();
for(size_t i = 0; i < length; i++) {
*targetIt = uchar(s[i]);
++targetIt;
}
copyFromLatin1(s, ::strlen(s));
return *this;
}
@ -704,20 +746,10 @@ String &String::operator=(const ByteVector &v)
delete d;
d = new StringPrivate;
d->data.resize(v.size());
wstring::iterator targetIt = d->data.begin();
uint i = 0;
for(ByteVector::ConstIterator it = v.begin(); it != v.end() && (*it); ++it) {
*targetIt = uchar(*it);
++targetIt;
++i;
}
copyFromLatin1(v.data(), v.size());
// If we hit a null in the ByteVector, shrink the string again.
d->data.resize(i);
d->data.resize(::wcslen(d->data.c_str()));
return *this;
}
@ -743,70 +775,111 @@ void String::detach()
// private members
////////////////////////////////////////////////////////////////////////////////
void String::prepare(Type t)
void String::copyFromLatin1(const char *s, size_t length)
{
switch(t) {
case UTF16:
{
if(d->data.size() >= 1 && (d->data[0] == 0xfeff || d->data[0] == 0xfffe)) {
bool swap = d->data[0] != 0xfeff;
d->data.erase(d->data.begin(), d->data.begin() + 1);
if(swap) {
for(uint i = 0; i < d->data.size(); i++)
d->data[i] = byteSwap((unsigned short)d->data[i]);
}
}
d->data.resize(length);
for(size_t i = 0; i < length; ++i)
d->data[i] = static_cast<uchar>(s[i]);
}
void String::copyFromUTF8(const char *s, size_t length)
{
d->data.resize(length);
#ifdef TAGLIB_USE_CODECVT
std::mbstate_t st = 0;
const char *source;
wchar_t *target;
std::codecvt_base::result result = utf8_utf16_t().in(
st, s, s + length, source, &d->data[0], &d->data[d->data.size()], target);
if(result != utf8_utf16_t::ok) {
debug("String::copyFromUTF8() - Unicode conversion error.");
}
#else
const Unicode::UTF8 *source = reinterpret_cast<const Unicode::UTF8 *>(s);
Unicode::UTF16 *target = &d->data[0];
Unicode::ConversionResult result = Unicode::ConvertUTF8toUTF16(
&source, source + length,
&target, target + length,
Unicode::lenientConversion);
if(result != Unicode::conversionOK) {
debug("String::copyFromUTF8() - Unicode conversion error.");
}
#endif
d->data.resize(::wcslen(d->data.c_str()));
}
void String::copyFromUTF16(const wchar_t *s, size_t length, Type t)
{
bool swap;
if(t == UTF16) {
if(length >= 1 && s[0] == 0xfeff)
swap = false; // Same as CPU endian. No need to swap bytes.
else if(length >= 1 && s[0] == 0xfffe)
swap = true; // Not same as CPU endian. Need to swap bytes.
else {
debug("String::prepare() - Invalid UTF16 string.");
d->data.erase(d->data.begin(), d->data.end());
}
break;
}
case UTF8:
{
const size_t bufferSize = d->data.size() + 1;
Unicode::UTF8 *sourceBuffer = new Unicode::UTF8[bufferSize];
Unicode::UTF16 *targetBuffer = new Unicode::UTF16[bufferSize];
unsigned int i = 0;
for(; i < d->data.size(); i++)
sourceBuffer[i] = Unicode::UTF8(d->data[i]);
sourceBuffer[i] = 0;
const Unicode::UTF8 *source = sourceBuffer;
Unicode::UTF16 *target = targetBuffer;
Unicode::ConversionResult result =
Unicode::ConvertUTF8toUTF16(&source, sourceBuffer + bufferSize,
&target, targetBuffer + bufferSize,
Unicode::lenientConversion);
if(result != Unicode::conversionOK) {
debug("String::prepare() - Unicode conversion error.");
debug("String::copyFromUTF16() - Invalid UTF16 string.");
return;
}
const size_t newSize = target != targetBuffer ? target - targetBuffer - 1 : 0;
d->data.resize(newSize);
for(size_t i = 0; i < newSize; i++)
d->data[i] = targetBuffer[i];
delete [] sourceBuffer;
delete [] targetBuffer;
break;
s++;
length--;
}
case UTF16LE:
{
for(uint i = 0; i < d->data.size(); i++)
d->data[i] = byteSwap((unsigned short)d->data[i]);
break;
}
default:
break;
else
swap = (t != WCharByteOrder);
d->data.resize(length);
memcpy(&d->data[0], s, length * sizeof(wchar_t));
if(swap) {
for(size_t i = 0; i < length; ++i)
d->data[i] = byteSwap(static_cast<unsigned short>(s[i]));
}
}
void String::copyFromUTF16(const char *s, size_t length, Type t)
{
if(sizeof(wchar_t) == 2)
copyFromUTF16(reinterpret_cast<const wchar_t*>(s), length / 2, t);
else
{
bool swap;
if(t == UTF16) {
if(length >= 2 && *reinterpret_cast<const TagLib::ushort*>(s) == 0xfeff)
swap = false; // Same as CPU endian. No need to swap bytes.
else if(length >= 2 && *reinterpret_cast<const TagLib::ushort*>(s) == 0xfffe)
swap = true; // Not same as CPU endian. Need to swap bytes.
else {
debug("String::copyFromUTF16() - Invalid UTF16 string.");
return;
}
s += 2;
length -= 2;
}
else
swap = (t != WCharByteOrder);
d->data.resize(length / 2);
for(size_t i = 0; i < length / 2; ++i) {
d->data[i] = swap ? combine(*s, *(s + 1)) : combine(*(s + 1), *s);
s += 2;
}
}
}
String::Type String::WCharByteOrder = wcharByteOrder();
////////////////////////////////////////////////////////////////////////////////
// related functions
////////////////////////////////////////////////////////////////////////////////

View File

@ -98,8 +98,7 @@ namespace TagLib {
*/
UTF16 = 1,
/*!
* UTF16 <i>big endian</i>. 16 bit characters. This is the encoding used
* internally by TagLib.
* UTF16 <i>big endian</i>. 16 bit characters.
*/
UTF16BE = 2,
/*!
@ -135,12 +134,12 @@ namespace TagLib {
/*!
* Makes a deep copy of the data in \a s.
*/
String(const wstring &s, Type t = UTF16BE);
String(const wstring &s, Type t = WCharByteOrder);
/*!
* Makes a deep copy of the data in \a s.
*/
String(const wchar_t *s, Type t = UTF16BE);
String(const wchar_t *s, Type t = WCharByteOrder);
/*!
* Makes a deep copy of the data in \a c.
@ -187,7 +186,7 @@ namespace TagLib {
/*!
* Returns a wstring version of the TagLib string as a wide string.
*/
wstring toWString() const;
const TagLib::wstring &toWString() const;
/*!
* Creates and returns a C-String based on the data. This string is still
@ -335,18 +334,30 @@ namespace TagLib {
/*!
* Returns a reference to the character at position \a i.
*/
wchar &operator[](int i);
wchar &operator[](size_t i);
/*!
* Returns a const reference to the character at position \a i.
*/
const wchar &operator[](int i) const;
const wchar &operator[](size_t i) const;
/*!
* Compares each character of the String with each character in \a s and
* returns true if the strings match.
*/
bool operator==(const String &s) const;
/*!
* Compares each character of the String with each character in \a s and
* returns true if the strings match.
*/
bool operator==(const char *s) const;
/*!
* Compares each character of the String with each character of \a s and
* returns true if the strings match.
*/
bool operator==(const String &s) const;
bool operator==(const wchar_t *s) const;
/*!
* Compares each character of the String with each character of \a s and
@ -442,12 +453,35 @@ namespace TagLib {
private:
/*!
* This checks to see if the string is in \e UTF-16 (with BOM) or \e UTF-8
* format and if so converts it to \e UTF-16BE for internal use. \e Latin1
* does not require conversion since it is a subset of \e UTF-16BE and
* \e UTF16-BE requires no conversion since it is used internally.
* Converts a \e Latin-1 string into \e UTF-16(without BOM/CPU byte order)
* and copies it to the internal buffer.
*/
void prepare(Type t);
void copyFromLatin1(const char *s, size_t length);
/*!
* Converts a \e UTF-8 string into \e UTF-16(without BOM/CPU byte order)
* and copies it to the internal buffer.
*/
void copyFromUTF8(const char *s, size_t length);
/*!
* Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
* \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
*/
void copyFromUTF16(const wchar_t *s, size_t length, Type t);
/*!
* Converts a \e UTF-16 (with BOM), UTF-16LE or UTF16-BE string into
* \e UTF-16(without BOM/CPU byte order) and copies it to the internal buffer.
*/
void copyFromUTF16(const char *s, size_t length, Type t);
/*!
* Indicates which byte order of UTF-16 is used to store strings internally.
*
* \note Set to \e UTF16BE or \e UTF16LE at run time.
*/
static Type WCharByteOrder;
class StringPrivate;
StringPrivate *d;

View File

@ -115,8 +115,8 @@
namespace Unicode {
typedef unsigned long UTF32; /* at least 32 bits */
typedef unsigned short UTF16; /* at least 16 bits */
typedef unsigned char UTF8; /* typically 8 bits */
typedef wchar_t UTF16; /* TagLib assumes that wchar_t is sufficient for UTF-16. */
typedef unsigned char UTF8; /* typically 8 bits */
typedef unsigned char Boolean; /* 0 or 1 */
typedef enum {

View File

@ -116,7 +116,7 @@ public:
CPPUNIT_ASSERT_EQUAL(a, String(d, String::UTF16));
}
// this test is expected to print "TagLib: String::prepare() -
// this test is expected to print "TagLib: String::copyFromUTF16() -
// Invalid UTF16 string." on the console 3 times
void testUTF16DecodeInvalidBOM()
{