From 4e7f844ea6439a5a90546feba359832b2ce2e2e8 Mon Sep 17 00:00:00 2001
From: Urs Fleisch
Date: Mon, 25 Jul 2022 20:37:15 +0200
Subject: [PATCH] Correctly parse ID3v2.4.0 multiple strings with single BOM (#1055)

Some ID3v2.4.0 frames such as text information frames support multiple
strings separated by the termination code of the character encoding.
If the encoding is $01 UTF-16 with BOM, all strings shall have the same
byte order. In the multi strings written by TagLib, all string elements
of such a multi string have a BOM. However, I have often seen tags where
a BOM exists only at the beginning, i.e. at the start of the first
string. In such a case, TagLib will only return a list with the first
string and a second empty string. This commit will detect such cases
and parse the strings without BOM according to the BOM of the first
string.
---
 .../id3v2/frames/textidentificationframe.cpp | 26 ++++++++++++--
 tests/test_id3v2.cpp                         | 36 +++++++++++++++++--
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/taglib/mpeg/id3v2/frames/textidentificationframe.cpp b/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
index bc647b54..f468b485 100644
--- a/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
+++ b/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
@@ -218,12 +218,32 @@ void TextIdentificationFrame::parseFields(const ByteVector &data)
   // append those split values to the list and make sure that the new string's
   // type is the same specified for this frame
 
+  unsigned short firstBom = 0;
   for(ByteVectorList::ConstIterator it = l.begin(); it != l.end(); it++) {
     if(!(*it).isEmpty()) {
-      if(d->textEncoding == String::Latin1)
+      if(d->textEncoding == String::Latin1) {
         d->fieldList.append(Tag::latin1StringHandler()->parse(*it));
-      else
-        d->fieldList.append(String(*it, d->textEncoding));
+      }
+      else {
+        String::Type textEncoding = d->textEncoding;
+        if(textEncoding == String::UTF16) {
+          if(it == l.begin()) {
+            firstBom = it->mid(0, 2).toUShort();
+          }
+          else {
+            unsigned short subsequentBom = it->mid(0, 2).toUShort();
+            if(subsequentBom != 0xfeff && subsequentBom != 0xfffe) {
+              if(firstBom == 0xfeff) {
+                textEncoding = String::UTF16BE;
+              }
+              else if(firstBom == 0xfffe) {
+                textEncoding = String::UTF16LE;
+              }
+            }
+          }
+        }
+        d->fieldList.append(String(*it, textEncoding));
+      }
     }
   }
 }
diff --git a/tests/test_id3v2.cpp b/tests/test_id3v2.cpp
index 521f460a..b932a9a2 100644
--- a/tests/test_id3v2.cpp
+++ b/tests/test_id3v2.cpp
@@ -199,7 +199,14 @@ public:
     sl.append("Foo");
     sl.append("Bar");
     f.setText(sl);
-    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+6+2+6), f.render().size());
+    ByteVector data = f.render();
+    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+6+2+6), data.size());
+    ByteVector noBomBeData("TPE1\x00\x00\x00\x0f\x00\x00\x02"
+                           "\0F\0o\0o\0\0"
+                           "\0B\0a\0r", 25);
+    CPPUNIT_ASSERT_EQUAL(noBomBeData, data);
+    f.setData(data);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
   }
 
   void testUTF16Delimiter()
@@ -209,7 +216,32 @@ public:
     sl.append("Foo");
     sl.append("Bar");
     f.setText(sl);
-    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+8+2+8), f.render().size());
+    ByteVector data = f.render();
+    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+8+2+8), data.size());
+    ByteVector multiBomLeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xff\xfe"
+                              "F\0o\0o\0\0\0" "\xff\xfe"
+                              "B\0a\0r\0", 29);
+    CPPUNIT_ASSERT_EQUAL(multiBomLeData, data);
+    f.setData(data);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector multiBomBeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xfe\xff"
+                              "\0F\0o\0o\0\0" "\xfe\xff"
+                              "\0B\0a\0r", 29);
+    f.setData(multiBomBeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector singleBomLeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xff\xfe"
+                               "F\0o\0o\0\0\0"
+                               "B\0a\0r\0", 27);
+    f.setData(singleBomLeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector singleBomBeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xfe\xff"
+                               "\0F\0o\0o\0\0"
+                               "\0B\0a\0r", 27);
+    f.setData(singleBomBeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
   }
 
   void testBrokenFrame1()