From 4e7f844ea6439a5a90546feba359832b2ce2e2e8 Mon Sep 17 00:00:00 2001
From: Urs Fleisch <ufleisch@users.sourceforge.net>
Date: Mon, 25 Jul 2022 20:37:15 +0200
Subject: [PATCH] Correctly parse ID3v2.4.0 multiple strings with single BOM
 (#1055)

Some ID3v2.4.0 frames such as text information frames support multiple strings
separated by the termination code of the character encoding. If the encoding
is $01 UTF-16 with BOM, all strings shall have the same byte order. In the
multi-string fields written by TagLib, every string element carries its own
BOM. However, I have often seen tags where a BOM exists only at the
beginning, i.e. at the start of the first string. In such a case, TagLib
currently returns only a list with the first string and a second, empty
string. This commit detects such cases and parses the strings without a BOM
using the byte order indicated by the BOM of the first string.
---
 .../id3v2/frames/textidentificationframe.cpp  | 26 ++++++++++++--
 tests/test_id3v2.cpp                          | 36 +++++++++++++++++--
 2 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/taglib/mpeg/id3v2/frames/textidentificationframe.cpp b/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
index bc647b54..f468b485 100644
--- a/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
+++ b/taglib/mpeg/id3v2/frames/textidentificationframe.cpp
@@ -218,12 +218,32 @@ void TextIdentificationFrame::parseFields(const ByteVector &data)
   // append those split values to the list and make sure that the new string's
   // type is the same specified for this frame
 
+  unsigned short firstBom = 0;
   for(ByteVectorList::ConstIterator it = l.begin(); it != l.end(); it++) {
     if(!(*it).isEmpty()) {
-      if(d->textEncoding == String::Latin1)
+      if(d->textEncoding == String::Latin1) {
         d->fieldList.append(Tag::latin1StringHandler()->parse(*it));
-      else
-        d->fieldList.append(String(*it, d->textEncoding));
+      }
+      else {
+        String::Type textEncoding = d->textEncoding;
+        if(textEncoding == String::UTF16) {
+          if(it == l.begin()) {
+            firstBom = it->mid(0, 2).toUShort();
+          }
+          else {
+            unsigned short subsequentBom = it->mid(0, 2).toUShort();
+            if(subsequentBom != 0xfeff && subsequentBom != 0xfffe) {
+              if(firstBom == 0xfeff) {
+                textEncoding = String::UTF16BE;
+              }
+              else if(firstBom == 0xfffe) {
+                textEncoding = String::UTF16LE;
+              }
+            }
+          }
+        }
+        d->fieldList.append(String(*it, textEncoding));
+      }
     }
   }
 }
diff --git a/tests/test_id3v2.cpp b/tests/test_id3v2.cpp
index 521f460a..b932a9a2 100644
--- a/tests/test_id3v2.cpp
+++ b/tests/test_id3v2.cpp
@@ -199,7 +199,14 @@ public:
     sl.append("Foo");
     sl.append("Bar");
     f.setText(sl);
-    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+6+2+6), f.render().size());
+    ByteVector data = f.render();
+    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+6+2+6), data.size());
+    ByteVector noBomBeData("TPE1\x00\x00\x00\x0f\x00\x00\x02"
+                           "\0F\0o\0o\0\0"
+                           "\0B\0a\0r", 25);
+    CPPUNIT_ASSERT_EQUAL(noBomBeData, data);
+    f.setData(data);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
   }
 
   void testUTF16Delimiter()
@@ -209,7 +216,32 @@ public:
     sl.append("Foo");
     sl.append("Bar");
     f.setText(sl);
-    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+8+2+8), f.render().size());
+    ByteVector data = f.render();
+    CPPUNIT_ASSERT_EQUAL((unsigned int)(4+4+2+1+8+2+8), data.size());
+    ByteVector multiBomLeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xff\xfe"
+                              "F\0o\0o\0\0\0" "\xff\xfe"
+                              "B\0a\0r\0", 29);
+    CPPUNIT_ASSERT_EQUAL(multiBomLeData, data);
+    f.setData(data);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector multiBomBeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xfe\xff"
+                              "\0F\0o\0o\0\0" "\xfe\xff"
+                              "\0B\0a\0r", 29);
+    f.setData(multiBomBeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector singleBomLeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xff\xfe"
+                               "F\0o\0o\0\0\0"
+                               "B\0a\0r\0", 27);
+    f.setData(singleBomLeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
+
+    ByteVector singleBomBeData("TPE1\x00\x00\x00\x13\x00\x00\x01\xfe\xff"
+                               "\0F\0o\0o\0\0"
+                               "\0B\0a\0r", 27);
+    f.setData(singleBomBeData);
+    CPPUNIT_ASSERT_EQUAL(String("Foo Bar"), f.toString());
   }
 
   void testBrokenFrame1()