Matroska: Use seek head for faster element lookup (#1321)

Limit scan for Matroska seek head to 512 KB in ReadStyle::Fast

---------

Co-authored-by: tolriq <git@leetzone.org>
This commit is contained in:
Urs Fleisch
2026-04-13 19:58:52 +02:00
committed by GitHub
parent 13751f5a6b
commit c3a0e1d0a2
7 changed files with 101 additions and 17 deletions

View File

@@ -117,4 +117,3 @@ int main(int argc, char *argv[])
}
return 0;
}

View File

@@ -225,7 +225,7 @@ namespace
#endif
#ifdef TAGLIB_WITH_MATROSKA
else if(ext == "MKA" || ext == "MKV" || ext == "WEBM")
file = new Matroska::File(stream, readAudioProperties);
file = new Matroska::File(stream, readAudioProperties, audioPropertiesStyle);
#endif
// if file is not valid, leave it to content-based detection.

View File

@@ -30,6 +30,32 @@
using namespace TagLib;
namespace {
  // Jumps to \a offset in \a file and tries to load the element stored there.
  //
  // The element created by EBML::Element::factory() must carry the ID given
  // as template argument \a Id; it is then cast to its concrete type and its
  // read() is invoked. A null pointer is returned when
  //  - \a offset is negative or not below \a maxOffset,
  //  - the factory yields no element or one with a different ID,
  //  - the cast yields nothing or read() reports failure.
  //
  // Note that the file position is changed by the seek and is not restored.
  template <EBML::Element::Id Id, typename ElementType>
  std::unique_ptr<ElementType> readElementAt(File &file,
                                             offset_t offset,
                                             offset_t maxOffset)
  {
    const bool offsetInRange = offset >= 0 && offset < maxOffset;
    if(!offsetInRange)
      return nullptr;

    file.seek(offset);

    auto candidate = EBML::Element::factory(file);
    if(candidate && candidate->getId() == Id) {
      // Only an element of the expected kind is cast and read.
      auto result = EBML::element_cast<Id>(std::move(candidate));
      if(result && result->read(file))
        return result;
    }
    return nullptr;
  }
} // namespace
EBML::MkSegment::MkSegment(int sizeLength, offset_t dataSize, offset_t offset):
MasterElement(Id::MkSegment, sizeLength, dataSize, offset)
{
@@ -49,16 +75,64 @@ offset_t EBML::MkSegment::segmentDataOffset() const
// Reads the segment without an artificial scan limit.
//
// This simply forwards to readLimited() using the segment's own data size as
// the scan limit, so the scan range equals the full extent of the segment
// data. The result of readLimited() is returned unchanged.
bool EBML::MkSegment::read(File &file)
{
  return readLimited(file, dataSize);
}
bool EBML::MkSegment::readLimited(File &file, offset_t scanLimit)
{
const offset_t filePos = file.tell();
const offset_t maxOffset = filePos + dataSize;
const offset_t maxScanOffset = filePos + std::min(scanLimit, dataSize);
std::unique_ptr<Element> element;
int i = 0;
int seekHeadIndex = -1;
while((element = findNextElement(file, maxOffset))) {
while((element = findNextElement(file, maxScanOffset))) {
if(const Id id = element->getId(); id == Id::MkSeekHead) {
seekHeadIndex = i;
seekHead = element_cast<Id::MkSeekHead>(std::move(element));
if(!seekHead->read(file))
return false;
// We have a seek head, let's use it for faster access to the other elements
if(const auto elementAfterSeekHead = findNextElement(file, maxScanOffset);
elementAfterSeekHead && elementAfterSeekHead->getId() == Id::VoidElement)
seekHead->setPadding(elementAfterSeekHead->getSize());
const offset_t segDataOffset = segmentDataOffset();
const auto matroskaSeekHead = parseSeekHead();
for(const auto &[idValue, relativeOffset] : matroskaSeekHead->entryList()) {
const offset_t absoluteOffset = segDataOffset + relativeOffset;
switch(static_cast<Id>(idValue)) {
case Id::MkCues:
if(!((cues = readElementAt<Id::MkCues, MkCues>(
file, absoluteOffset, maxOffset))))
return false;
break;
case Id::MkInfo:
if(!((info = readElementAt<Id::MkInfo, MkInfo>(
file, absoluteOffset, maxOffset))))
return false;
break;
case Id::MkTracks:
if(!((tracks = readElementAt<Id::MkTracks, MkTracks>(
file, absoluteOffset, maxOffset))))
return false;
break;
case Id::MkTags:
if(!((tags = readElementAt<Id::MkTags, MkTags>(
file, absoluteOffset, maxOffset))))
return false;
break;
case Id::MkAttachments:
if(!((attachments = readElementAt<Id::MkAttachments, MkAttachments>(
file, absoluteOffset, maxOffset))))
return false;
break;
case Id::MkChapters:
if(!((chapters = readElementAt<Id::MkChapters, MkChapters>(
file, absoluteOffset, maxOffset))))
return false;
break;
default:
break;
}
}
return true;
}
else if(id == Id::MkCues) {
cues = element_cast<Id::MkCues>(std::move(element));
@@ -91,14 +165,8 @@ bool EBML::MkSegment::read(File &file)
return false;
}
else {
if(id == Id::VoidElement
&& seekHead
&& seekHeadIndex == i - 1)
seekHead->setPadding(element->getSize());
element->skipData(file);
}
i++;
}
return true;
}

View File

@@ -51,6 +51,7 @@ namespace TagLib {
offset_t segmentDataOffset() const;
bool read(File &file) override;
bool readLimited(File &file, offset_t scanLimit);
std::unique_ptr<Matroska::Tag> parseTag() const;
std::unique_ptr<Matroska::Attachments> parseAttachments() const;
std::unique_ptr<Matroska::Chapters> parseChapters() const;

View File

@@ -144,6 +144,8 @@ PropertyMap Matroska::File::setProperties(const PropertyMap &properties)
namespace {
// Scan cap of 512 * 1024 used by Matroska::File::read() when the read style
// is Properties::ReadStyle::Fast: both the search range for the segment
// element and the scan limit passed to the segment's readLimited() are
// clamped to this value when they would otherwise be larger.
constexpr offset_t FAST_SCAN_LIMIT = static_cast<offset_t>(512 * 1024);
String keyForAttachedFile(const Matroska::AttachedFile &attachedFile)
{
if(attachedFile.mediaType().startsWith("image/")) {
@@ -376,10 +378,15 @@ void Matroska::File::read(bool readProperties, Properties::ReadStyle readStyle)
head->skipData(*this);
}
offset_t maxOffset = fileLength - tell();
if (readStyle == Properties::ReadStyle::Fast && maxOffset > FAST_SCAN_LIMIT) {
maxOffset = FAST_SCAN_LIMIT;
}
// Find the Matroska segment in the file
const std::unique_ptr<EBML::MkSegment> segment(
EBML::element_cast<EBML::Element::Id::MkSegment>(
EBML::findElement(*this, EBML::Element::Id::MkSegment, fileLength - tell())
EBML::findElement(*this, EBML::Element::Id::MkSegment, maxOffset)
)
);
if(!segment) {
@@ -389,14 +396,18 @@ void Matroska::File::read(bool readProperties, Properties::ReadStyle readStyle)
}
// Read the segment into memory from file
if(!segment->read(*this)) {
d->segment = segment->parseSegment();
maxOffset = segment->getDataSize();
if (readStyle == Properties::ReadStyle::Fast && maxOffset > FAST_SCAN_LIMIT) {
maxOffset = FAST_SCAN_LIMIT;
}
if(!segment->readLimited(*this, maxOffset)) {
debug("Failed to read segment");
setValid(false);
return;
}
// Parse the elements
d->segment = segment->parseSegment();
d->seekHead = segment->parseSeekHead();
d->cues = segment->parseCues();
d->tag = segment->parseTag();

View File

@@ -54,7 +54,6 @@ bool Matroska::SeekHead::isValid(TagLib::File &file) const
// Appends an entry for \a element to the seek head, taking the ID and the
// offset from element.id() and element.offset().
void Matroska::SeekHead::addEntry(const Element &element)
{
entries.append({element.id(), element.offset()});
debug("adding to seekhead");
// The entry list changed, so flag this seek head via setNeedsRender(true).
setNeedsRender(true);
}
@@ -64,6 +63,11 @@ void Matroska::SeekHead::addEntry(ID id, offset_t offset)
setNeedsRender(true);
}
// Gives read-only access to the stored seek entries as (ID, offset) pairs.
// A reference to the member list is returned, no copy is made.
const List<std::pair<unsigned int, offset_t>> &Matroska::SeekHead::entryList() const
{
return entries;
}
ByteVector Matroska::SeekHead::renderInternal()
{
const auto beforeSize = sizeRenderedOrWritten();

View File

@@ -39,6 +39,7 @@ namespace TagLib {
bool isValid(TagLib::File &file) const;
void addEntry(const Element &element);
void addEntry(ID id, offset_t offset);
const List<std::pair<unsigned int, offset_t>> &entryList() const;
void write(TagLib::File &file) override;
void sort();
bool sizeChanged(Element &caller, offset_t delta) override;