From c3a0e1d0a2d0d1ccaf5b56d27592713f3fa4564f Mon Sep 17 00:00:00 2001 From: Urs Fleisch Date: Mon, 13 Apr 2026 19:58:52 +0200 Subject: [PATCH] Matroska: Use seek head for faster element lookup (#1321) Limit scan for Matroska seek head to 512 KB in ReadStyle::Fast --------- Co-authored-by: tolriq --- examples/tagreader.cpp | 1 - taglib/fileref.cpp | 2 +- taglib/matroska/ebml/ebmlmksegment.cpp | 90 ++++++++++++++++++++++---- taglib/matroska/ebml/ebmlmksegment.h | 1 + taglib/matroska/matroskafile.cpp | 17 ++++- taglib/matroska/matroskaseekhead.cpp | 6 +- taglib/matroska/matroskaseekhead.h | 1 + 7 files changed, 101 insertions(+), 17 deletions(-) diff --git a/examples/tagreader.cpp b/examples/tagreader.cpp index c14ff921..d7c632aa 100644 --- a/examples/tagreader.cpp +++ b/examples/tagreader.cpp @@ -117,4 +117,3 @@ int main(int argc, char *argv[]) } return 0; } - diff --git a/taglib/fileref.cpp b/taglib/fileref.cpp index e0284c6a..e9b0d58a 100644 --- a/taglib/fileref.cpp +++ b/taglib/fileref.cpp @@ -225,7 +225,7 @@ namespace #endif #ifdef TAGLIB_WITH_MATROSKA else if(ext == "MKA" || ext == "MKV" || ext == "WEBM") - file = new Matroska::File(stream, readAudioProperties); + file = new Matroska::File(stream, readAudioProperties, audioPropertiesStyle); #endif // if file is not valid, leave it to content-based detection. diff --git a/taglib/matroska/ebml/ebmlmksegment.cpp b/taglib/matroska/ebml/ebmlmksegment.cpp index 4b9ba4e5..a3450c2f 100644 --- a/taglib/matroska/ebml/ebmlmksegment.cpp +++ b/taglib/matroska/ebml/ebmlmksegment.cpp @@ -30,6 +30,32 @@ using namespace TagLib; +namespace { + +template +std::unique_ptr readElementAt(File &file, + offset_t offset, + offset_t maxOffset) +{ + if(offset < 0 || offset >= maxOffset) { + return nullptr; + } + + file.seek(offset); + auto element = EBML::Element::factory(file); + if(!element || element->getId() != Id) { + return nullptr; + } + + auto typed = EBML::element_cast(std::move(element)); + if(!typed || !typed->read(file)) { + return nullptr; + } + return typed; +} + +} // namespace + EBML::MkSegment::MkSegment(int sizeLength, offset_t dataSize, offset_t offset): MasterElement(Id::MkSegment, sizeLength, dataSize, offset) { @@ -49,16 +75,64 @@ offset_t EBML::MkSegment::segmentDataOffset() const bool EBML::MkSegment::read(File &file) { - const offset_t maxOffset = file.tell() + dataSize; + return readLimited(file, dataSize); +} + +bool EBML::MkSegment::readLimited(File &file, offset_t scanLimit) +{ + const offset_t filePos = file.tell(); + const offset_t maxOffset = filePos + dataSize; + const offset_t maxScanOffset = filePos + std::min(scanLimit, dataSize); std::unique_ptr element; - int i = 0; - int seekHeadIndex = -1; - while((element = findNextElement(file, maxOffset))) { + while((element = findNextElement(file, maxScanOffset))) { if(const Id id = element->getId(); id == Id::MkSeekHead) { - seekHeadIndex = i; seekHead = element_cast(std::move(element)); if(!seekHead->read(file)) return false; + // We have a seek head, let's use it for faster access to the other elements + if(const auto elementAfterSeekHead = findNextElement(file, maxScanOffset); + elementAfterSeekHead && elementAfterSeekHead->getId() == Id::VoidElement) + seekHead->setPadding(elementAfterSeekHead->getSize()); + const offset_t segDataOffset = segmentDataOffset(); + const auto matroskaSeekHead = parseSeekHead(); + for(const auto &[idValue, relativeOffset] : matroskaSeekHead->entryList()) { + const offset_t absoluteOffset = segDataOffset + relativeOffset; + switch(static_cast(idValue)) { + case Id::MkCues: + if(!((cues = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + case Id::MkInfo: + if(!((info = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + case Id::MkTracks: + if(!((tracks = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + case Id::MkTags: + if(!((tags = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + case Id::MkAttachments: + if(!((attachments = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + case Id::MkChapters: + if(!((chapters = readElementAt( + file, absoluteOffset, maxOffset)))) + return false; + break; + default: + break; + } + } + return true; } else if(id == Id::MkCues) { cues = element_cast(std::move(element)); @@ -91,14 +165,8 @@ bool EBML::MkSegment::read(File &file) return false; } else { - if(id == Id::VoidElement - && seekHead - && seekHeadIndex == i - 1) - seekHead->setPadding(element->getSize()); - element->skipData(file); } - i++; } return true; } diff --git a/taglib/matroska/ebml/ebmlmksegment.h b/taglib/matroska/ebml/ebmlmksegment.h index 13000f3e..3e8f84f1 100644 --- a/taglib/matroska/ebml/ebmlmksegment.h +++ b/taglib/matroska/ebml/ebmlmksegment.h @@ -51,6 +51,7 @@ namespace TagLib { offset_t segmentDataOffset() const; bool read(File &file) override; + bool readLimited(File &file, offset_t scanLimit); std::unique_ptr parseTag() const; std::unique_ptr parseAttachments() const; std::unique_ptr parseChapters() const; diff --git a/taglib/matroska/matroskafile.cpp b/taglib/matroska/matroskafile.cpp index e21278be..756801d6 100644 --- a/taglib/matroska/matroskafile.cpp +++ b/taglib/matroska/matroskafile.cpp @@ -144,6 +144,8 @@ PropertyMap Matroska::File::setProperties(const PropertyMap &properties) namespace { + constexpr offset_t FAST_SCAN_LIMIT = static_cast(512 * 1024); + String keyForAttachedFile(const Matroska::AttachedFile &attachedFile) { if(attachedFile.mediaType().startsWith("image/")) { @@ -376,10 +378,15 @@ void Matroska::File::read(bool readProperties, Properties::ReadStyle readStyle) head->skipData(*this); } + offset_t maxOffset = fileLength - tell(); + if (readStyle == Properties::ReadStyle::Fast && maxOffset > FAST_SCAN_LIMIT) { + maxOffset = FAST_SCAN_LIMIT; + } + // Find the Matroska segment in the file const std::unique_ptr segment( EBML::element_cast( - EBML::findElement(*this, EBML::Element::Id::MkSegment, fileLength - tell()) + EBML::findElement(*this, EBML::Element::Id::MkSegment, maxOffset) ) ); if(!segment) { @@ -389,14 +396,18 @@ void Matroska::File::read(bool readProperties, Properties::ReadStyle readStyle) } // Read the segment into memory from file - if(!segment->read(*this)) { + d->segment = segment->parseSegment(); + maxOffset = segment->getDataSize(); + if (readStyle == Properties::ReadStyle::Fast && maxOffset > FAST_SCAN_LIMIT) { + maxOffset = FAST_SCAN_LIMIT; + } + if(!segment->readLimited(*this, maxOffset)) { debug("Failed to read segment"); setValid(false); return; } // Parse the elements - d->segment = segment->parseSegment(); d->seekHead = segment->parseSeekHead(); d->cues = segment->parseCues(); d->tag = segment->parseTag(); diff --git a/taglib/matroska/matroskaseekhead.cpp b/taglib/matroska/matroskaseekhead.cpp index bc8bb0a1..f3aa756a 100644 --- a/taglib/matroska/matroskaseekhead.cpp +++ b/taglib/matroska/matroskaseekhead.cpp @@ -54,7 +54,6 @@ bool Matroska::SeekHead::isValid(TagLib::File &file) const void Matroska::SeekHead::addEntry(const Element &element) { entries.append({element.id(), element.offset()}); - debug("adding to seekhead"); setNeedsRender(true); } @@ -64,6 +63,11 @@ void Matroska::SeekHead::addEntry(ID id, offset_t offset) setNeedsRender(true); } +const List> &Matroska::SeekHead::entryList() const +{ + return entries; +} + ByteVector Matroska::SeekHead::renderInternal() { const auto beforeSize = sizeRenderedOrWritten(); diff --git a/taglib/matroska/matroskaseekhead.h b/taglib/matroska/matroskaseekhead.h index 56fe6f4c..d51aeec3 100644 --- a/taglib/matroska/matroskaseekhead.h +++ b/taglib/matroska/matroskaseekhead.h @@ -39,6 +39,7 @@ namespace TagLib { bool isValid(TagLib::File &file) const; void addEntry(const Element &element); void addEntry(ID id, offset_t offset); + const List> &entryList() const; void write(TagLib::File &file) override; void sort(); bool sizeChanged(Element &caller, offset_t delta) override;