Parse "SeekHead" elements referenced by "Seek" elements

Follow at least one level of indirection by default
This commit is contained in:
Martchus 2019-06-17 19:11:00 +02:00
parent 0c2056c2f9
commit 480857b1b6
3 changed files with 60 additions and 27 deletions

View File

@ -500,8 +500,8 @@ void MatroskaContainer::internalParseHeader(Diagnostics &diag)
}
break;
case MatroskaIds::Cluster:
// cluster reached
// stop here if all relevant information has been gathered
// stop as soon as the first cluster has been reached if all relevant information has been gathered
// -> take elements from seek tables within this segment into account
for (auto i = m_seekInfos.cbegin() + seekInfosIndex, end = m_seekInfos.cend(); i != end; ++i, ++seekInfosIndex) {
for (const auto &infoPair : (*i)->info()) {
std::uint64_t offset = currentOffset + topLevelElement->dataOffset() + infoPair.second;
@ -559,9 +559,7 @@ void MatroskaContainer::internalParseHeader(Diagnostics &diag)
}
}
}
// not checking if m_tagsElements is empty avoids long parsing times when loading big files
// but also has the disadvantage that the parser relies on the presence of a SeekHead element
// (which is not mandatory) to detect tags at the end of the segment
// -> stop if tracks and tags have been found or the file exceeds the max. size to fully process
if (((!m_tracksElements.empty() && !m_tagsElements.empty()) || fileInfo().size() > m_maxFullParseSize)
&& !m_segmentInfoElements.empty()) {
goto finish;

View File

@ -36,25 +36,30 @@ void MatroskaSeekInfo::shift(std::uint64_t start, std::int64_t amount)
}
/*!
* \brief Parses the specified \a seekHeadElement.
* \brief Parses the specified \a seekHeadElement and populates info() with the gathered information.
* \throws Throws ios_base::failure when an IO error occurs.
* \throws Throws Failure or a derived exception when a parsing error occurs.
* \remarks The object does not take ownership over the specified \a seekHeadElement.
* \remarks
* - The object does not take ownership over the specified \a seekHeadElement.
* - Previously parsed info() is not cleared, so subsequent calls can be used to gather seek
* information from multiple seek head elements. Use clear() manually if that is not wanted.
* - If the specified \a seekHeadElement references another seek head element, the referenced seek head
* element is parsed as well. Set \a maxNesting to 0 to prevent that, or increase the value to follow
* references more deeply. References to elements which have already been visited are never followed,
* though.
*/
void MatroskaSeekInfo::parse(EbmlElement *seekHeadElement, Diagnostics &diag)
void MatroskaSeekInfo::parse(EbmlElement *seekHeadElement, Diagnostics &diag, size_t maxNesting)
{
static const string context("parsing \"SeekHead\"-element");
m_seekHeadElement = seekHeadElement;
m_info.clear();
EbmlElement *seekElement = seekHeadElement->firstChild();
EbmlElement *seekElementChild, *seekIdElement, *seekPositionElement;
while (seekElement) {
m_seekHeadElements.emplace_back(seekHeadElement);
for (EbmlElement *seekElement = seekHeadElement->firstChild(), *seekIdElement, *seekPositionElement; seekElement; seekElement = seekElement->nextSibling()) {
seekElement->parse(diag);
switch (seekElement->id()) {
case MatroskaIds::Seek:
seekElementChild = seekElement->firstChild();
seekIdElement = seekPositionElement = nullptr;
while (seekElementChild) {
for (auto *seekElementChild = seekElement->firstChild(); seekElementChild; seekElementChild = seekElementChild->nextSibling()) {
seekElementChild->parse(diag);
switch (seekElementChild->id()) {
case MatroskaIds::SeekID:
@ -80,13 +85,42 @@ void MatroskaSeekInfo::parse(EbmlElement *seekHeadElement, Diagnostics &diag)
+ "\" within the \"Seek\" element is not a \"SeekID\"-element nor a \"SeekPosition\"-element and will be ignored.",
context);
}
seekElementChild = seekElementChild->nextSibling();
}
if (seekIdElement && seekPositionElement) {
m_info.emplace_back(seekIdElement->readUInteger(), seekPositionElement->readUInteger());
} else {
if (!seekIdElement || !seekPositionElement) {
diag.emplace_back(DiagLevel::Warning, "The \"Seek\"-element does not contain a \"SeekID\"- and a \"SeekPosition\"-element.", context);
break;
}
m_info.emplace_back(seekIdElement->readUInteger(), seekPositionElement->readUInteger());
// follow possibly referenced seek head element
if (m_info.back().first == MatroskaIds::SeekHead) {
const auto startOffset = m_info.back().second;
if (!maxNesting) {
diag.emplace_back(DiagLevel::Warning,
argsToString("Not following reference by \"Seek\" element at ", seekElement->startOffset(), " contains to another \"SeekHead\" element at ", startOffset, '.'),
context);
break;
}
auto visited = false;
for (const auto *const visitedSeekHeadElement : m_seekHeadElements) {
if (visitedSeekHeadElement->startOffset() == startOffset) {
diag.emplace_back(DiagLevel::Warning,
argsToString("The \"Seek\" element at ", seekElement->startOffset(), " contains a loop to the \"SeekHead\" element at ", visitedSeekHeadElement->startOffset(), '.'),
context);
visited = true;
break;
}
}
if (visited) {
break;
}
m_additionalSeekHeadElements.emplace_back(make_unique<EbmlElement>(seekHeadElement->container(), startOffset));
parse(m_additionalSeekHeadElements.back().get(), diag, maxNesting - 1);
}
break;
case EbmlIds::Crc32:
case EbmlIds::Void:
@ -95,7 +129,6 @@ void MatroskaSeekInfo::parse(EbmlElement *seekHeadElement, Diagnostics &diag)
diag.emplace_back(
DiagLevel::Warning, "The element " % seekElement->idToString() + " is not a seek element and will be ignored.", context);
}
seekElement = seekElement->nextSibling();
}
if (m_info.empty()) {
diag.emplace_back(DiagLevel::Warning, "No seek information found.", context);

View File

@ -11,11 +11,11 @@ class TAG_PARSER_EXPORT MatroskaSeekInfo {
public:
MatroskaSeekInfo();
EbmlElement *seekHeadElement() const;
const std::vector<EbmlElement *> &seekHeadElements() const;
const std::vector<std::pair<EbmlElement::IdentifierType, std::uint64_t>> &info() const;
std::vector<std::pair<EbmlElement::IdentifierType, std::uint64_t>> &info();
void shift(std::uint64_t start, std::int64_t amount);
void parse(EbmlElement *seekHeadElement, Diagnostics &diag);
void parse(EbmlElement *seekHeadElements, Diagnostics &diag, std::size_t maxNesting = 1);
void make(std::ostream &stream, Diagnostics &diag);
std::uint64_t minSize() const;
std::uint64_t maxSize() const;
@ -30,7 +30,8 @@ public:
static bool updateSeekInfo(std::vector<MatroskaSeekInfo> &newSeekInfos, std::uint64_t oldOffset, std::uint64_t newOffset);
private:
EbmlElement *m_seekHeadElement;
std::vector<EbmlElement *> m_seekHeadElements;
std::vector<std::unique_ptr<EbmlElement>> m_additionalSeekHeadElements;
std::vector<std::pair<EbmlElement::IdentifierType, std::uint64_t>> m_info;
};
@ -38,16 +39,17 @@ private:
* \brief Constructs a new MatroskaSeekInfo.
*/
inline MatroskaSeekInfo::MatroskaSeekInfo()
: m_seekHeadElement(nullptr)
{
}
/*!
* \brief Returns a pointer to the \a seekHeadElement specified when the parse() method was called.
* \brief Returns the seek head elements the seek information is composed of.
* \remarks This list is initially empty. When calling parse() it is at least populated with the specified seek head element (ownership remains
* with the caller). In case that seek table references other seek tables, those elements are returned as well (the MatroskaSeekInfo has ownership).
*/
inline EbmlElement *MatroskaSeekInfo::seekHeadElement() const
inline const std::vector<EbmlElement *> &MatroskaSeekInfo::seekHeadElements() const
{
return m_seekHeadElement;
return m_seekHeadElements;
}
/*!