From e0437c0a43c68bad822b8b49127733f5ceb9975b Mon Sep 17 00:00:00 2001 From: Martchus Date: Sun, 13 Mar 2016 22:00:23 +0100 Subject: [PATCH] skip invalid bytes when parsing EBML --- genericfileelement.h | 6 +- matroska/ebmlelement.cpp | 170 ++++++++++-------- matroska/ebmlelement.h | 1 + matroska/matroskacontainer.cpp | 306 +++++++++++++++++---------------- 4 files changed, 262 insertions(+), 221 deletions(-) diff --git a/genericfileelement.h b/genericfileelement.h index d7c59f1..b8fe85c 100644 --- a/genericfileelement.h +++ b/genericfileelement.h @@ -200,6 +200,7 @@ public: protected: identifierType m_id; uint64 m_startOffset; + uint64 m_maxSize; uint32 m_idLength; dataSizeType m_dataSize; uint32 m_sizeLength; @@ -212,7 +213,6 @@ private: void copyInternal(std::ostream &targetStream, uint64 startOffset, uint64 bytesToCopy); containerType* m_container; - uint64 m_maxSize; bool m_parsed; }; @@ -248,12 +248,12 @@ template GenericFileElement::GenericFileElement(GenericFileElement::implementationType &parent, uint64 startOffset) : m_id(identifierType()), m_startOffset(startOffset), + m_maxSize(parent.startOffset() + parent.totalSize() - startOffset), m_idLength(0), m_dataSize(0), m_sizeLength(0), m_parent(&parent), m_container(&parent.container()), - m_maxSize(parent.startOffset() + parent.totalSize() - startOffset), m_parsed(false) {} @@ -264,12 +264,12 @@ template GenericFileElement::GenericFileElement(GenericFileElement::containerType &container, uint64 startOffset, uint64 maxSize) : m_id(identifierType()), m_startOffset(startOffset), + m_maxSize(maxSize), m_idLength(0), m_dataSize(0), m_sizeLength(0), m_parent(nullptr), m_container(&container), - m_maxSize(maxSize), m_parsed(false) {} diff --git a/matroska/ebmlelement.cpp b/matroska/ebmlelement.cpp index 6bba396..51d8aec 100644 --- a/matroska/ebmlelement.cpp +++ b/matroska/ebmlelement.cpp @@ -64,82 +64,116 @@ void EbmlElement::internalParse() { invalidateStatus(); static const string context("parsing EBML element 
header"); - // check whether max size is valid - if(maxTotalSize() < 2) { - addNotification(NotificationType::Critical, "The EBML element at " + numberToString(startOffset()) + " is truncated or does not exist.", context); - throw TruncatedDataException(); - } - stream().seekg(startOffset()); - // read ID - char buf[maximumIdLengthSupported() > maximumSizeLengthSupported() ? maximumIdLengthSupported() : maximumSizeLengthSupported()] = {0}; - byte beg, mask = 0x80; - beg = stream().peek(); - m_idLength = 1; - while(m_idLength <= GenericFileElement::maximumIdLengthSupported() && (beg & mask) == 0) { - ++m_idLength; - mask >>= 1; - } - if(m_idLength > GenericFileElement::maximumIdLengthSupported()) { - addNotification(NotificationType::Critical, "EBML ID length is not supported.", context); - throw VersionNotSupportedException(); - } - if(m_idLength > container().maxIdLength()) { - addNotification(NotificationType::Critical, "EBML ID length is invalid.", context); - throw InvalidDataException(); - } - reader().read(buf + (GenericFileElement::maximumIdLengthSupported() - m_idLength), m_idLength); - m_id = BE::toUInt32(buf); - // read size - mask = 0x80; - m_sizeLength = 1; - beg = stream().peek(); - while(m_sizeLength <= GenericFileElement::maximumSizeLengthSupported() && (beg & mask) == 0) { - ++m_sizeLength; - mask >>= 1; - } - if(m_sizeLength > GenericFileElement::maximumSizeLengthSupported()) { - addNotification(NotificationType::Critical, "EBML size length is not supported.", parsingContext()); - throw VersionNotSupportedException(); - } - if(m_sizeLength > container().maxSizeLength()) { - addNotification(NotificationType::Critical, "EBML size length is invalid.", parsingContext()); - throw InvalidDataException(); - } - // read size into buffer - memset(buf, 0, sizeof(dataSizeType)); // reset buffer - reader().read(buf + (GenericFileElement::maximumSizeLengthSupported() - m_sizeLength), m_sizeLength); - *(buf + (GenericFileElement::maximumSizeLengthSupported() - 
m_sizeLength)) ^= mask; // xor the first byte in buffer which has been read from the file with mask - m_dataSize = ConversionUtilities::BE::toUInt64(buf); - // check if element is truncated - if(totalSize() > maxTotalSize()) { - if(m_idLength + m_sizeLength > maxTotalSize()) { // header truncated - addNotification(NotificationType::Critical, "EBML header seems to be truncated.", parsingContext()); + + byte skipped; + for(skipped = 0; /* TODO: add a sane limit here */; ++m_startOffset, --m_maxSize, ++skipped) { + // check whether max size is valid + if(maxTotalSize() < 2) { + addNotification(NotificationType::Critical, "The EBML element at " + numberToString(startOffset()) + " is truncated or does not exist.", context); throw TruncatedDataException(); - } else { // data truncated - addNotification(NotificationType::Warning, "Data of EBML element seems to be truncated; unable to parse siblings of that element.", parsingContext()); - m_dataSize = maxTotalSize() - m_idLength - m_sizeLength; // using max size instead } - } - // check if there's a first child - if(uint64 firstChildOffset = this->firstChildOffset()) { - if(firstChildOffset < dataSize()) { - m_firstChild.reset(new EbmlElement(static_cast(*this), startOffset() + firstChildOffset)); + stream().seekg(startOffset()); + // read ID + char buf[maximumIdLengthSupported() > maximumSizeLengthSupported() ? 
maximumIdLengthSupported() : maximumSizeLengthSupported()] = {0}; + byte beg, mask = 0x80; + beg = stream().peek(); + m_idLength = 1; + while(m_idLength <= GenericFileElement::maximumIdLengthSupported() && (beg & mask) == 0) { + ++m_idLength; + mask >>= 1; + } + if(m_idLength > GenericFileElement::maximumIdLengthSupported()) { + if(!skipped) { + addNotification(NotificationType::Critical, "EBML ID length is not supported, trying to skip.", context); + } + continue; // try again + } + if(m_idLength > container().maxIdLength()) { + if(!skipped) { + addNotification(NotificationType::Critical, "EBML ID length is invalid.", context); + } + continue; // try again + } + reader().read(buf + (GenericFileElement::maximumIdLengthSupported() - m_idLength), m_idLength); + m_id = BE::toUInt32(buf); + + // read size + mask = 0x80; + m_sizeLength = 1; + beg = stream().peek(); + if(beg == 0xFF) { + // this indicates that the element size is unknown + // -> just assume the element takes the maximum available size + m_dataSize = maxTotalSize() - headerSize(); + } else { + while(m_sizeLength <= GenericFileElement::maximumSizeLengthSupported() && (beg & mask) == 0) { + ++m_sizeLength; + mask >>= 1; + } + if(m_sizeLength > GenericFileElement::maximumSizeLengthSupported()) { + if(!skipped) { + addNotification(NotificationType::Critical, "EBML size length is not supported.", parsingContext()); + } + continue; // try again + } + if(m_sizeLength > container().maxSizeLength()) { + if(!skipped) { + addNotification(NotificationType::Critical, "EBML size length is invalid.", parsingContext()); + } + continue; // try again + } + // read size into buffer + memset(buf, 0, sizeof(dataSizeType)); // reset buffer + reader().read(buf + (GenericFileElement::maximumSizeLengthSupported() - m_sizeLength), m_sizeLength); + *(buf + (GenericFileElement::maximumSizeLengthSupported() - m_sizeLength)) ^= mask; // xor the first byte in buffer which has been read from the file with mask + m_dataSize = 
ConversionUtilities::BE::toUInt64(buf); + // check if element is truncated + if(totalSize() > maxTotalSize()) { + if(m_idLength + m_sizeLength > maxTotalSize()) { // header truncated + if(!skipped) { + addNotification(NotificationType::Critical, "EBML header seems to be truncated.", parsingContext()); + } + continue; // try again + } else { // data truncated + addNotification(NotificationType::Warning, "Data of EBML element seems to be truncated; unable to parse siblings of that element.", parsingContext()); + m_dataSize = maxTotalSize() - m_idLength - m_sizeLength; // using max size instead + } + } + } + + // check if there's a first child + if(const uint64 firstChildOffset = this->firstChildOffset()) { + if(firstChildOffset < dataSize()) { + m_firstChild.reset(new EbmlElement(static_cast(*this), startOffset() + firstChildOffset)); + } else { + m_firstChild.reset(); + } } else { m_firstChild.reset(); } - } else { - m_firstChild.reset(); - } - // check if there's a sibling - if(totalSize() < maxTotalSize()) { - if(parent()) { - m_nextSibling.reset(new EbmlElement(*(parent()), startOffset() + totalSize())); + + // check if there's a sibling + if(totalSize() < maxTotalSize()) { + if(parent()) { + m_nextSibling.reset(new EbmlElement(*(parent()), startOffset() + totalSize())); + } else { + m_nextSibling.reset(new EbmlElement(container(), startOffset() + totalSize(), maxTotalSize() - totalSize())); + } } else { - m_nextSibling.reset(new EbmlElement(container(), startOffset() + totalSize(), maxTotalSize() - totalSize())); + m_nextSibling.reset(); } - } else { - m_nextSibling.reset(); + + // no critical errors occurred + // -> add a warning if bytes have been skipped + if(skipped) { + addNotification(NotificationType::Warning, numberToString(skipped) + " bytes have been skipped", parsingContext()); + } + // -> don't need another try, return here + return; } + + // critical errors occurred and skipping some bytes wasn't successful + throw InvalidDataException(); } /*! 
diff --git a/matroska/ebmlelement.h b/matroska/ebmlelement.h index 0f8d7b5..10ea66f 100644 --- a/matroska/ebmlelement.h +++ b/matroska/ebmlelement.h @@ -136,6 +136,7 @@ inline bool EbmlElement::isPadding() const /*! * \brief Returns the offset of the first child of the element. + * \remarks The returned offset is relative to the start offset of this element. */ inline uint64 EbmlElement::firstChildOffset() const { diff --git a/matroska/matroskacontainer.cpp b/matroska/matroskacontainer.cpp index 399d3f4..7668f39 100644 --- a/matroska/matroskacontainer.cpp +++ b/matroska/matroskacontainer.cpp @@ -354,171 +354,177 @@ void MatroskaContainer::internalParseHeader() for(EbmlElement *topLevelElement = m_firstElement.get(); topLevelElement; topLevelElement = topLevelElement->nextSibling()) { try { topLevelElement->parse(); - } catch(const Failure &) { - addNotification(NotificationType::Critical, "Unable to parse top-level element at " + numberToString(topLevelElement->startOffset()) + ".", context); - break; - } - switch(topLevelElement->id()) { - case EbmlIds::Header: - for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) { - try { - subElement->parse(); - } catch (Failure &) { - addNotification(NotificationType::Critical, "Unable to parse all childs of EBML header.", context); - break; + switch(topLevelElement->id()) { + case EbmlIds::Header: + for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) { + try { + subElement->parse(); + switch(subElement->id()) { + case EbmlIds::Version: + m_version = subElement->readUInteger(); + break; + case EbmlIds::ReadVersion: + m_readVersion = subElement->readUInteger(); + break; + case EbmlIds::DocType: + m_doctype = subElement->readString(); + break; + case EbmlIds::DocTypeVersion: + m_doctypeVersion = subElement->readUInteger(); + break; + case EbmlIds::DocTypeReadVersion: + m_doctypeReadVersion = 
subElement->readUInteger(); + break; + case EbmlIds::MaxIdLength: + m_maxIdLength = subElement->readUInteger(); + if(m_maxIdLength > EbmlElement::maximumIdLengthSupported()) { + addNotification(NotificationType::Critical, "Maximum EBML element ID length greather then " + + numberToString(EbmlElement::maximumIdLengthSupported()) + + " bytes is not supported.", context); + throw InvalidDataException(); + } + break; + case EbmlIds::MaxSizeLength: + m_maxSizeLength = subElement->readUInteger(); + if(m_maxSizeLength > EbmlElement::maximumSizeLengthSupported()) { + addNotification(NotificationType::Critical, "Maximum EBML element size length greather then " + + numberToString(EbmlElement::maximumSizeLengthSupported()) + + " bytes is not supported.", context); + throw InvalidDataException(); + } + break; + } + addNotifications(*subElement); + } catch(const Failure &) { + addNotifications(*subElement); + addNotification(NotificationType::Critical, "Unable to parse all childs of EBML header.", context); + break; + } } - switch(subElement->id()) { - case EbmlIds::Version: - m_version = subElement->readUInteger(); - break; - case EbmlIds::ReadVersion: - m_readVersion = subElement->readUInteger(); - break; - case EbmlIds::DocType: - m_doctype = subElement->readString(); - break; - case EbmlIds::DocTypeVersion: - m_doctypeVersion = subElement->readUInteger(); - break; - case EbmlIds::DocTypeReadVersion: - m_doctypeReadVersion = subElement->readUInteger(); - break; - case EbmlIds::MaxIdLength: - m_maxIdLength = subElement->readUInteger(); - if(m_maxIdLength > EbmlElement::maximumIdLengthSupported()) { - addNotification(NotificationType::Critical, "Maximum EBML element ID length greather then " - + numberToString(EbmlElement::maximumIdLengthSupported()) - + " bytes is not supported.", context); - throw InvalidDataException(); - } - break; - case EbmlIds::MaxSizeLength: - m_maxSizeLength = subElement->readUInteger(); - if(m_maxSizeLength > 
EbmlElement::maximumSizeLengthSupported()) { - addNotification(NotificationType::Critical, "Maximum EBML element size length greather then " - + numberToString(EbmlElement::maximumSizeLengthSupported()) - + " bytes is not supported.", context); - throw InvalidDataException(); - } - break; - } - } - break; - case MatroskaIds::Segment: - ++m_segmentCount; - for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) { - try { - subElement->parse(); - } catch (Failure &) { - addNotification(NotificationType::Critical, "Unable to parse all childs of \"Segment\"-element.", context); - break; - } - switch(subElement->id()) { - case MatroskaIds::SeekHead: - m_seekInfos.emplace_back(make_unique()); - m_seekInfos.back()->parse(subElement); - addNotifications(*m_seekInfos.back()); - break; - case MatroskaIds::Tracks: - if(excludesOffset(m_tracksElements, subElement->startOffset())) { - m_tracksElements.push_back(subElement); - } - break; - case MatroskaIds::SegmentInfo: - if(excludesOffset(m_segmentInfoElements, subElement->startOffset())) { - m_segmentInfoElements.push_back(subElement); - } - break; - case MatroskaIds::Tags: - if(excludesOffset(m_tagsElements, subElement->startOffset())) { - m_tagsElements.push_back(subElement); - } - break; - case MatroskaIds::Chapters: - if(excludesOffset(m_chaptersElements, subElement->startOffset())) { - m_chaptersElements.push_back(subElement); - } - break; - case MatroskaIds::Attachments: - if(excludesOffset(m_attachmentsElements, subElement->startOffset())) { - m_attachmentsElements.push_back(subElement); - } - break; - case MatroskaIds::Cluster: - // cluster reached - // stop here if all relevant information has been gathered - for(auto i = m_seekInfos.cbegin() + seekInfosIndex, end = m_seekInfos.cend(); i != end; ++i, ++seekInfosIndex) { - for(const auto &infoPair : (*i)->info()) { - uint64 offset = currentOffset + topLevelElement->dataOffset() + infoPair.second; - if(offset >= 
fileInfo().size()) { - addNotification(NotificationType::Critical, "Offset (" + numberToString(offset) + ") denoted by \"SeekHead\" element is invalid.", context); - } else { - auto element = make_unique(*this, offset); - try { - element->parse(); - if(element->id() != infoPair.first) { - addNotification(NotificationType::Critical, "ID of element " + element->idToString() + " at " + numberToString(offset) + " does not match the ID denoted in the \"SeekHead\" element (0x" + numberToString(infoPair.first, 16) + ").", context); + break; + case MatroskaIds::Segment: + ++m_segmentCount; + for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) { + try { + subElement->parse(); + switch(subElement->id()) { + case MatroskaIds::SeekHead: + m_seekInfos.emplace_back(make_unique()); + m_seekInfos.back()->parse(subElement); + addNotifications(*m_seekInfos.back()); + break; + case MatroskaIds::Tracks: + if(excludesOffset(m_tracksElements, subElement->startOffset())) { + m_tracksElements.push_back(subElement); + } + break; + case MatroskaIds::SegmentInfo: + if(excludesOffset(m_segmentInfoElements, subElement->startOffset())) { + m_segmentInfoElements.push_back(subElement); + } + break; + case MatroskaIds::Tags: + if(excludesOffset(m_tagsElements, subElement->startOffset())) { + m_tagsElements.push_back(subElement); + } + break; + case MatroskaIds::Chapters: + if(excludesOffset(m_chaptersElements, subElement->startOffset())) { + m_chaptersElements.push_back(subElement); + } + break; + case MatroskaIds::Attachments: + if(excludesOffset(m_attachmentsElements, subElement->startOffset())) { + m_attachmentsElements.push_back(subElement); + } + break; + case MatroskaIds::Cluster: + // cluster reached + // stop here if all relevant information has been gathered + for(auto i = m_seekInfos.cbegin() + seekInfosIndex, end = m_seekInfos.cend(); i != end; ++i, ++seekInfosIndex) { + for(const auto &infoPair : (*i)->info()) { + uint64 
offset = currentOffset + topLevelElement->dataOffset() + infoPair.second; + if(offset >= fileInfo().size()) { + addNotification(NotificationType::Critical, "Offset (" + numberToString(offset) + ") denoted by \"SeekHead\" element is invalid.", context); + } else { + auto element = make_unique(*this, offset); + try { + element->parse(); + if(element->id() != infoPair.first) { + addNotification(NotificationType::Critical, "ID of element " + element->idToString() + " at " + numberToString(offset) + " does not match the ID denoted in the \"SeekHead\" element (0x" + numberToString(infoPair.first, 16) + ").", context); + } + switch(element->id()) { + case MatroskaIds::SegmentInfo: + if(excludesOffset(m_segmentInfoElements, offset)) { + m_additionalElements.emplace_back(move(element)); + m_segmentInfoElements.emplace_back(m_additionalElements.back().get()); + } + break; + case MatroskaIds::Tracks: + if(excludesOffset(m_tracksElements, offset)) { + m_additionalElements.emplace_back(move(element)); + m_tracksElements.emplace_back(m_additionalElements.back().get()); + } + break; + case MatroskaIds::Tags: + if(excludesOffset(m_tagsElements, offset)) { + m_additionalElements.emplace_back(move(element)); + m_tagsElements.emplace_back(m_additionalElements.back().get()); + } + break; + case MatroskaIds::Chapters: + if(excludesOffset(m_chaptersElements, offset)) { + m_additionalElements.emplace_back(move(element)); + m_chaptersElements.emplace_back(m_additionalElements.back().get()); + } + break; + case MatroskaIds::Attachments: + if(excludesOffset(m_attachmentsElements, offset)) { + m_additionalElements.emplace_back(move(element)); + m_attachmentsElements.emplace_back(m_additionalElements.back().get()); + } + break; + default: + ; + } + } catch(const Failure &) { + addNotification(NotificationType::Critical, "Can not parse element at " + numberToString(offset) + " (denoted using \"SeekHead\" element).", context); + } } - switch(element->id()) { - case MatroskaIds::SegmentInfo: - 
if(excludesOffset(m_segmentInfoElements, offset)) { - m_additionalElements.emplace_back(move(element)); - m_segmentInfoElements.emplace_back(m_additionalElements.back().get()); - } - break; - case MatroskaIds::Tracks: - if(excludesOffset(m_tracksElements, offset)) { - m_additionalElements.emplace_back(move(element)); - m_tracksElements.emplace_back(m_additionalElements.back().get()); - } - break; - case MatroskaIds::Tags: - if(excludesOffset(m_tagsElements, offset)) { - m_additionalElements.emplace_back(move(element)); - m_tagsElements.emplace_back(m_additionalElements.back().get()); - } - break; - case MatroskaIds::Chapters: - if(excludesOffset(m_chaptersElements, offset)) { - m_additionalElements.emplace_back(move(element)); - m_chaptersElements.emplace_back(m_additionalElements.back().get()); - } - break; - case MatroskaIds::Attachments: - if(excludesOffset(m_attachmentsElements, offset)) { - m_additionalElements.emplace_back(move(element)); - m_attachmentsElements.emplace_back(m_additionalElements.back().get()); - } - break; - default: - ; - } - } catch(const Failure &) { - addNotification(NotificationType::Critical, "Can not parse element at " + numberToString(offset) + " (denoted using \"SeekHead\" element).", context); } } + // not checking if m_tagsElements is empty avoids long parsing times when loading big files + // but also has the disadvantage that the parser relies on the presence of a SeekHead element + // (which is not mandatory) to detect tags at the end of the segment + if(((!m_tracksElements.empty() && !m_tagsElements.empty()) || fileInfo().size() > m_maxFullParseSize) && !m_segmentInfoElements.empty()) { + goto finish; + } + break; } + addNotifications(*subElement); + } catch(const Failure &) { + addNotifications(*subElement); + addNotification(NotificationType::Critical, "Unable to parse all childs of \"Segment\"-element.", context); + break; } - // not checking if m_tagsElements is empty avoids long parsing times when loading big files - // 
but also has the disadvantage that the parser relies on the presence of a SeekHead element - // (which is not mandatory) to detect tags at the end of the segment - if(((!m_tracksElements.empty() && !m_tagsElements.empty()) || fileInfo().size() > m_maxFullParseSize) && !m_segmentInfoElements.empty()) { - goto finish; - } - break; } + currentOffset += topLevelElement->totalSize(); + break; + default: + ; } - currentOffset += topLevelElement->totalSize(); + addNotifications(*topLevelElement); + } catch(const Failure &) { + addNotifications(*topLevelElement); + addNotification(NotificationType::Critical, "Unable to parse top-level element at " + numberToString(topLevelElement->startOffset()) + ".", context); break; - default: - ; } } // finally parse the "Info"-element and fetch "EditionEntry"-elements finish: try { parseSegmentInfo(); - } catch (Failure &) { + } catch(const Failure &) { addNotification(NotificationType::Critical, "Unable to parse EBML (segment) \"Info\"-element.", context); } }