skip invalid bytes when parsing EBML

This commit is contained in:
Martchus 2016-03-13 22:00:23 +01:00
parent 9016097d36
commit e0437c0a43
4 changed files with 262 additions and 221 deletions

View File

@ -200,6 +200,7 @@ public:
protected:
identifierType m_id;
uint64 m_startOffset;
uint64 m_maxSize;
uint32 m_idLength;
dataSizeType m_dataSize;
uint32 m_sizeLength;
@ -212,7 +213,6 @@ private:
void copyInternal(std::ostream &targetStream, uint64 startOffset, uint64 bytesToCopy);
containerType* m_container;
uint64 m_maxSize;
bool m_parsed;
};
@ -248,12 +248,12 @@ template <class ImplementationType>
// Constructs an element located at \a startOffset which is a child of \a parent.
// - The element uses the container of \a parent.
// - The maximum size is the number of bytes between \a startOffset and the end of
//   \a parent (parent.startOffset() + parent.totalSize() - startOffset).
// - ID, ID length, data size and size length are zero-initialized and the element
//   is flagged as not parsed.
// NOTE(review): assumes \a startOffset does not exceed the end of \a parent; otherwise
// the subtraction below presumably wraps around (uint64 looks unsigned) - confirm.
GenericFileElement<ImplementationType>::GenericFileElement(GenericFileElement<ImplementationType>::implementationType &parent, uint64 startOffset) :
    m_id(identifierType()),
    m_startOffset(startOffset),
    // m_maxSize is initialized exactly once; a member must not appear twice in a mem-initializer list
    m_maxSize(parent.startOffset() + parent.totalSize() - startOffset),
    m_idLength(0),
    m_dataSize(0),
    m_sizeLength(0),
    m_parent(&parent),
    m_container(&parent.container()),
    m_parsed(false)
{}
@ -264,12 +264,12 @@ template <class ImplementationType>
// Constructs a parentless element located at \a startOffset within \a container.
// - \a maxSize is stored as the maximum size the element may occupy.
// - The parent is set to nullptr.
// - ID, ID length, data size and size length are zero-initialized and the element
//   is flagged as not parsed.
GenericFileElement<ImplementationType>::GenericFileElement(GenericFileElement<ImplementationType>::containerType &container, uint64 startOffset, uint64 maxSize) :
    m_id(identifierType()),
    m_startOffset(startOffset),
    // m_maxSize is initialized exactly once; a member must not appear twice in a mem-initializer list
    m_maxSize(maxSize),
    m_idLength(0),
    m_dataSize(0),
    m_sizeLength(0),
    m_parent(nullptr),
    m_container(&container),
    m_parsed(false)
{}

View File

@ -64,82 +64,116 @@ void EbmlElement::internalParse()
{
// Parses the header of this EBML element (ID and size denotation) starting at startOffset()
// and afterwards sets up m_firstChild and m_nextSibling.
// NOTE(review): this text looks like a commit diff rendered without +/- markers, so
// superseded statements and their replacements appear interleaved below. The comments
// annotate individual statements only; confirm against the actual file before relying on them.
invalidateStatus();
static const string context("parsing EBML element header");
// check whether max size is valid
if(maxTotalSize() < 2) {
addNotification(NotificationType::Critical, "The EBML element at " + numberToString(startOffset()) + " is truncated or does not exist.", context);
throw TruncatedDataException();
}
stream().seekg(startOffset());
// read ID
// buffer is as large as the bigger one of the supported ID/size lengths, zero-initialized
char buf[maximumIdLengthSupported() > maximumSizeLengthSupported() ? maximumIdLengthSupported() : maximumSizeLengthSupported()] = {0};
byte beg, mask = 0x80;
beg = stream().peek();
m_idLength = 1;
// each leading zero bit of the first byte increases the ID length by one
while(m_idLength <= GenericFileElement<implementationType>::maximumIdLengthSupported() && (beg & mask) == 0) {
++m_idLength;
mask >>= 1;
}
if(m_idLength > GenericFileElement<implementationType>::maximumIdLengthSupported()) {
addNotification(NotificationType::Critical, "EBML ID length is not supported.", context);
throw VersionNotSupportedException();
}
if(m_idLength > container().maxIdLength()) {
addNotification(NotificationType::Critical, "EBML ID length is invalid.", context);
throw InvalidDataException();
}
// the ID bytes are right-aligned within the first maximumIdLengthSupported() bytes of buf
reader().read(buf + (GenericFileElement<implementationType>::maximumIdLengthSupported() - m_idLength), m_idLength);
m_id = BE::toUInt32(buf);
// read size
mask = 0x80;
m_sizeLength = 1;
beg = stream().peek();
// each leading zero bit of the first byte increases the size length by one
while(m_sizeLength <= GenericFileElement<implementationType>::maximumSizeLengthSupported() && (beg & mask) == 0) {
++m_sizeLength;
mask >>= 1;
}
if(m_sizeLength > GenericFileElement<implementationType>::maximumSizeLengthSupported()) {
addNotification(NotificationType::Critical, "EBML size length is not supported.", parsingContext());
throw VersionNotSupportedException();
}
if(m_sizeLength > container().maxSizeLength()) {
addNotification(NotificationType::Critical, "EBML size length is invalid.", parsingContext());
throw InvalidDataException();
}
// read size into buffer
memset(buf, 0, sizeof(dataSizeType)); // reset buffer
reader().read(buf + (GenericFileElement<implementationType>::maximumSizeLengthSupported() - m_sizeLength), m_sizeLength);
*(buf + (GenericFileElement<implementationType>::maximumSizeLengthSupported() - m_sizeLength)) ^= mask; // xor the first byte in buffer which has been read from the file with mask
m_dataSize = ConversionUtilities::BE::toUInt64(buf);
// check if element is truncated
if(totalSize() > maxTotalSize()) {
if(m_idLength + m_sizeLength > maxTotalSize()) { // header truncated
addNotification(NotificationType::Critical, "EBML header seems to be truncated.", parsingContext());
// skipped counts the retries; every retry moves the start offset one byte forward
// and reduces the maximum size by one byte (see the loop's increment expression)
// NOTE(review): skipped is declared as byte - presumably 8 bits wide, so it would wrap
// to zero after 256 retries and the !skipped checks would pass again - confirm.
// The loop has no condition, so it is only left via return or throw.
byte skipped;
for(skipped = 0; /* TODO: add a sane limit here */; ++m_startOffset, --m_maxSize, ++skipped) {
// check whether max size is valid
if(maxTotalSize() < 2) {
addNotification(NotificationType::Critical, "The EBML element at " + numberToString(startOffset()) + " is truncated or does not exist.", context);
throw TruncatedDataException();
} else { // data truncated
addNotification(NotificationType::Warning, "Data of EBML element seems to be truncated; unable to parse siblings of that element.", parsingContext());
m_dataSize = maxTotalSize() - m_idLength - m_sizeLength; // using max size instead
}
}
// check if there's a first child
if(uint64 firstChildOffset = this->firstChildOffset()) {
if(firstChildOffset < dataSize()) {
m_firstChild.reset(new EbmlElement(static_cast<EbmlElement &>(*this), startOffset() + firstChildOffset));
stream().seekg(startOffset());
// read ID
// buffer is as large as the bigger one of the supported ID/size lengths, zero-initialized
char buf[maximumIdLengthSupported() > maximumSizeLengthSupported() ? maximumIdLengthSupported() : maximumSizeLengthSupported()] = {0};
byte beg, mask = 0x80;
beg = stream().peek();
m_idLength = 1;
// each leading zero bit of the first byte increases the ID length by one
while(m_idLength <= GenericFileElement<implementationType>::maximumIdLengthSupported() && (beg & mask) == 0) {
++m_idLength;
mask >>= 1;
}
if(m_idLength > GenericFileElement<implementationType>::maximumIdLengthSupported()) {
// only the first failed attempt is reported
if(!skipped) {
addNotification(NotificationType::Critical, "EBML ID length is not supported, trying to skip.", context);
}
continue; // try again
}
if(m_idLength > container().maxIdLength()) {
if(!skipped) {
addNotification(NotificationType::Critical, "EBML ID length is invalid.", context);
}
continue; // try again
}
// the ID bytes are right-aligned within the first maximumIdLengthSupported() bytes of buf
reader().read(buf + (GenericFileElement<implementationType>::maximumIdLengthSupported() - m_idLength), m_idLength);
m_id = BE::toUInt32(buf);
// read size
mask = 0x80;
m_sizeLength = 1;
beg = stream().peek();
if(beg == 0xFF) {
// this indicates that the element size is unknown
// -> just assume the element takes the maximum available size
m_dataSize = maxTotalSize() - headerSize();
} else {
// each leading zero bit of the first byte increases the size length by one
while(m_sizeLength <= GenericFileElement<implementationType>::maximumSizeLengthSupported() && (beg & mask) == 0) {
++m_sizeLength;
mask >>= 1;
}
if(m_sizeLength > GenericFileElement<implementationType>::maximumSizeLengthSupported()) {
if(!skipped) {
addNotification(NotificationType::Critical, "EBML size length is not supported.", parsingContext());
}
continue; // try again
}
if(m_sizeLength > container().maxSizeLength()) {
if(!skipped) {
addNotification(NotificationType::Critical, "EBML size length is invalid.", parsingContext());
}
continue; // try again
}
// read size into buffer
memset(buf, 0, sizeof(dataSizeType)); // reset buffer
reader().read(buf + (GenericFileElement<implementationType>::maximumSizeLengthSupported() - m_sizeLength), m_sizeLength);
// mask still holds the bit that terminated the loop above, so the xor clears that marker bit
*(buf + (GenericFileElement<implementationType>::maximumSizeLengthSupported() - m_sizeLength)) ^= mask; // xor the first byte in buffer which has been read from the file with mask
m_dataSize = ConversionUtilities::BE::toUInt64(buf);
// check if element is truncated
if(totalSize() > maxTotalSize()) {
if(m_idLength + m_sizeLength > maxTotalSize()) { // header truncated
if(!skipped) {
addNotification(NotificationType::Critical, "EBML header seems to be truncated.", parsingContext());
}
continue; // try again
} else { // data truncated
addNotification(NotificationType::Warning, "Data of EBML element seems to be truncated; unable to parse siblings of that element.", parsingContext());
m_dataSize = maxTotalSize() - m_idLength - m_sizeLength; // using max size instead
}
}
}
// check if there's a first child
// a first child is only created if its offset is non-zero and lies within the data size
if(const uint64 firstChildOffset = this->firstChildOffset()) {
if(firstChildOffset < dataSize()) {
m_firstChild.reset(new EbmlElement(static_cast<EbmlElement &>(*this), startOffset() + firstChildOffset));
} else {
m_firstChild.reset();
}
} else {
m_firstChild.reset();
}
} else {
m_firstChild.reset();
}
// check if there's a sibling
if(totalSize() < maxTotalSize()) {
if(parent()) {
m_nextSibling.reset(new EbmlElement(*(parent()), startOffset() + totalSize()));
// check if there's a sibling
// a sibling is assumed directly after this element if the element does not use up the maximum size
if(totalSize() < maxTotalSize()) {
if(parent()) {
m_nextSibling.reset(new EbmlElement(*(parent()), startOffset() + totalSize()));
} else {
// no parent: the sibling gets the container and the remaining size passed explicitly
m_nextSibling.reset(new EbmlElement(container(), startOffset() + totalSize(), maxTotalSize() - totalSize()));
}
} else {
m_nextSibling.reset(new EbmlElement(container(), startOffset() + totalSize(), maxTotalSize() - totalSize()));
m_nextSibling.reset();
}
} else {
m_nextSibling.reset();
// no critical errors occurred
// -> add a warning if bytes have been skipped
if(skipped) {
addNotification(NotificationType::Warning, numberToString<unsigned int>(skipped) + " bytes have been skipped", parsingContext());
}
// -> don't need another try, return here
return;
}
// critical errors occurred and skipping some bytes wasn't successful
// NOTE(review): the loop above has no condition and no break is visible, so this
// statement looks unreachable - confirm.
throw InvalidDataException();
}
/*!

View File

@ -136,6 +136,7 @@ inline bool EbmlElement::isPadding() const
/*!
* \brief Returns the offset of the first child of the element.
* \remarks The returned offset is relative to the start offset of this element.
*/
inline uint64 EbmlElement::firstChildOffset() const
{

View File

@ -354,171 +354,177 @@ void MatroskaContainer::internalParseHeader()
for(EbmlElement *topLevelElement = m_firstElement.get(); topLevelElement; topLevelElement = topLevelElement->nextSibling()) {
try {
topLevelElement->parse();
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Unable to parse top-level element at " + numberToString(topLevelElement->startOffset()) + ".", context);
break;
}
switch(topLevelElement->id()) {
case EbmlIds::Header:
for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) {
try {
subElement->parse();
} catch (Failure &) {
addNotification(NotificationType::Critical, "Unable to parse all childs of EBML header.", context);
break;
switch(topLevelElement->id()) {
case EbmlIds::Header:
for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) {
try {
subElement->parse();
switch(subElement->id()) {
case EbmlIds::Version:
m_version = subElement->readUInteger();
break;
case EbmlIds::ReadVersion:
m_readVersion = subElement->readUInteger();
break;
case EbmlIds::DocType:
m_doctype = subElement->readString();
break;
case EbmlIds::DocTypeVersion:
m_doctypeVersion = subElement->readUInteger();
break;
case EbmlIds::DocTypeReadVersion:
m_doctypeReadVersion = subElement->readUInteger();
break;
case EbmlIds::MaxIdLength:
m_maxIdLength = subElement->readUInteger();
if(m_maxIdLength > EbmlElement::maximumIdLengthSupported()) {
addNotification(NotificationType::Critical, "Maximum EBML element ID length greather then "
+ numberToString<uint32>(EbmlElement::maximumIdLengthSupported())
+ " bytes is not supported.", context);
throw InvalidDataException();
}
break;
case EbmlIds::MaxSizeLength:
m_maxSizeLength = subElement->readUInteger();
if(m_maxSizeLength > EbmlElement::maximumSizeLengthSupported()) {
addNotification(NotificationType::Critical, "Maximum EBML element size length greather then "
+ numberToString<uint32>(EbmlElement::maximumSizeLengthSupported())
+ " bytes is not supported.", context);
throw InvalidDataException();
}
break;
}
addNotifications(*subElement);
} catch(const Failure &) {
addNotifications(*subElement);
addNotification(NotificationType::Critical, "Unable to parse all childs of EBML header.", context);
break;
}
}
switch(subElement->id()) {
case EbmlIds::Version:
m_version = subElement->readUInteger();
break;
case EbmlIds::ReadVersion:
m_readVersion = subElement->readUInteger();
break;
case EbmlIds::DocType:
m_doctype = subElement->readString();
break;
case EbmlIds::DocTypeVersion:
m_doctypeVersion = subElement->readUInteger();
break;
case EbmlIds::DocTypeReadVersion:
m_doctypeReadVersion = subElement->readUInteger();
break;
case EbmlIds::MaxIdLength:
m_maxIdLength = subElement->readUInteger();
if(m_maxIdLength > EbmlElement::maximumIdLengthSupported()) {
addNotification(NotificationType::Critical, "Maximum EBML element ID length greather then "
+ numberToString<uint32>(EbmlElement::maximumIdLengthSupported())
+ " bytes is not supported.", context);
throw InvalidDataException();
}
break;
case EbmlIds::MaxSizeLength:
m_maxSizeLength = subElement->readUInteger();
if(m_maxSizeLength > EbmlElement::maximumSizeLengthSupported()) {
addNotification(NotificationType::Critical, "Maximum EBML element size length greather then "
+ numberToString<uint32>(EbmlElement::maximumSizeLengthSupported())
+ " bytes is not supported.", context);
throw InvalidDataException();
}
break;
}
}
break;
case MatroskaIds::Segment:
++m_segmentCount;
for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) {
try {
subElement->parse();
} catch (Failure &) {
addNotification(NotificationType::Critical, "Unable to parse all childs of \"Segment\"-element.", context);
break;
}
switch(subElement->id()) {
case MatroskaIds::SeekHead:
m_seekInfos.emplace_back(make_unique<MatroskaSeekInfo>());
m_seekInfos.back()->parse(subElement);
addNotifications(*m_seekInfos.back());
break;
case MatroskaIds::Tracks:
if(excludesOffset(m_tracksElements, subElement->startOffset())) {
m_tracksElements.push_back(subElement);
}
break;
case MatroskaIds::SegmentInfo:
if(excludesOffset(m_segmentInfoElements, subElement->startOffset())) {
m_segmentInfoElements.push_back(subElement);
}
break;
case MatroskaIds::Tags:
if(excludesOffset(m_tagsElements, subElement->startOffset())) {
m_tagsElements.push_back(subElement);
}
break;
case MatroskaIds::Chapters:
if(excludesOffset(m_chaptersElements, subElement->startOffset())) {
m_chaptersElements.push_back(subElement);
}
break;
case MatroskaIds::Attachments:
if(excludesOffset(m_attachmentsElements, subElement->startOffset())) {
m_attachmentsElements.push_back(subElement);
}
break;
case MatroskaIds::Cluster:
// cluster reached
// stop here if all relevant information has been gathered
for(auto i = m_seekInfos.cbegin() + seekInfosIndex, end = m_seekInfos.cend(); i != end; ++i, ++seekInfosIndex) {
for(const auto &infoPair : (*i)->info()) {
uint64 offset = currentOffset + topLevelElement->dataOffset() + infoPair.second;
if(offset >= fileInfo().size()) {
addNotification(NotificationType::Critical, "Offset (" + numberToString(offset) + ") denoted by \"SeekHead\" element is invalid.", context);
} else {
auto element = make_unique<EbmlElement>(*this, offset);
try {
element->parse();
if(element->id() != infoPair.first) {
addNotification(NotificationType::Critical, "ID of element " + element->idToString() + " at " + numberToString(offset) + " does not match the ID denoted in the \"SeekHead\" element (0x" + numberToString(infoPair.first, 16) + ").", context);
break;
case MatroskaIds::Segment:
++m_segmentCount;
for(EbmlElement *subElement = topLevelElement->firstChild(); subElement; subElement = subElement->nextSibling()) {
try {
subElement->parse();
switch(subElement->id()) {
case MatroskaIds::SeekHead:
m_seekInfos.emplace_back(make_unique<MatroskaSeekInfo>());
m_seekInfos.back()->parse(subElement);
addNotifications(*m_seekInfos.back());
break;
case MatroskaIds::Tracks:
if(excludesOffset(m_tracksElements, subElement->startOffset())) {
m_tracksElements.push_back(subElement);
}
break;
case MatroskaIds::SegmentInfo:
if(excludesOffset(m_segmentInfoElements, subElement->startOffset())) {
m_segmentInfoElements.push_back(subElement);
}
break;
case MatroskaIds::Tags:
if(excludesOffset(m_tagsElements, subElement->startOffset())) {
m_tagsElements.push_back(subElement);
}
break;
case MatroskaIds::Chapters:
if(excludesOffset(m_chaptersElements, subElement->startOffset())) {
m_chaptersElements.push_back(subElement);
}
break;
case MatroskaIds::Attachments:
if(excludesOffset(m_attachmentsElements, subElement->startOffset())) {
m_attachmentsElements.push_back(subElement);
}
break;
case MatroskaIds::Cluster:
// cluster reached
// stop here if all relevant information has been gathered
for(auto i = m_seekInfos.cbegin() + seekInfosIndex, end = m_seekInfos.cend(); i != end; ++i, ++seekInfosIndex) {
for(const auto &infoPair : (*i)->info()) {
uint64 offset = currentOffset + topLevelElement->dataOffset() + infoPair.second;
if(offset >= fileInfo().size()) {
addNotification(NotificationType::Critical, "Offset (" + numberToString(offset) + ") denoted by \"SeekHead\" element is invalid.", context);
} else {
auto element = make_unique<EbmlElement>(*this, offset);
try {
element->parse();
if(element->id() != infoPair.first) {
addNotification(NotificationType::Critical, "ID of element " + element->idToString() + " at " + numberToString(offset) + " does not match the ID denoted in the \"SeekHead\" element (0x" + numberToString(infoPair.first, 16) + ").", context);
}
switch(element->id()) {
case MatroskaIds::SegmentInfo:
if(excludesOffset(m_segmentInfoElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_segmentInfoElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Tracks:
if(excludesOffset(m_tracksElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_tracksElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Tags:
if(excludesOffset(m_tagsElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_tagsElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Chapters:
if(excludesOffset(m_chaptersElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_chaptersElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Attachments:
if(excludesOffset(m_attachmentsElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_attachmentsElements.emplace_back(m_additionalElements.back().get());
}
break;
default:
;
}
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Can not parse element at " + numberToString(offset) + " (denoted using \"SeekHead\" element).", context);
}
}
switch(element->id()) {
case MatroskaIds::SegmentInfo:
if(excludesOffset(m_segmentInfoElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_segmentInfoElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Tracks:
if(excludesOffset(m_tracksElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_tracksElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Tags:
if(excludesOffset(m_tagsElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_tagsElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Chapters:
if(excludesOffset(m_chaptersElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_chaptersElements.emplace_back(m_additionalElements.back().get());
}
break;
case MatroskaIds::Attachments:
if(excludesOffset(m_attachmentsElements, offset)) {
m_additionalElements.emplace_back(move(element));
m_attachmentsElements.emplace_back(m_additionalElements.back().get());
}
break;
default:
;
}
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Can not parse element at " + numberToString(offset) + " (denoted using \"SeekHead\" element).", context);
}
}
// not checking if m_tagsElements is empty avoids long parsing times when loading big files
// but also has the disadvantage that the parser relies on the presence of a SeekHead element
// (which is not mandatory) to detect tags at the end of the segment
if(((!m_tracksElements.empty() && !m_tagsElements.empty()) || fileInfo().size() > m_maxFullParseSize) && !m_segmentInfoElements.empty()) {
goto finish;
}
break;
}
addNotifications(*subElement);
} catch(const Failure &) {
addNotifications(*subElement);
addNotification(NotificationType::Critical, "Unable to parse all childs of \"Segment\"-element.", context);
break;
}
// not checking if m_tagsElements is empty avoids long parsing times when loading big files
// but also has the disadvantage that the parser relies on the presence of a SeekHead element
// (which is not mandatory) to detect tags at the end of the segment
if(((!m_tracksElements.empty() && !m_tagsElements.empty()) || fileInfo().size() > m_maxFullParseSize) && !m_segmentInfoElements.empty()) {
goto finish;
}
break;
}
currentOffset += topLevelElement->totalSize();
break;
default:
;
}
currentOffset += topLevelElement->totalSize();
addNotifications(*topLevelElement);
} catch(const Failure &) {
addNotifications(*topLevelElement);
addNotification(NotificationType::Critical, "Unable to parse top-level element at " + numberToString(topLevelElement->startOffset()) + ".", context);
break;
default:
;
}
}
// finally parse the "Info"-element and fetch "EditionEntry"-elements
finish:
try {
parseSegmentInfo();
} catch (Failure &) {
} catch(const Failure &) {
addNotification(NotificationType::Critical, "Unable to parse EBML (segment) \"Info\"-element.", context);
}
}