videodownloader/network/finder/linkfinder.cpp

98 lines
3.6 KiB
C++
Raw Normal View History

2015-09-08 17:05:59 +02:00
#include "./linkfinder.h"
2015-04-22 19:32:04 +02:00
#include "../httpdownload.h"
2015-09-08 17:05:59 +02:00
2015-04-22 19:32:04 +02:00
#include "../../application/utils.h"
#include <QRegExp>
2019-06-10 22:50:15 +02:00
using namespace CppUtilities;
2015-04-22 19:32:04 +02:00
using namespace Application;
namespace Network {
/*!
* \class LinkFinder
* \brief The LinkFinder class retrieves links from an ordinary webpage.
*/
/*!
* \brief Constructs a new link finder with the specified \a url.
*/
2017-05-01 03:22:50 +02:00
LinkFinder::LinkFinder(const QUrl &url, QObject *parent)
: DownloadFinder(parent)
, m_url(url)
{
}
2015-04-22 19:32:04 +02:00
/*!
 * \brief Creates the request used to fetch the webpage to be scanned.
 *
 * The QString reference parameter is not used by this implementation.
 *
 * \returns Returns a new HttpDownload for the URL passed to the constructor; the
 *          finder itself is passed as second constructor argument (presumably the
 *          QObject parent - confirm against HttpDownload).
 */
Download *LinkFinder::createRequest(QString &)
{
    Download *const pageRequest = new HttpDownload(m_url, this);
    return pageRequest;
}
/*!
 * \brief Scans the specified \a data for HTML anchors and reports a download for each link found.
 *
 * The page title (content of the first <title> element, if any) is extracted first and
 * passed on to every reported download. Afterwards the document is scanned from front to
 * back for <a ...>...</a> elements. Anchors starting after the beginning of an HTML comment
 * which has not been skipped yet are ignored by skipping the comment as a whole.
 *
 * For each anchor with a quoted, non-empty href attribute the URL is resolved against the
 * URL of the scanned page (if relative). If a download with the same initial URL has already
 * been reported, only its title is supplemented (if still empty); otherwise a new download is
 * created via Download::fromUrl() and reported via reportResult().
 *
 * \param data Specifies the raw content of the retrieved webpage.
 * \remarks The QString reference parameter is not used by this implementation.
 * \returns Always returns DownloadFinder::ParsingResult::Success, even if no links have been found.
 */
DownloadFinder::ParsingResult LinkFinder::parseResults(const QByteArray &data, QString &)
{
    const QString html(data);

    QRegExp titlePattern(QStringLiteral("<title>(.+)</title>"), Qt::CaseInsensitive);
    // require whitespace after the tag name so tags merely starting with "a" (<abbr>, <audio>,
    // <article>, ...) are not mistaken for anchors; such a false match would swallow everything
    // up to the next </a> and hence hide the actual link
    QRegExp linkPattern(QStringLiteral("<a(\\s[^>]+)>(.+)</a>"), Qt::CaseInsensitive);
    QRegExp commentPattern(QStringLiteral("<!--(.+)-->"), Qt::CaseInsensitive);
    QRegExp hrefPattern1(QStringLiteral("\\s*href\\s*=\\s*['](.+)['>]"), Qt::CaseInsensitive);
    QRegExp hrefPattern2(QStringLiteral("\\s*href\\s*=\\s*[\"](.+)[\">]"), Qt::CaseInsensitive);
    // use non-greedy matching so each pattern stops at the first possible terminator
    titlePattern.setMinimal(true);
    linkPattern.setMinimal(true);
    commentPattern.setMinimal(true);
    hrefPattern1.setMinimal(true);
    hrefPattern2.setMinimal(true);

    // determine the page title
    QString pageTitle;
    if (titlePattern.indexIn(html) >= 0) {
        pageTitle = titlePattern.cap(1);
        replaceHtmlEntities(pageTitle);
    }

    int overallIndex = 0; // position the search for the next anchor starts from
    int commentIndex = commentPattern.indexIn(html, overallIndex);
    int linkIndex = 0;
    while ((linkIndex = linkPattern.indexIn(html, overallIndex)) >= 0) {
        if (commentIndex >= 0 && commentIndex < linkIndex) {
            // skip comment; never move the search position backwards in case the comment
            // is located within an anchor which has already been consumed
            const int commentEnd = commentIndex + commentPattern.matchedLength();
            if (commentEnd > overallIndex) {
                overallIndex = commentEnd;
            }
            commentIndex = commentPattern.indexIn(html, overallIndex);
            continue;
        }

        // read actual link: 1st capture holds the attributes, 2nd capture the anchor content
        QString title(linkPattern.cap(2));
        const QString attributes(linkPattern.cap(1));
        QString urlStr;
        if (hrefPattern1.indexIn(attributes) >= 0) {
            urlStr = hrefPattern1.cap(1); // single-quoted href
        } else if (hrefPattern2.indexIn(attributes) >= 0) {
            urlStr = hrefPattern2.cap(1); // double-quoted href
        }

        if (!urlStr.isEmpty()) {
            replaceHtmlEntities(title);
            replaceHtmlEntities(urlStr);
            // resolve relative URLs
            QUrl url(urlStr);
            if (url.isRelative()) {
                url = m_url.resolved(url);
            }
            // avoid duplicate results
            if (Download *duplicateDownload = downloadByInitialUrl(url)) {
                if (!title.isEmpty() && duplicateDownload->title().isEmpty()) {
                    duplicateDownload->provideMetaData(title);
                }
            } else if (Download *result = Download::fromUrl(url)) {
                result->provideMetaData(title, QString(), TimeSpan(), pageTitle, results().size());
                reportResult(result);
            }
        }

        // continue searching after the anchor which has just been processed
        overallIndex = linkIndex + linkPattern.matchedLength();
    }
    return DownloadFinder::ParsingResult::Success;
}
2019-07-20 20:20:58 +02:00
} // namespace Network