videodownloader/network/finder/linkfinder.cpp

95 lines
3.7 KiB
C++
Raw Normal View History

2015-09-08 17:05:59 +02:00
#include "./linkfinder.h"
2015-04-22 19:32:04 +02:00
#include "../httpdownload.h"
2015-09-08 17:05:59 +02:00
2015-04-22 19:32:04 +02:00
#include "../../application/utils.h"
2020-09-04 00:57:42 +02:00
#include <QRegularExpression>
2015-04-22 19:32:04 +02:00
2019-06-10 22:50:15 +02:00
using namespace CppUtilities;
2015-04-22 19:32:04 +02:00
using namespace Application;
namespace Network {
/*!
* \class LinkFinder
* \brief The LinkFinder class retrieves links from an ordinary webpage.
*/
/*!
* \brief Constructs a new link finder with the specified \a url.
*/
2017-05-01 03:22:50 +02:00
LinkFinder::LinkFinder(const QUrl &url, QObject *parent)
: DownloadFinder(parent)
, m_url(url)
{
}
2015-04-22 19:32:04 +02:00
/*!
 * \brief Creates the HTTP download which fetches the webpage at the URL passed to the constructor.
 * \remarks
 * - The QString argument is not used by this implementation.
 * - The link finder itself is passed as second argument to the HttpDownload constructor
 *   (presumably as QObject parent; verify against HttpDownload).
 */
Download *LinkFinder::createRequest(QString &)
{
    auto *const pageRequest = new HttpDownload(m_url, this);
    return pageRequest;
}
/*!
 * \brief Parses the webpage \a data and reports a result for each link found in it.
 *
 * The content of the first title element (if present) is passed along as meta data of
 * every reported result. Anchors are matched via regular expressions; only anchors with a
 * quoted href attribute (single or double quotes) are considered. Relative URLs are resolved
 * against the URL of the webpage. If a download with the same initial URL has already been
 * found, no further result is reported; the existing download merely receives the link text
 * as title if it has none so far.
 *
 * Anchors are skipped while an HTML comment starts in front of them; the search is then
 * resumed behind that comment.
 *
 * \remarks The QString argument is not used by this implementation.
 * \returns Always returns DownloadFinder::ParsingResult::Success.
 */
DownloadFinder::ParsingResult LinkFinder::parseResults(const QByteArray &data, QString &)
{
    QString html(data);

    // all patterns are case-insensitive and non-greedy
    const auto patternOptions = QRegularExpression::CaseInsensitiveOption | QRegularExpression::InvertedGreedinessOption;
    static const QRegularExpression titlePattern(QStringLiteral("<title>(.+)</title>"), patternOptions);
    static const QRegularExpression linkPattern(QStringLiteral("<a([^>]+)>(.+)</a>"), patternOptions);
    static const QRegularExpression commentPattern(QStringLiteral("<!--(.+)-->"), patternOptions);
    static const QRegularExpression hrefPattern1(QStringLiteral("\\s*href\\s*=\\s*['](.+)['>]"), patternOptions);
    static const QRegularExpression hrefPattern2(QStringLiteral("\\s*href\\s*=\\s*[\"](.+)[\">]"), patternOptions);

    // read the page title
    QString pageTitle;
    const auto titleMatch = titlePattern.match(html);
    if (titleMatch.hasMatch()) {
        pageTitle = titleMatch.captured(1);
        replaceHtmlEntities(pageTitle);
    }

    // walk through all anchors; overallIndex is the offset the next search starts from
    auto commentMatch = commentPattern.match(html, 0);
    decltype(commentMatch.capturedEnd()) overallIndex = 0;
    for (auto linkMatch = linkPattern.match(html, overallIndex); linkMatch.hasMatch(); linkMatch = linkPattern.match(html, overallIndex)) {
        if (commentMatch.capturedStart() >= 0 && commentMatch.capturedStart() < linkMatch.capturedStart()) {
            // skip comment: resume the search behind it and look for the next comment
            // (never move backwards in case the comment ended within an already handled anchor)
            overallIndex = qMax(overallIndex, commentMatch.capturedEnd());
            commentMatch = commentPattern.match(html, overallIndex);
            // continue with the next anchor instead of aborting the whole search
            continue;
        }

        // the next search starts behind this anchor, regardless of whether it yields a result
        overallIndex = linkMatch.capturedEnd();

        // read actual link: first group holds the attributes, second group the link text
        QString title = linkMatch.captured(2);
        const QString attributes = linkMatch.captured(1);
        QString urlStr;
        const auto hrefMatch1 = hrefPattern1.match(attributes);
        if (hrefMatch1.hasMatch()) {
            urlStr = hrefMatch1.captured(1);
        } else {
            const auto hrefMatch2 = hrefPattern2.match(attributes);
            if (hrefMatch2.hasMatch()) {
                urlStr = hrefMatch2.captured(1);
            }
        }
        if (urlStr.isEmpty()) {
            continue;
        }
        replaceHtmlEntities(title);
        replaceHtmlEntities(urlStr);

        // resolve relative URLs
        QUrl url(urlStr);
        if (url.isRelative()) {
            url = m_url.resolved(url);
        }

        // avoid duplicate results
        if (Download *const duplicateDownload = downloadByInitialUrl(url)) {
            if (!title.isEmpty() && duplicateDownload->title().isEmpty()) {
                duplicateDownload->provideMetaData(title);
            }
        } else if (Download *result = Download::fromUrl(url)) {
            result->provideMetaData(title, QString(), TimeSpan(), pageTitle, results().size());
            reportResult(result);
        }
    }
    return DownloadFinder::ParsingResult::Success;
}
2019-07-20 20:20:58 +02:00
} // namespace Network