| Server IP : 66.29.153.156 / Your IP : 216.73.216.226 Web Server : LiteSpeed System : Linux premium322.web-hosting.com 4.18.0-553.50.1.lve.el8.x86_64 #1 SMP Thu Apr 17 19:10:24 UTC 2025 x86_64 User : lastyfjz ( 1521) PHP Version : 8.1.34 Disable Function : NONE MySQL : OFF | cURL : ON | WGET : ON | Perl : ON | Python : ON | Sudo : OFF | Pkexec : OFF Directory : /home/lastyfjz/werepairmobile.com/wp-content/plugins/broken-link-checker-seo/app/Links/ |
Upload File : |
<?php
namespace AIOSEO\BrokenLinkChecker\Links;
// Exit if accessed directly.
if ( ! defined( 'ABSPATH' ) ) {
exit;
}
use AIOSEO\BrokenLinkChecker\Models;
use AIOSEO\BrokenLinkChecker\Utils\BlcHtmlTagProcessor;
/**
* Handles the extraction, parsing and storage of links for the links scan.
*
* @since 1.0.0
*/
class Data {
/**
* The ignored extensions.
*
* @since 1.0.0
*
* @var array
*/
private $ignoredExtensions = [];
/**
* Class constructor.
*
* @since 1.0.0
*/
public function __construct() {
$this->setIgnoredExtensions();
}
/**
* Indexes the links in the given post.
*
* @since 1.0.0
*
* @param int $postId The post ID.
* @return void
*/
public function indexLinks( $postId ) {
$post = get_post( $postId );
if ( ! is_a( $post, 'WP_Post' ) ) {
return;
}
// Delete all links first. We have to do this in order to remove old links that no longer exist.
Models\Link::deleteLinks( $postId );
$links = $this->extractLinks( $postId, $post->post_content );
if ( empty( $links ) ) {
return;
}
$this->storeLinks( $links );
}
/**
* Stores the given links to the DB.
*
* @since 1.0.0
*
* @param array $links The links.
* @return void
*/
private function storeLinks( $links ) {
$columns = [
'post_id',
'blc_link_status_id',
'url',
'url_hash',
'hostname',
'hostname_url',
'external',
'anchor',
'phrase',
'phrase_html',
'paragraph',
'paragraph_html',
'is_video',
'created',
'updated'
];
$currentDate = gmdate( 'Y-m-d H:i:s' );
$urls = [];
$rows = [];
foreach ( $links as $linkData ) {
$data = Models\Link::sanitizeLink( $linkData );
if ( empty( $data ) ) {
continue;
}
if ( ! Models\Link::validateLink( $data ) ) {
continue;
}
$urls[ $data['url_hash'] ] = $data['url'];
$rows[] = array_merge( array_values( $data ), [ $currentDate, $currentDate ] );
}
aioseoBrokenLinkChecker()->core->db->bulkInsert( 'aioseo_blc_links', $columns, $rows );
$existing = aioseoBrokenLinkChecker()->core->db->start( 'aioseo_blc_link_status' )
->select( 'url_hash' )
->whereIn( 'url_hash', array_keys( $urls ) )
->run()
->result();
foreach ( $existing as $row ) {
unset( $urls[ $row->url_hash ] );
}
if ( empty( $urls ) ) {
return;
}
foreach ( $urls as $hash => $url ) {
$statusId = aioseoBrokenLinkChecker()->core->db->insert( 'aioseo_blc_link_status' )
->set( [
'url' => $url,
'url_hash' => $hash,
'created' => aioseoBrokenLinkChecker()->helpers->timeToMysql( time() ),
'updated' => aioseoBrokenLinkChecker()->helpers->timeToMysql( time() )
] )
->run()
->insertId();
aioseoBrokenLinkChecker()->core->db->update( 'aioseo_blc_links' )
->where( 'url', $url )
->set( [
'blc_link_status_id' => $statusId
] )
->run();
}
}
/**
* Returns the links that are in the post content.
*
* Uses WP_HTML_Tag_Processor (WP 6.6+) for reliable HTML parsing that handles
* all quote styles, attribute orders and edge cases. Falls back to regex on older WP versions.
*
* @since 1.0.0
* @since 1.3.0 Rewritten to use WP_HTML_Tag_Processor with regex fallback.
*
* @param int $postId The post ID.
* @param string $postContent The post content.
* @return array The links.
*/
private function extractLinks( $postId, $postContent ) {
$postContent = aioseoBrokenLinkChecker()->helpers->decodeHtmlEntities( $postContent );
// Strip data URIs to prevent catastrophic backtracking.
$postContent = preg_replace( '/data:[^;]+;base64,[^"]+/', '', (string) $postContent ) ?? $postContent;
// WP 6.6+: next_token() (6.5) and the bookmark length fix (6.6, Trac #61301) are available.
if ( version_compare( get_bloginfo( 'version' ), '6.6', '>=' ) ) {
$links = $this->extractLinksHtmlApi( $postId, $postContent );
// Non-null means the HTML API path completed (even if zero links found).
// Null means bookmark internals failed and we should fall back to regex.
if ( null !== $links ) {
// The HTML API path only walks <a> tags, so oEmbed video blocks are extracted separately.
return array_merge( $links, $this->extractEmbeddedVideos( $postId, $postContent ) );
}
}
return $this->extractLinksRegex( $postId, $postContent );
}
/**
* Extracts links from post content using BlcHtmlTagProcessor with bookmarks.
*
* Single-pass approach: iterates <a> tags, uses bookmarks to get byte offsets
* for the opener and closer, then extracts anchor text and surrounding sentence
* context directly from the original content string. No markers, no regex for
* link identification, no content mutation.
*
* Returns null if the WP_HTML_Tag_Processor bookmark internals have changed,
* signalling the caller to fall back to regex-based extraction.
*
* Requires WP 6.6+ — the caller gates on this version.
*
* @since 1.3.0
*
* @param int $postId The post ID.
* @param string $postContent The preprocessed post content.
* @return array|null The links, or null if bookmark internals failed.
*/
private function extractLinksHtmlApi( $postId, $postContent ) {
$processor = new BlcHtmlTagProcessor( $postContent );
$links = [];
$prevCloserEnd = 0;
$seekTo = null;
while ( true ) {
if ( $seekTo ) {
$processor->seek( $seekTo );
$processor->release_bookmark( $seekTo );
$seekTo = null;
} elseif ( ! $processor->next_tag( 'a' ) ) {
break;
}
$href = $processor->get_attribute( 'href' );
if ( ! is_string( $href ) || '' === $href || '#' === $href[0] || preg_match( '/^(tel:|mailto:|javascript:|data:)/i', $href ) ) {
// Advance past the closing </a> so $prevCloserEnd stays accurate.
// Without this, the next link's phrase boundary search would scan
// through this filtered link's href attributes (e.g. the "." in "tel:+1.555.1234").
$prevCloserEnd = $this->skipPastCloser( $processor, $prevCloserEnd );
continue;
}
// Bookmark the opening <a> tag to get its byte offsets.
$openerBookmark = 'blc_opener';
$processor->set_bookmark( $openerBookmark );
$openerStart = $processor->getBookmarkStart( $openerBookmark );
$openerLength = $processor->getBookmarkLength( $openerBookmark );
$processor->release_bookmark( $openerBookmark );
// If bookmark was set but getters return null, WP core internals have changed.
if ( null === $openerStart || null === $openerLength ) {
return null;
}
$openerEnd = $openerStart + $openerLength;
// Advance through tokens until we find the closing </a> tag.
$closerStart = null;
$closerEnd = null;
while ( $processor->next_token() ) {
$tokenType = $processor->get_token_type();
if ( '#tag' !== $tokenType || 'A' !== $processor->get_tag() ) {
continue;
}
// Nested opener (malformed HTML) — bookmark for reprocessing, stop walking.
if ( ! $processor->is_tag_closer() ) {
if ( $processor->set_bookmark( 'blc_nested' ) ) {
$seekTo = 'blc_nested';
}
break;
}
$processor->set_bookmark( 'blc_closer' );
$closerStart = $processor->getBookmarkStart( 'blc_closer' );
$closerLength = $processor->getBookmarkLength( 'blc_closer' );
$processor->release_bookmark( 'blc_closer' );
// If bookmark was set but getters return null, WP core internals have changed.
if ( null === $closerStart || null === $closerLength ) {
return null;
}
$closerEnd = $closerStart + $closerLength;
break;
}
// Skip malformed HTML with no closing </a>.
if ( null === $closerEnd ) {
continue;
}
// Extract the inner HTML and anchor text.
$innerHtml = substr( $postContent, $openerEnd, $closerStart - $openerEnd );
$anchor = wp_strip_all_tags( $innerHtml );
// Save this link's closer position before any early-continue filters below.
// This ensures the next iteration's phrase boundary search won't scan
// through this link's href attributes (which contain . ? ! characters).
$phraseSearchFrom = $prevCloserEnd;
$prevCloserEnd = $closerEnd;
$parsedUrl = $this->parseUrl( $href, $postId );
if ( empty( $parsedUrl['host'] ) ) {
continue;
}
if (
! empty( $parsedUrl['path'] ) &&
preg_match( '/\.(.*)$/i', $parsedUrl['path'], $extension ) &&
! empty( $extension[1] ) &&
in_array( $extension[1], $this->ignoredExtensions, true )
) {
continue;
}
// NOTE: We need to check this here before we strip off the "www" part.
// Otherwise we will not be able to detect internal links on sites running on "www".
$isInternal = $parsedUrl['host'] === $this->getHostname();
$hostname = aioseoBrokenLinkChecker()->helpers->pregReplace( '/www\./i', '', $parsedUrl['host'] );
// Reformat the URL to get rid of params and fragments.
$url = aioseoBrokenLinkChecker()->helpers->buildUrl( $parsedUrl, [], [ 'fragment' ] );
// We need to sanitize the URL here so the hash is calculated based on the escaped version.
$url = trim( sanitize_url( $url ) );
$url = apply_filters( 'aioseo_blc_link_url_before_save', $url );
$isVideo = $this->isVideoUrl( $url );
$phraseHtml = $this->extractPhraseHtml( $postContent, $phraseSearchFrom, $openerStart, $closerEnd );
$phrase = wp_strip_all_tags( $phraseHtml );
$phrase = trim( $phrase );
// Don't continue if the anchor or phrase are empty, e.g. blank link tag.
// If it's a video, we don't mandate an anchor or phrase.
if ( ( ! $anchor || ! $phrase ) && ! $isVideo ) {
continue;
}
$paragraph = aioseoBrokenLinkChecker()->main->paragraph->get( $postId, $postContent, $phrase );
$paragraphHtml = aioseoBrokenLinkChecker()->main->paragraph->getHtml( $anchor, $paragraph, $postContent );
$linkData = [
'post_id' => (int) $postId,
'blc_link_status_id' => $this->getLinkStatusId( $url ),
'url' => $url,
'url_hash' => sha1( $url ),
'hostname' => $hostname,
'hostname_url' => sha1( $hostname ),
'external' => ! $isInternal,
'anchor' => $anchor,
'phrase' => $phrase,
'phrase_html' => $phraseHtml,
'paragraph' => $paragraph,
'paragraph_html' => $paragraphHtml,
'is_video' => $isVideo
];
$links[] = $linkData;
}
return $links;
}
/**
* Regex-based link extraction for WP < 6.6.
*
* @since 1.0.0
* @since 1.3.0 Renamed from extractLinks().
*
* @param int $postId The post ID.
* @param string $postContent The preprocessed post content.
* @return array The links.
*/
private function extractLinksRegex( $postId, $postContent ) {
/**
* Regex pattern divided into groups:
* 0 - Full phrase with link tag.
* 2 - Start of the phrase, before the anchor.
* 4 - The URL.
* 6 - The anchor.
* 7 - The oEmbed URL.
* 9 - The end of the phrase, after the anchor.
* 10 - The ending punctuation mark.
*/
preg_match_all(
'/(([^\r\n.?!]*)<t?a[^>]*?href=(\"|\')(?!tel:|mailto:)([^\"\']*?)(\"|\')[^>]*?>([\s\w\W]*?)<\/t?a>|<!-- wp:(?:core-embed\/wordpress|embed) [^{]*{"url":"([^"]*?)"[^}]*} -->|(?:>| |\s)((?:(?:http|ftp|https)\:\/\/)(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]))(?:<| |\s))([^<>.?!\r\n]*)([.?!]?)/i', // phpcs:disable Generic.Files.LineLength.MaxExceeded
(string) $postContent,
$matches
);
if ( empty( $matches[0] ) ) {
return [];
}
$links = [];
foreach ( $matches[0] as $k => $v ) {
if (
( empty( $matches[4][ $k ] ) || empty( $matches[6][ $k ] ) ) && // Link tag URL or anchor
empty( $matches[7][ $k ] ) // oEmbedded URL
) {
continue;
}
$oEmbed = ! empty( $matches[7][ $k ] ) ? true : false;
$capturedUrl = $matches[4][ $k ] ? $matches[4][ $k ] : $matches[7][ $k ];
$parsedUrl = $this->parseUrl( $capturedUrl, $postId );
if ( empty( $parsedUrl['host'] ) ) {
continue;
}
if (
! empty( $parsedUrl['path'] ) &&
preg_match( '/\.(.*)$/i', $parsedUrl['path'], $extension ) &&
! empty( $extension[1] ) &&
in_array( $extension[1], $this->ignoredExtensions, true )
) {
continue;
}
// NOTE: We need to check this here before we strip off the "www" part.
// Otherwise we will not be able to detect internal links on sites running on "www".
$isInternal = $parsedUrl['host'] === $this->getHostname();
$hostname = aioseoBrokenLinkChecker()->helpers->pregReplace( '/www\./i', '', $parsedUrl['host'] );
// Reformat the URL to strip fragments but preserve query params (e.g. YouTube ?v=).
$url = aioseoBrokenLinkChecker()->helpers->buildUrl( $parsedUrl, [], [ 'fragment' ] );
$isVideo = $this->isVideoUrl( $url );
$anchor = wp_strip_all_tags( $matches[6][ $k ] );
// Remove trailing URL tags. The regex isn't sufficient for this.
$phrase = wp_strip_all_tags( $matches[0][ $k ] );
$phrase = trim( preg_replace( '/(.*)(<t?a[^<>].*$)/', '', (string) $phrase ) ?? $phrase );
// Don't continue if the anchor or phrase are empty, e.g. blank link tag.
// If it's a video, we don't mandate an anchor or phrase.
if (
( ! $anchor || ! $phrase ) &&
! $isVideo
) {
continue;
}
$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripIncompleteHtmlTags( $matches[0][ $k ] );
$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripScriptTags( $phraseHtml );
$phraseHtml = aioseoBrokenLinkChecker()->helpers->trimParagraphTags( $phraseHtml );
// oEmbed blocks reduce to an empty phrase once the block comment is stripped, so don't skip videos here.
if ( empty( $phraseHtml ) && ! $isVideo ) {
continue;
}
$paragraph = '';
$paragraphHtml = '';
if ( ! $oEmbed ) {
$paragraph = aioseoBrokenLinkChecker()->main->paragraph->get( $postId, $postContent, $phrase );
$paragraphHtml = aioseoBrokenLinkChecker()->main->paragraph->getHtml( $anchor, $paragraph, $postContent );
}
// We need to sanitize the URL here so the hash is calculated based on the escaped version.
$url = trim( sanitize_url( $url ) );
$url = apply_filters( 'aioseo_blc_link_url_before_save', $url );
$linkData = [
'post_id' => (int) $postId,
'blc_link_status_id' => $this->getLinkStatusId( $url ),
'url' => $url,
'url_hash' => sha1( $url ),
'hostname' => $hostname,
'hostname_url' => sha1( $hostname ),
'external' => ! $isInternal,
'anchor' => $anchor,
'phrase' => $phrase,
'phrase_html' => $phraseHtml,
'paragraph' => $paragraph,
'paragraph_html' => $paragraphHtml,
'is_video' => $isVideo
];
$links[] = $linkData;
}
return $links;
}
/**
* Checks whether the given URL points to a video based on known provider patterns.
*
* @since 1.3.0
*
* @param string $url The URL to check.
* @return bool Whether the URL is a video.
*/
private function isVideoUrl( $url ) {
$videoPatterns = [
'/https?:\/\/((m|www)\.)?youtube\.com\/watch.*/i',
'/https?:\/\/((m|www)\.)?youtube\.com\/playlist.*/i',
'/https?:\/\/((m|www)\.)?youtube\.com\/shorts\/.*/i',
'/https?:\/\/((m|www)\.)?youtube\.com\/live\/.*/i',
'/https?:\/\/music\.youtube\.com\/watch.*/i',
'/https?:\/\/youtu\.be\/.*/i',
'/https?:\/\/(.+)?(wistia\.com|wi\.st)\/(medias|embed)\/.*/i',
'/https?:\/\/(www\.)?vimeo\.com\/\d+/i',
'/https?:\/\/(www\.)?vimeo\.com\/(video|channels\/.+|album\/.+\/video)\/\d+/i',
'/https?:\/\/(www\.)?vimeo\.com\/(ondemand|showcase)\/.+/i',
'/https?:\/\/player\.vimeo\.com\/video\/\d+/i',
'/https?:\/\/(www\.)?dailymotion\.com\/(video|embed\/video)\/.*/i',
'/https?:\/\/dai\.ly\/.*/i',
'/https?:\/\/wordpress\.tv\/.*/i',
'/https?:\/\/videopress\.com\/v\/.*/i'
];
foreach ( $videoPatterns as $pattern ) {
if ( preg_match( $pattern, (string) $url ) ) {
return true;
}
}
return false;
}
/**
* Extracts oEmbed video links from Gutenberg embed block comments.
*
* The HTML API path only walks <a> tags, so embedded videos (which live in
* wp:embed block comments, not anchors) must be extracted separately.
*
* @since 1.3.0
*
* @param int $postId The post ID.
* @param string $postContent The preprocessed post content.
* @return array The video links.
*/
private function extractEmbeddedVideos( $postId, $postContent ) {
preg_match_all(
'/<!-- wp:(?:core-embed\/\w+|embed) [^{]*{"url":"([^"]+)"[^}]*} -->/i',
(string) $postContent,
$matches
);
if ( empty( $matches[1] ) ) {
return [];
}
$links = [];
foreach ( $matches[1] as $capturedUrl ) {
// Block comment JSON escapes forward slashes, e.g. https:\/\/youtu.be\/x.
$capturedUrl = stripslashes( $capturedUrl );
$parsedUrl = $this->parseUrl( $capturedUrl );
if ( empty( $parsedUrl['host'] ) ) {
continue;
}
// Reformat the URL to strip fragments but preserve query params (e.g. YouTube ?v=).
$url = aioseoBrokenLinkChecker()->helpers->buildUrl( $parsedUrl, [], [ 'fragment' ] );
if ( ! $this->isVideoUrl( $url ) ) {
continue;
}
// NOTE: We need to check this here before we strip off the "www" part.
// Otherwise we will not be able to detect internal links on sites running on "www".
$isInternal = $parsedUrl['host'] === $this->getHostname();
$hostname = aioseoBrokenLinkChecker()->helpers->pregReplace( '/www\./i', '', $parsedUrl['host'] );
// We need to sanitize the URL here so the hash is calculated based on the escaped version.
$url = trim( sanitize_url( $url ) );
$url = apply_filters( 'aioseo_blc_link_url_before_save', $url );
$links[] = [
'post_id' => (int) $postId,
'blc_link_status_id' => $this->getLinkStatusId( $url ),
'url' => $url,
'url_hash' => sha1( $url ),
'hostname' => $hostname,
'hostname_url' => sha1( $hostname ),
'external' => ! $isInternal,
'anchor' => '',
'phrase' => '',
'phrase_html' => '',
'paragraph' => '',
'paragraph_html' => '',
'is_video' => true
];
}
return $links;
}
/**
* Extracts the phrase HTML surrounding a link from the post content.
*
* Finds sentence boundaries before and after the link using punctuation
* delimiters, then strips incomplete HTML tags, script tags and paragraph wrappers.
*
* @since 1.3.0
*
* @param string $postContent The full post content.
* @param int $phraseSearchFrom The byte offset to start searching for sentence boundaries.
* @param int $openerStart The byte offset of the opening <a> tag.
* @param int $closerEnd The byte offset of the end of the closing </a> tag.
* @return string The phrase HTML.
*/
private function extractPhraseHtml( $postContent, $phraseSearchFrom, $openerStart, $closerEnd ) {
// Find sentence boundary before the link.
// Only search between the previous link's </a> end and this opener,
// to avoid finding delimiters inside previous links' href attributes
// (e.g. the "." in "alpha.com" would produce a garbled phrase).
$textBefore = substr( $postContent, $phraseSearchFrom, $openerStart - $phraseSearchFrom );
$phraseStart = $phraseSearchFrom;
foreach ( [ '.', '?', '!', "\r", "\n" ] as $delimiter ) {
$pos = strrpos( $textBefore, $delimiter );
if ( false !== $pos ) {
// Start after the delimiter.
$phraseStart = max( $phraseStart, $phraseSearchFrom + $pos + 1 );
}
}
// Find sentence boundary after the link.
$textAfter = substr( $postContent, $closerEnd );
$boundaryLen = strcspn( $textAfter, '<>.?!' . "\r\n" );
$phraseEnd = $closerEnd + $boundaryLen;
// Include trailing punctuation if present.
if ( $phraseEnd < strlen( $postContent ) && false !== strpos( '.?!', $postContent[ $phraseEnd ] ) ) {
$phraseEnd++;
}
$phraseHtml = substr( $postContent, $phraseStart, $phraseEnd - $phraseStart );
$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripIncompleteHtmlTags( $phraseHtml );
$phraseHtml = aioseoBrokenLinkChecker()->helpers->stripScriptTags( $phraseHtml );
$phraseHtml = aioseoBrokenLinkChecker()->helpers->trimParagraphTags( $phraseHtml );
return $phraseHtml;
}
/**
* Advances the processor past the closing </a> tag for a filtered link.
*
* Used when a link is skipped (tel:, mailto:, etc.) to keep $prevCloserEnd
* accurate so that subsequent phrase boundary searches don't scan through
* this link's href attributes.
*
* @since 1.3.0
*
* @param BlcHtmlTagProcessor $processor The processor, positioned on an <a> opener.
* @param int $prevCloserEnd The current closer-end byte offset.
* @return int The updated closer-end byte offset.
*/
private function skipPastCloser( $processor, $prevCloserEnd ) {
while ( $processor->next_token() ) {
if ( '#tag' !== $processor->get_token_type() || 'A' !== $processor->get_tag() ) {
continue;
}
// Found a closing </a> — bookmark to get its end offset.
if ( $processor->is_tag_closer() ) {
$processor->set_bookmark( 'blc_skip_closer' );
$start = $processor->getBookmarkStart( 'blc_skip_closer' );
$length = $processor->getBookmarkLength( 'blc_skip_closer' );
$processor->release_bookmark( 'blc_skip_closer' );
if ( null !== $start && null !== $length ) {
return $start + $length;
}
}
// Nested opener or bookmark failure — stop walking.
break;
}
return $prevCloserEnd;
}
/**
* Return the link status ID.
*
* @since 1.0.0
*
* @param string $url The URL to look up.
* @return int|null The link status ID.
*/
private function getLinkStatusId( $url ) {
static $linkStatusId = [];
$hash = sha1( $url );
if ( isset( $linkStatusId[ $hash ] ) ) {
return $linkStatusId[ $hash ];
}
$possibleLinkStatusId = aioseoBrokenLinkChecker()->core->db->start( 'aioseo_blc_link_status' )
->where( 'url_hash', $hash )
->run()
->result();
$linkStatusId[ $hash ] = ! empty( $possibleLinkStatusId ) ? $possibleLinkStatusId[0]->id : null;
return $linkStatusId[ $hash ];
}
/**
* Returns the site's hostname.
*
* @since 1.0.0
*
* @return string The hostname.
*/
private function getHostname() {
static $siteUrl = null;
if ( null === $siteUrl ) {
$siteUrl = wp_parse_url( get_site_url(), PHP_URL_HOST );
}
return $siteUrl;
}
/**
* Returns the parsed URL.
*
* Relative URLs (e.g. `../foo`, `./foo`, `foo`, `/foo`) are resolved against
* the post's permalink per RFC 3986 §5.2 before parsing, so that `..` and `.`
* segments are collapsed and missing path separators are inserted. This
* prevents malformed values like `https://example.com../foo` from being stored.
*
* @since 1.0.0
* @version 1.1.1 Renamed method.
* @version 1.3.0 Added $postId parameter; resolves relative URLs against the post permalink.
*
* @param string $url The URL.
* @param int $postId The ID of the post the URL was extracted from.
* @return array The parsed URL.
*/
private function parseUrl( $url, $postId = 0 ) {
$parsedUrl = wp_parse_url( $url );
if ( empty( $parsedUrl ) ) {
return [];
}
// If the URL is relative, resolve it against the post's permalink so that
// `..` and `.` segments are collapsed and a proper path separator is inserted.
if ( empty( $parsedUrl['host'] ) ) {
$baseUrl = $postId ? get_permalink( $postId ) : '';
if ( ! $baseUrl ) {
$baseUrl = get_site_url();
}
$absoluteUrl = \WP_Http::make_absolute_url( $url, $baseUrl );
$parsedUrl = wp_parse_url( $absoluteUrl );
if ( empty( $parsedUrl ) ) {
return [];
}
// make_absolute_url() resolves `..` during the merge but leaves `.`
// segments behind, so collapse those to keep stored URLs canonical.
if ( ! empty( $parsedUrl['path'] ) ) {
$path = preg_replace( '#/\.(?=/)#', '', $parsedUrl['path'] ); // /./ -> /
$parsedUrl['path'] = preg_replace( '#/\.$#', '/', $path ); // trailing /. -> /
}
// Fallback in case the base URL itself lacked a host (shouldn't happen, but be safe).
if ( empty( $parsedUrl['host'] ) ) {
$parsedUrl['host'] = $this->getHostname();
$parsedUrl['scheme'] = wp_parse_url( get_site_url(), PHP_URL_SCHEME );
}
}
return $parsedUrl;
}
/**
* Returns the posts to scan.
*
* @since 1.0.0
* @version 1.3.0 Count distinct post IDs so legacy duplicate rows in aioseo_blc_posts don't inflate the count and stall the scan percentage.
*
* @param bool $countOnly Whether to return only the count.
* @return array|int The posts to scan or a count.
*/
public function getPostsToScan( $countOnly = false ) {
$postsPerScan = apply_filters( 'aioseo_blc_links_posts_per_scan', 50 );
$postTypes = aioseoBrokenLinkChecker()->helpers->getScannablePostTypes();
$postStatuses = aioseoBrokenLinkChecker()->helpers->getPublicPostStatuses( true );
// phpcs:ignore WordPress.DateTime.RestrictedFunctions.date_date
$minimumLinkScanDate = esc_sql( aioseoBrokenLinkChecker()->internalOptions->internal->minimumLinkScanDate ?: date( 'Y-m-d H:i:s' ) );
$query = aioseoBrokenLinkChecker()->core->db->start( 'posts as p' )
->leftJoin( 'aioseo_blc_posts as abp', 'p.ID = abp.post_id' )
->whereIn( 'p.post_status', $postStatuses )
->whereIn( 'p.post_type', $postTypes )
->whereRaw( "(
abp.post_id IS NULL OR
abp.link_scan_date < p.post_modified_gmt OR
abp.link_scan_date IS NULL OR
abp.link_scan_date < '$minimumLinkScanDate'
)" );
if ( $countOnly ) {
return $query->count( 'DISTINCT p.ID' );
}
$postsToScan = $query
->select( 'DISTINCT p.ID, p.post_content, p.post_type, p.post_status' )
->limit( $postsPerScan )
->run()
->result();
return $postsToScan;
}
/**
* Returns the total number of scannable posts.
*
* @since 1.0.0
*
* @return int The total number of scannable posts.
*/
private function getTotalScannablePosts() {
$postTypes = aioseoBrokenLinkChecker()->helpers->getScannablePostTypes();
$postStatuses = aioseoBrokenLinkChecker()->helpers->getPublicPostStatuses( true );
$query = aioseoBrokenLinkChecker()->core->db->start( 'posts as p' )
->whereIn( 'p.post_status', $postStatuses )
->whereIn( 'p.post_type', $postTypes );
return $query->count();
}
/**
* Returns the scan percentage.
*
* @since 1.0.0
*
* @return int The scan percentage.
*/
public function getScanPercentage() {
$postsToScan = $this->getPostsToScan( true );
$totalScannablePosts = $this->getTotalScannablePosts();
if ( 0 === $postsToScan || 0 === $totalScannablePosts ) {
return 100;
}
return ceil( 100 - ( ( $postsToScan / $totalScannablePosts ) * 100 ) );
}
/**
* Sets the ignored extensions.
*
* @since 1.0.0
*
* @return void
*/
private function setIgnoredExtensions() {
$this->ignoredExtensions = apply_filters( 'aioseo_blc_ignored_extensions', [
// Executable files
'apk',
'bat',
'bin',
'cgi',
'com',
'exe',
'gadget',
'jar',
'py',
'wsf',
] );
}
}