Skip to content

Commit

Permalink
Don't check preconnect links
Browse files Browse the repository at this point in the history
Preconnect links are used to establish a server connection without loading a
specific resource yet. Not always do these links point to a URL that should
return a 200, and they are not user-facing, i.e. they don't show up in the
final rendered version of a page.

Therefore, I think we should not check them at all; not even in `--include-verbatim`
mode, as they might not point to a valid resource.

Fixes #897
  • Loading branch information
mre committed Jul 29, 2023
1 parent cead4ce commit 3d10611
Show file tree
Hide file tree
Showing 3 changed files with 44 additions and 2 deletions.
20 changes: 20 additions & 0 deletions lychee-lib/src/extract/html/html5ever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ impl TokenSink for LinkExtractor {
}
}

// Check and exclude rel=preconnect. Unlike prefetch and preload,
// preconnect only opens a connection to the origin (DNS lookup plus
// TCP/TLS handshake) and might not be a link to a resource
if let Some(rel) = attrs.iter().find(|attr| &attr.name.local == "rel") {
if rel.value.contains("preconnect") {
return TokenSinkResult::Continue;
}
}

for attr in attrs {
let urls = LinkExtractor::extract_urls_from_elem_attr(
&attr.name.local,
Expand Down Expand Up @@ -136,6 +144,8 @@ impl LinkExtractor {
// and https://html.spec.whatwg.org/multipage/indices.html#attributes-1

match (elem_name, attr_name) {
// TODO: Skip <link rel="preconnect">

// Common element/attribute combinations for links
(_, "href" | "src" | "cite" | "usemap")
// Less common (but still valid!) combinations
Expand Down Expand Up @@ -353,4 +363,14 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}

#[test]
fn test_skip_preconnect() {
    // A `<link>` element whose `rel` is `preconnect` carries an `href`,
    // but the extractor is expected to yield no URI for it.
    let uris = extract_html(
        r#"
<link rel="preconnect" href="https://example.com">
"#,
        false,
    );
    assert!(uris.is_empty());
}
}
23 changes: 22 additions & 1 deletion lychee-lib/src/extract/html/html5gum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@ use super::{is_email_link, is_verbatim_elem, srcset};
use crate::{extract::plaintext::extract_plaintext, types::uri::raw::RawUri};

#[derive(Clone)]
#[allow(clippy::struct_excessive_bools)]
struct LinkExtractor {
// note: what html5gum calls a tag, lychee calls an element
links: Vec<RawUri>,
current_string: Vec<u8>,
current_element_name: Vec<u8>,
current_element_is_closing: bool,
current_element_nofollow: bool,
current_element_preconnect: bool,
current_attribute_name: Vec<u8>,
current_attribute_value: Vec<u8>,
last_start_element: Vec<u8>,
Expand All @@ -33,6 +35,7 @@ impl LinkExtractor {
current_element_name: Vec::new(),
current_element_is_closing: false,
current_element_nofollow: false,
current_element_preconnect: false,
current_attribute_name: Vec::new(),
current_attribute_value: Vec::new(),
last_start_element: Vec::new(),
Expand Down Expand Up @@ -147,7 +150,15 @@ impl LinkExtractor {
if attr == "rel" && value.contains("nofollow") {
self.current_element_nofollow = true;
}
if self.current_element_nofollow {

// Ignore links with rel=preconnect
// Unlike prefetch and preload, preconnect only opens a connection
// to the origin (DNS lookup plus TCP/TLS handshake) without fetching
// a resource, so we don't want to extract those links.
if attr == "rel" && value.contains("preconnect") {
self.current_element_preconnect = true;
}

if self.current_element_nofollow || self.current_element_preconnect {
self.current_attribute_name.clear();
self.current_attribute_value.clear();
return;
Expand Down Expand Up @@ -507,4 +518,14 @@ mod tests {
let uris = extract_html(input, false);
assert_eq!(uris, expected);
}

#[test]
fn test_skip_preconnect() {
    // Feed a single preconnect `<link>` through the extractor with the
    // second argument set to `false`; no URI should come back.
    let uris = extract_html(
        r#"
<link rel="preconnect" href="https://example.com">
"#,
        false,
    );
    assert!(uris.is_empty());
}
}
3 changes: 2 additions & 1 deletion lychee-lib/src/extract/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ mod tests {
let expected_links = IntoIterator::into_iter([
website("https://example.com/"),
website("https://example.com/favicon.ico"),
website("https://fonts.externalsite.com"),
// Note that we exclude `preconnect` links:
// website("https://fonts.externalsite.com"),
website("https://example.com/docs/"),
website("https://example.com/forum"),
])
Expand Down

0 comments on commit 3d10611

Please sign in to comment.