Skip to content

Commit

Permalink
Don't parse hostname from netloc manually; rely on urlsplit's result (GH-348)
Browse files Browse the repository at this point in the history

This manual parsing of netloc can be fooled by use of a userinfo component.
SplitResult already has a hostname property.

New test `test_host_whitelist_sneaky_userinfo` fails on master.
  • Loading branch information
timmc committed Jun 16, 2023
1 parent ea11793 commit 089d436
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 4 deletions.
7 changes: 3 additions & 4 deletions src/lxml/html/clean.py
Expand Up @@ -490,11 +490,10 @@ def allow_embedded_url(self, el, url):
"""
if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
return False
scheme, netloc, path, query, fragment = urlsplit(url)
netloc = netloc.lower().split(':', 1)[0]
if scheme not in ('http', 'https'):
parts = urlsplit(url)
if parts.scheme not in ('http', 'https'):
return False
if netloc in self.host_whitelist:
if parts.hostname in self.host_whitelist:
return True
return False

Expand Down
20 changes: 20 additions & 0 deletions src/lxml/html/tests/test_clean.py
Expand Up @@ -271,6 +271,26 @@ def test_formaction_attribute_in_button_input(self):
expected,
cleaner.clean_html(html))

def test_host_whitelist_valid(self):
    # An iframe whose src points at a whitelisted host must be kept;
    # the expected output also shows the iframe gaining its closing tag.
    sanitizer = Cleaner(frames=False, host_whitelist=["example.com"])
    markup = '<div><iframe src="https://example.com/page"></div>'
    cleaned = sanitizer.clean_html(markup)
    self.assertEqual(
        '<div><iframe src="https://example.com/page"></iframe></div>',
        cleaned)

def test_host_whitelist_invalid(self):
    # An iframe whose src host is not on the whitelist is removed,
    # leaving only the enclosing (now empty) div.
    sanitizer = Cleaner(frames=False, host_whitelist=["example.com"])
    markup = '<div><iframe src="https://evil.com/page"></div>'
    cleaned = sanitizer.clean_html(markup)
    self.assertEqual('<div></div>', cleaned)

def test_host_whitelist_sneaky_userinfo(self):
    # Regression test: in the URL below, "example.com:" sits before the
    # "@" and is therefore userinfo; the host being contacted is evil.com.
    # The whitelisted name appearing in userinfo must not get the iframe
    # through.
    sanitizer = Cleaner(frames=False, host_whitelist=["example.com"])
    markup = '<div><iframe src="https://example.com:@evil.com/page"></div>'
    cleaned = sanitizer.clean_html(markup)
    self.assertEqual('<div></div>', cleaned)


def test_suite():
suite = unittest.TestSuite()
Expand Down

0 comments on commit 089d436

Please sign in to comment.