add 'text.ensure_http_scheme()'

This commit is contained in:
Mike Fährmann 2020-05-19 21:25:07 +02:00
parent 4df2cadf60
commit 6294e2c540
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
5 changed files with 39 additions and 8 deletions

View File

@ -126,8 +126,9 @@ class DeviantartExtractor(Extractor):
if self.extra: if self.extra:
for match in DeviantartStashExtractor.pattern.finditer( for match in DeviantartStashExtractor.pattern.finditer(
deviation.get("description", "")): deviation.get("description", "")):
url = text.ensure_http_scheme(match.group(0))
deviation["_extractor"] = DeviantartStashExtractor deviation["_extractor"] = DeviantartStashExtractor
yield Message.Queue, match.group(0), deviation yield Message.Queue, url, deviation
def deviations(self): def deviations(self):
"""Return an iterable containing all relevant Deviation-objects""" """Return an iterable containing all relevant Deviation-objects"""

View File

@ -224,10 +224,7 @@ class NewgroundsImageExtractor(NewgroundsExtractor):
self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format( self.post_url = "https://www.newgrounds.com/art/view/{}/{}".format(
self.user, match.group(3)) self.user, match.group(3))
else: else:
url = match.group(0) self.post_url = text.ensure_http_scheme(match.group(0))
if not url.startswith("http"):
url = "https://" + url
self.post_url = url
def posts(self): def posts(self):
return (self.post_url,) return (self.post_url,)
@ -414,6 +411,6 @@ class NewgroundsFollowingExtractor(NewgroundsFavoriteExtractor):
@staticmethod @staticmethod
def _extract_favorites(page): def _extract_favorites(page):
return [ return [
"https://" + user.rpartition('"')[2].lstrip("/:") text.ensure_http_scheme(user.rpartition('"')[2])
for user in text.extract_iter(page, 'class="item-user', '"><img') for user in text.extract_iter(page, 'class="item-user', '"><img')
] ]

View File

@ -98,8 +98,7 @@ class PatreonExtractor(Extractor):
headers = {"Referer": self.root} headers = {"Referer": self.root}
while url: while url:
if not url.startswith("http"): url = text.ensure_http_scheme(url)
url = "https://" + url.lstrip("/:")
posts = self.request(url, headers=headers).json() posts = self.request(url, headers=headers).json()
if "included" in posts: if "included" in posts:

View File

@ -60,6 +60,13 @@ def split_html(txt, sep=None):
return [] return []
def ensure_http_scheme(url, scheme="https://"):
    """Prepend 'scheme' to 'url' if it doesn't have one

    Only "http://" and "https://" are recognized as an existing scheme.
    They are matched case-insensitively, so a URL like "HTTP://example.org"
    is returned unchanged instead of getting a second scheme prepended.

    Any leading '/' and ':' characters (as in protocol-relative "//host/path"
    or a dangling "://host/path") are stripped before 'scheme' is prepended.

    Falsy values ("", None, empty containers) are returned unchanged.
    """
    if not url:
        return url
    # The longest recognized prefix ("https://") is 8 characters long, so
    # lowercasing just that slice is enough for a case-insensitive check.
    if url[:8].lower().startswith(("https://", "http://")):
        return url
    return scheme + url.lstrip("/:")
def filename_from_url(url): def filename_from_url(url):
"""Extract the last part of an URL to use as a filename""" """Extract the last part of an URL to use as a filename"""
try: try:

View File

@ -94,6 +94,33 @@ class TestText(unittest.TestCase):
for value in INVALID: for value in INVALID:
self.assertEqual(f(value), empty) self.assertEqual(f(value), empty)
def test_ensure_http_scheme(self, f=text.ensure_http_scheme):
    # Verify that ensure_http_scheme() adds a scheme only where one is
    # missing and leaves everything else untouched.
    expected = "https://example.org/filename.ext"

    # an empty string stays empty
    self.assertEqual(f(""), "")

    # scheme is missing; leading '/' and ':' characters get stripped
    without_scheme = (
        "example.org/filename.ext",
        "/example.org/filename.ext",
        "//example.org/filename.ext",
        "://example.org/filename.ext",
    )
    for url in without_scheme:
        self.assertEqual(f(url), expected)

    # URLs that already carry an HTTP(S) scheme are returned unchanged
    with_scheme = (
        expected,
        "http://example.org/filename.ext",
    )
    for url in with_scheme:
        self.assertEqual(f(url), url)

    # only "http://" and "https://" count as a scheme; a misspelled one
    # is treated as part of the URL and the default scheme is prepended
    self.assertEqual(
        f("htp://example.org/filename.ext"),
        "https://htp://example.org/filename.ext",
    )

    # invalid arguments are handed back as they are
    for value in INVALID_ALT:
        self.assertEqual(f(value), value)
def test_filename_from_url(self, f=text.filename_from_url): def test_filename_from_url(self, f=text.filename_from_url):
result = "filename.ext" result = "filename.ext"