[tumblr] attempt to fetch high-quality inline images (#2877)

* [tumblr] attempt to fetch high-quality images (again)

Fixes #1846, and fixes #1344

* slight refactor

* update configuration.rst entry
This commit is contained in:
blankie 2022-08-31 08:53:50 +00:00 committed by GitHub
parent daef91c925
commit 9745b48830
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 12 deletions

View File

@ -2266,10 +2266,11 @@ Type
Default
``true``
Description
Download full-resolution ``photo`` images.
Download full-resolution ``photo`` and ``inline`` images.
For each photo with "maximum" resolution
(width equal to 2048 or height equal to 3072),
(width equal to 2048 or height equal to 3072)
or each inline image,
use an extra HTTP request to find the URL to its full-resolution version.

View File

@ -14,14 +14,6 @@ from datetime import datetime, timedelta
import re
def _original_inline_image(url):
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)
def _original_video(url):
return re.sub(
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
@ -141,7 +133,7 @@ class TumblrExtractor(Extractor):
# API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
for url in re.findall('<img src="([^"]+)"', body):
url = _original_inline_image(url)
url = self._original_inline_image(url)
posts.append(self._prepare_image(url, post.copy()))
for url in re.findall('<source src="([^"]+)"', body):
url = _original_video(url)
@ -221,7 +213,21 @@ class TumblrExtractor(Extractor):
return self.blog != post.get("reblogged_root_uuid")
def _original_image(self, url):
url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
return self._update_image_token(
url.replace("/s2048x3072/", "/s99999x99999/", 1))
def _original_inline_image(self, url):
if self.original:
url, n = re.subn(r"/s\d+x\d+/", "/s99999x99999/", url, 1)
if n:
return self._update_image_token(url)
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)
def _update_image_token(self, url):
    """Request `url` and return the image address found in the response.

    The request is sent with an ``Accept`` header that prefers HTML.
    The return value is the first element of what ``text.extract``
    yields for the response text, using '" src="' and '"' as the
    delimiting markers.
    """
    page = self.request(
        url, headers={"Accept": "text/html,*/*;q=0.8"}).text
    extracted = text.extract(page, '" src="', '"')
    return extracted[0]
@ -305,6 +311,14 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0,
}),
("https://kichatundk.tumblr.com/post/654953419288821760", {
"count": 2, # high-quality images (#1846)
"content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
}),
("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
"count": 2, # high-quality images (#1344)
"content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
}),
("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6"