[tumblr] attempt to fetch high-quality inline images (#2877)

* [tumblr] attempt to fetch high-quality images (again)

Fixes #1846, and fixes #1344

* slight refactor

* update configuration.rst entry
This commit is contained in:
blankie 2022-08-31 08:53:50 +00:00 committed by GitHub
parent daef91c925
commit 9745b48830
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 12 deletions

View File

@ -2266,10 +2266,11 @@ Type
Default Default
``true`` ``true``
Description Description
Download full-resolution ``photo`` images. Download full-resolution ``photo`` and ``inline`` images.
For each photo with "maximum" resolution For each photo with "maximum" resolution
(width equal to 2048 or height equal to 3072), (width equal to 2048 or height equal to 3072)
or each inline image,
use an extra HTTP request to find the URL to its full-resolution version. use an extra HTTP request to find the URL to its full-resolution version.

View File

@ -14,14 +14,6 @@ from datetime import datetime, timedelta
import re import re
def _original_inline_image(url):
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)
def _original_video(url): def _original_video(url):
return re.sub( return re.sub(
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com" (r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
@ -141,7 +133,7 @@ class TumblrExtractor(Extractor):
# API response, but they can't contain images/videos anyway # API response, but they can't contain images/videos anyway
body = post["reblog"]["comment"] + post["reblog"]["tree_html"] body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
for url in re.findall('<img src="([^"]+)"', body): for url in re.findall('<img src="([^"]+)"', body):
url = _original_inline_image(url) url = self._original_inline_image(url)
posts.append(self._prepare_image(url, post.copy())) posts.append(self._prepare_image(url, post.copy()))
for url in re.findall('<source src="([^"]+)"', body): for url in re.findall('<source src="([^"]+)"', body):
url = _original_video(url) url = _original_video(url)
@ -221,7 +213,21 @@ class TumblrExtractor(Extractor):
return self.blog != post.get("reblogged_root_uuid") return self.blog != post.get("reblogged_root_uuid")
def _original_image(self, url): def _original_image(self, url):
url = url.replace("/s2048x3072/", "/s99999x99999/", 1) return self._update_image_token(
url.replace("/s2048x3072/", "/s99999x99999/", 1))
def _original_inline_image(self, url):
if self.original:
url, n = re.subn(r"/s\d+x\d+/", "/s99999x99999/", url, 1)
if n:
return self._update_image_token(url)
return re.sub(
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
r"https://\1_1280.\2", url
)
def _update_image_token(self, url):
headers = {"Accept": "text/html,*/*;q=0.8"} headers = {"Accept": "text/html,*/*;q=0.8"}
response = self.request(url, headers=headers) response = self.request(url, headers=headers)
return text.extract(response.text, '" src="', '"')[0] return text.extract(response.text, '" src="', '"')[0]
@ -305,6 +311,14 @@ class TumblrPostExtractor(TumblrExtractor):
("https://mikf123.tumblr.com/post/181022380064/chat-post", { ("https://mikf123.tumblr.com/post/181022380064/chat-post", {
"count": 0, "count": 0,
}), }),
("https://kichatundk.tumblr.com/post/654953419288821760", {
"count": 2, # high-quality images (#1846)
"content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
}),
("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
"count": 2, # high-quality images (#1344)
"content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
}),
("https://mikf123.tumblr.com/image/689860196535762944", { ("https://mikf123.tumblr.com/image/689860196535762944", {
"pattern": r"^https://\d+\.media\.tumblr\.com" "pattern": r"^https://\d+\.media\.tumblr\.com"
r"/134791621559a79793563b636b5fe2c6" r"/134791621559a79793563b636b5fe2c6"