[tumblr] attempt to fetch high-quality inline images (#2877)
* [tumblr] attempt to fetch high-quality images (again)
  Fixes #1846, and fixes #1344
* slight refactor
* update configuration.rst entry
This commit is contained in:
parent
daef91c925
commit
9745b48830
@ -2266,10 +2266,11 @@ Type
|
|||||||
Default
|
Default
|
||||||
``true``
|
``true``
|
||||||
Description
|
Description
|
||||||
Download full-resolution ``photo`` images.
|
Download full-resolution ``photo`` and ``inline`` images.
|
||||||
|
|
||||||
For each photo with "maximum" resolution
|
For each photo with "maximum" resolution
|
||||||
(width equal to 2048 or height equal to 3072),
|
(width equal to 2048 or height equal to 3072)
|
||||||
|
or each inline image,
|
||||||
use an extra HTTP request to find the URL to its full-resolution version.
|
use an extra HTTP request to find the URL to its full-resolution version.
|
||||||
|
|
||||||
|
|
||||||
|
@ -14,14 +14,6 @@ from datetime import datetime, timedelta
|
|||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
def _original_inline_image(url):
|
|
||||||
return re.sub(
|
|
||||||
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
|
|
||||||
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
|
|
||||||
r"https://\1_1280.\2", url
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _original_video(url):
|
def _original_video(url):
|
||||||
return re.sub(
|
return re.sub(
|
||||||
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
|
(r"https?://((?:vt|vtt|ve)(?:\.media)?\.tumblr\.com"
|
||||||
@ -141,7 +133,7 @@ class TumblrExtractor(Extractor):
|
|||||||
# API response, but they can't contain images/videos anyway
|
# API response, but they can't contain images/videos anyway
|
||||||
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
body = post["reblog"]["comment"] + post["reblog"]["tree_html"]
|
||||||
for url in re.findall('<img src="([^"]+)"', body):
|
for url in re.findall('<img src="([^"]+)"', body):
|
||||||
url = _original_inline_image(url)
|
url = self._original_inline_image(url)
|
||||||
posts.append(self._prepare_image(url, post.copy()))
|
posts.append(self._prepare_image(url, post.copy()))
|
||||||
for url in re.findall('<source src="([^"]+)"', body):
|
for url in re.findall('<source src="([^"]+)"', body):
|
||||||
url = _original_video(url)
|
url = _original_video(url)
|
||||||
@ -221,7 +213,21 @@ class TumblrExtractor(Extractor):
|
|||||||
return self.blog != post.get("reblogged_root_uuid")
|
return self.blog != post.get("reblogged_root_uuid")
|
||||||
|
|
||||||
def _original_image(self, url):
|
def _original_image(self, url):
|
||||||
url = url.replace("/s2048x3072/", "/s99999x99999/", 1)
|
return self._update_image_token(
|
||||||
|
url.replace("/s2048x3072/", "/s99999x99999/", 1))
|
||||||
|
|
||||||
|
def _original_inline_image(self, url):
|
||||||
|
if self.original:
|
||||||
|
url, n = re.subn(r"/s\d+x\d+/", "/s99999x99999/", url, 1)
|
||||||
|
if n:
|
||||||
|
return self._update_image_token(url)
|
||||||
|
return re.sub(
|
||||||
|
(r"https?://(\d+\.media\.tumblr\.com(?:/[0-9a-f]+)?"
|
||||||
|
r"/tumblr(?:_inline)?_[^_]+)_\d+\.([0-9a-z]+)"),
|
||||||
|
r"https://\1_1280.\2", url
|
||||||
|
)
|
||||||
|
|
||||||
|
def _update_image_token(self, url):
|
||||||
headers = {"Accept": "text/html,*/*;q=0.8"}
|
headers = {"Accept": "text/html,*/*;q=0.8"}
|
||||||
response = self.request(url, headers=headers)
|
response = self.request(url, headers=headers)
|
||||||
return text.extract(response.text, '" src="', '"')[0]
|
return text.extract(response.text, '" src="', '"')[0]
|
||||||
@ -305,6 +311,14 @@ class TumblrPostExtractor(TumblrExtractor):
|
|||||||
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
|
("https://mikf123.tumblr.com/post/181022380064/chat-post", {
|
||||||
"count": 0,
|
"count": 0,
|
||||||
}),
|
}),
|
||||||
|
("https://kichatundk.tumblr.com/post/654953419288821760", {
|
||||||
|
"count": 2, # high-quality images (#1846)
|
||||||
|
"content": "d6fcc7b6f750d835d55c7f31fa3b63be26c9f89b",
|
||||||
|
}),
|
||||||
|
("https://hameru-is-cool.tumblr.com/post/639261855227002880", {
|
||||||
|
"count": 2, # high-quality images (#1344)
|
||||||
|
"content": "6bc19a42787e46e1bba2ef4aeef5ca28fcd3cd34",
|
||||||
|
}),
|
||||||
("https://mikf123.tumblr.com/image/689860196535762944", {
|
("https://mikf123.tumblr.com/image/689860196535762944", {
|
||||||
"pattern": r"^https://\d+\.media\.tumblr\.com"
|
"pattern": r"^https://\d+\.media\.tumblr\.com"
|
||||||
r"/134791621559a79793563b636b5fe2c6"
|
r"/134791621559a79793563b636b5fe2c6"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user