[booru] split '_prepare_post()'

This commit is contained in:
Mike Fährmann 2020-12-24 01:04:44 +01:00
parent 53222445d5
commit e41e2be2f9
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 28 additions and 25 deletions

View File

@@ -13,6 +13,7 @@ from .. import text, util, exception
from xml.etree import ElementTree
import collections
import operator
import re
@@ -25,19 +26,25 @@ class BooruExtractor(Extractor):
def items(self):
self.login()
extended_tags = self.config("tags", False)
data = self.metadata()
tags = self.config("tags", False)
for post in self.posts():
try:
url = self._prepare_post(post, extended_tags)
url = self._file_url(post)
if url[0] == "/":
url = self.root + url
except (KeyError, TypeError):
self.log.debug("Unable to fetch download URL for post %s "
"(md5: %s)", post.get("id"), post.get("md5"))
continue
if tags:
self._extended_tags(post)
self._prepare(post)
post.update(data)
text.nameext_from_url(url, post)
yield Message.Directory, post
yield Message.Url, url, post
@@ -57,17 +64,14 @@ class BooruExtractor(Extractor):
"""Return an iterable with post objects"""
return ()
def _prepare_post(self, post, extended_tags=False):
url = post["file_url"]
if url[0] == "/":
url = self.root + url
if extended_tags:
self._fetch_extended_tags(post)
_file_url = operator.itemgetter("file_url")
@staticmethod
def _prepare(post):
post["date"] = text.parse_datetime(
post["created_at"], "%a %b %d %H:%M:%S %z %Y")
return url
def _fetch_extended_tags(self, post, page=None):
def _extended_tags(self, post, page=None):
if not page:
url = "{}/index.php?page=post&s=view&id={}".format(
self.root, post["id"])

View File

@@ -17,11 +17,12 @@ class GelbooruBase():
category = "gelbooru"
root = "https://gelbooru.com"
def _prepare_post(self, post, extended_tags=False):
url = booru.BooruExtractor._prepare_post(self, post, extended_tags)
if url.startswith("https://mp4.gelbooru.com/"):
@staticmethod
def _file_url(post):
url = post["file_url"]
if url.startswith(("https://mp4.gelbooru.com/", "https://video-cdn")):
md5 = post["md5"]
return "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
url = "https://img2.gelbooru.com/images/{}/{}/{}.webm".format(
md5[0:2], md5[2:4], md5)
return url

View File

@@ -23,14 +23,11 @@ class MoebooruExtractor(BooruExtractor):
filename_fmt = "{category}_{id}_{md5}.{extension}"
page_start = 1
def _prepare_post(self, post, extended_tags=False):
url = post["file_url"]
if extended_tags:
self._fetch_extended_tags(post)
@staticmethod
def _prepare(post):
post["date"] = text.parse_timestamp(post["created_at"])
return url
def _fetch_extended_tags(self, post):
def _extended_tags(self, post):
url = "{}/post/show/{}".format(self.root, post["id"])
page = self.request(url).text
html = text.extract(page, '<ul id="tag-', '</ul>')[0]

View File

@@ -41,20 +41,21 @@ class SankakuExtractor(BooruExtractor):
def skip(self, num):
return 0
def _prepare_post(self, post, extended_tags=False):
def _file_url(self, post):
url = post["file_url"]
if not url and self._warning:
self.log.warning(
"Login required to download 'contentious_content' posts")
SankakuExtractor._warning = False
if extended_tags:
self._fetch_extended_tags(post)
return url
@staticmethod
def _prepare(post):
post["created_at"] = post["created_at"]["s"]
post["date"] = text.parse_timestamp(post["created_at"])
post["tags"] = [tag["name"] for tag in post["tags"]]
return url
def _fetch_extended_tags(self, post):
def _extended_tags(self, post):
tags = collections.defaultdict(list)
types = self.TAG_TYPES
for tag in post["tags"]: