[zerochan] add 'metadata' option (#2861)

This commit is contained in:
Mike Fährmann 2022-09-01 21:44:22 +02:00
parent 9745b48830
commit 3cb8327c60
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 56 additions and 14 deletions

View File

@ -2797,6 +2797,18 @@ Description
Location of a youtube-dl configuration file to load options from. Location of a youtube-dl configuration file to load options from.
extractor.zerochan.metadata
---------------------------
Type
``bool``
Default
``false``
Description
Extract additional metadata (date, md5, tags, ...)
Note: This requires 1-2 additional HTTP requests for each post.
extractor.[booru].tags extractor.[booru].tags
---------------------- ----------------------
Type Type

View File

@ -341,7 +341,8 @@
"zerochan": "zerochan":
{ {
"username": null, "username": null,
"password": null "password": null,
"metadata": false
}, },
"booru": "booru":
{ {

View File

@ -11,6 +11,8 @@
from .booru import BooruExtractor from .booru import BooruExtractor
from ..cache import cache from ..cache import cache
from .. import text, exception from .. import text, exception
from xml.etree import ElementTree
BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net" BASE_PATTERN = r"(?:https?://)?(?:www\.)?zerochan\.net"
@ -54,7 +56,7 @@ class ZerochanExtractor(BooruExtractor):
return response.cookies return response.cookies
def _parse_entry_page(self, entry_id): def _parse_entry_html(self, entry_id):
url = "{}/{}".format(self.root, entry_id) url = "{}/{}".format(self.root, entry_id)
extr = text.extract_from(self.request(url).text) extr = text.extract_from(self.request(url).text)
@ -66,10 +68,26 @@ class ZerochanExtractor(BooruExtractor):
'"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"), '"datePublished": "', '"'), "%a %b %d %H:%M:%S %Y"),
"width" : extr('"width": "', ' '), "width" : extr('"width": "', ' '),
"height": extr('"height": "', ' '), "height": extr('"height": "', ' '),
"size" : extr('"contentSize": "', 'B'), "size" : text.parse_bytes(extr('"contentSize": "', 'B')),
"path" : text.split_html(extr( "path" : text.split_html(extr(
'class="breadcrumbs', '</p>'))[3::2], 'class="breadcrumbs', '</p>'))[3::2],
"tags" : extr('alt="Tags: ', '"').split(", ") "tags" : extr('alt="Tags: Anime, ', '"').split(", ")
}
def _parse_entry_xml(self, entry_id):
    """Fetch the '?xml' view of a post and extract metadata from it.

    Requests '<root>/<entry_id>?xml', parses the response body as XML
    and reads values from the selected element by child position.

    Parameters:
        entry_id: ID of the post; interpolated into the request URL.

    Returns:
        dict with the keys
        'name' - text of child 2,
        'tags' - list of tags from the ', '-separated text of child 5
                 (empty list if that element has no text),
        'md5'  - text of child 6.

    Raises:
        xml.etree.ElementTree.ParseError if the response is not
        well-formed XML; IndexError if the document has fewer
        children than the positions accessed here.
    """
    url = "{}/{}?xml".format(self.root, entry_id)

    # first child of the document root, then the last child of that
    item = ElementTree.fromstring(self.request(url).text)[0][-1]

    # NOTE(review): child 4 apparently carries 'url', 'width', 'height'
    # and 'filesize' attributes (per the disabled code that used to sit
    # here); they are intentionally not returned - confirm against the
    # actual feed before relying on them.

    # ElementTree sets '.text' to None for elements without content,
    # so guard against it instead of failing with an AttributeError
    tags = (item[5].text or "").lstrip()

    return {
        "name" : item[2].text,
        "tags" : tags.split(", ") if tags else [],
        "md5"  : item[6].text,
    }
@ -105,6 +123,7 @@ class ZerochanTagExtractor(ZerochanExtractor):
url = self.root + "/" + self.search_tag url = self.root + "/" + self.search_tag
params = text.parse_query(self.query) params = text.parse_query(self.query)
params["p"] = text.parse_int(params.get("p"), 1) params["p"] = text.parse_int(params.get("p"), 1)
metadata = self.config("metadata")
while True: while True:
page = self.request(url, params=params).text page = self.request(url, params=params).text
@ -115,15 +134,22 @@ class ZerochanTagExtractor(ZerochanExtractor):
post = extr('<li class="', '>') post = extr('<li class="', '>')
if not post: if not post:
break break
yield {
"id" : extr('href="/', '"'), if metadata:
"name" : extr('alt="', '"'), entry_id = extr('href="/', '"')
"width" : extr('title="', 'x'), post = self._parse_entry_html(entry_id)
"height": extr('', ' '), post.update(self._parse_entry_xml(entry_id))
"size" : extr('', 'B'), yield post
"file_url": "https://static." + extr( else:
'<a href="https://static.', '"'), yield {
} "id" : extr('href="/', '"'),
"name" : extr('alt="', '"'),
"width" : extr('title="', 'x'),
"height": extr('', ' '),
"size" : extr('', 'B'),
"file_url": "https://static." + extr(
'<a href="https://static.', '"'),
}
if 'rel="next"' not in page: if 'rel="next"' not in page:
break break
@ -153,4 +179,7 @@ class ZerochanImageExtractor(ZerochanExtractor):
self.image_id = match.group(1) self.image_id = match.group(1)
def posts(self): def posts(self):
return (self._parse_entry_page(self.image_id),) post = self._parse_entry_html(self.image_id)
if self.config("metadata"):
post.update(self._parse_entry_xml(self.image_id))
return (post,)