[gelbooru] re-enable API use (closes #56)

Gelbooru's API allows access to all images and is not restricted
to the first 20000.

This also adds an option to select between API use and manual
information extraction in case their API gets disabled again.
This commit is contained in:
Mike Fährmann 2017-12-21 21:42:40 +01:00
parent 8102aae311
commit d0886f411e
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 69 additions and 15 deletions

View File

@ -441,6 +441,18 @@ Description Sets the maximum allowed size for downloaded images.
=========== =====
extractor.gelbooru.api
----------------------
=========== =====
Type ``bool``
Default ``true``
Description Enable use of Gelbooru's API.
Set this value to ``false`` if the API has been disabled to switch
to manual information extraction.
=========== =====
extractor.gfycat.format
-----------------------
=========== =====

View File

@ -70,7 +70,8 @@
},
"gelbooru":
{
"filename": "{category}_{id:>07}_{md5}.{extension}"
"filename": "{category}_{id:>07}_{md5}.{extension}",
"api": true
},
"reddit":
{

View File

@ -9,7 +9,8 @@
"""Extract images from https://gelbooru.com/"""
from .common import SharedConfigExtractor, Message
from .. import text, util
from .. import text, util, exception
import xml.etree.ElementTree as ET
class GelbooruExtractor(SharedConfigExtractor):
@ -17,19 +18,26 @@ class GelbooruExtractor(SharedConfigExtractor):
basecategory = "booru"
category = "gelbooru"
filename_fmt = "{category}_{id}_{md5}.{extension}"
api_url = "https://gelbooru.com/index.php?page=dapi&s=post&q=index"
def __init__(self):
SharedConfigExtractor.__init__(self)
# offset handed to util.advance() in items(); increased by skip()
self.start_post = 0
# 'api' config option; defaults to True when the option is not set
self.use_api = self.config("api", True)
if self.use_api:
# rebind on the instance so that get_post_data() calls made by
# items() go to the API-based variant instead
self.get_post_data = self.get_post_data_api
def items(self):
yield Message.Version, 1
yield Message.Directory, self.get_metadata()
for post_id in util.advance(self.get_posts(), self.start_post):
data = self.get_post_data(post_id)
url = data["file_url"]
yield Message.Url, url, text.nameext_from_url(url, data)
for post in util.advance(self.get_posts(), self.start_post):
if isinstance(post, str):
post = self.get_post_data(post)
for key in ("id", "width", "height", "score", "change"):
post[key] = util.safe_int(post[key])
url = post["file_url"]
yield Message.Url, url, text.nameext_from_url(url, post)
def skip(self, num):
self.start_post += num
@ -40,7 +48,7 @@ class GelbooruExtractor(SharedConfigExtractor):
return {}
def get_posts(self):
"""Return an iterable containing all relevant post ids"""
"""Return an iterable containing all relevant post objects"""
def get_post_data(self, post_id):
"""Extract metadata of a single post"""
@ -58,14 +66,20 @@ class GelbooruExtractor(SharedConfigExtractor):
(None , '<li>Score: ', ''),
("score" , '>', '<'),
("file_url" , '<li><a href="http', '"'),
("change" , ' id="lupdated" value="', '"'),
))[0]
data["file_url"] = "http" + data["file_url"]
data["file_url"] = "http" + data["file_url"].replace("m//", "m/", 1)
data["md5"] = data["file_url"].rpartition("/")[2].partition(".")[0]
data["rating"] = (data["rating"] or "?")[0].lower()
for key in ("id", "width", "height", "score"):
data[key] = util.safe_int(data[key])
data["tags"] = " ".join(
[tag.replace(" ", "_") for tag in data["tags"].split(", ")])
return data
def get_post_data_api(self, post_id):
"""Request metadata of a single post from Gelbooru's API"""
# 'post_id' has to be a string: it is concatenated into the query URL.
# The result is the attribute mapping of the first child element of
# the XML response.
return ET.fromstring(
self.request(self.api_url + "&id=" + post_id).text)[0].attrib
class GelbooruTagExtractor(GelbooruExtractor):
"""Extractor for images from gelbooru.com based on search-tags"""
@ -73,14 +87,20 @@ class GelbooruTagExtractor(GelbooruExtractor):
directory_fmt = ["{category}", "{tags}"]
pattern = [r"(?:https?://)?(?:www\.)?gelbooru\.com/(?:index\.php)?"
r"\?page=post&s=list&tags=([^&]+)"]
test = [("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
"count": 5,
})]
per_page = 42
test = [
("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
"count": 5,
}),
("https://gelbooru.com/index.php?page=post&s=list&tags=bonocho", {
"options": (("api", False),),
"count": 5,
}),
]
def __init__(self, match):
GelbooruExtractor.__init__(self)
# group 1 of 'pattern' is the value of the URL's 'tags' parameter;
# '+' is turned into a space before percent-decoding
self.tags = text.unquote(match.group(1).replace("+", " "))
# page size: 100 is sent as 'limit' with API requests,
# 42 is the 'pid' step used by manual extraction
self.per_page = 100 if self.use_api else 42
self.start_page = 0
def skip(self, num):
@ -93,8 +113,26 @@ class GelbooruTagExtractor(GelbooruExtractor):
return {"tags": self.tags}
def get_posts(self):
# Dispatch on the 'api' option stored in self.use_api.
# The API variant yields attribute mappings (item.attrib); items()
# resolves any plain 'str' entries through get_post_data().
if self.use_api:
return self._get_posts_api()
return self._get_posts_manual()
def _get_posts_api(self):
# Generator: requests one result page after another from 'api_url' and
# yields the attribute mapping of every child element of each XML response
params = {
# 'pid' is page-id; first page has index 0
"tags": self.tags, "limit": self.per_page, "pid": self.start_page}
while True:
root = ET.fromstring(
self.request(self.api_url, params=params).text)
for item in root:
yield item.attrib
# a page holding fewer than 'limit' entries is taken to be the last one
if len(root) < self.per_page:
return
params["pid"] += 1
def _get_posts_manual(self):
url = "https://gelbooru.com/index.php?page=post&s=list"
# values for 'pid' must be multiples of 42
# 'pid' is post-id; values for 'pid' must be multiples of 42
params = {"tags": self.tags, "pid": self.start_page * self.per_page}
while True:
@ -127,6 +165,9 @@ class GelbooruPoolExtractor(GelbooruExtractor):
name, pos = text.extract(page, "<h3>Now Viewing: ", "</h3>")
self.posts = list(text.extract_iter(page, 'id="p', '"', pos))
if not name:
raise exception.NotFoundError("pool")
return {
"pool": util.safe_int(self.pool_id),
"pool_name": text.unescape(name),