2015-04-11 00:16:17 +02:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2019-02-04 13:46:02 +01:00
|
|
|
# Copyright 2015-2019 Mike Fährmann
|
2015-04-11 00:16:17 +02:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
|
|
|
"""Base classes for extractors for danbooru and co"""
|
|
|
|
|
2019-02-04 13:46:02 +01:00
|
|
|
from .common import Extractor, Message, SharedConfigMixin
|
2018-07-03 22:41:31 +02:00
|
|
|
from .. import text, exception
|
2018-01-03 23:52:01 +01:00
|
|
|
from xml.etree import ElementTree
|
2018-07-01 22:28:52 +02:00
|
|
|
import collections
|
2017-08-24 21:24:51 +02:00
|
|
|
import datetime
|
2017-09-06 12:31:42 +02:00
|
|
|
import operator
|
2018-07-01 22:28:52 +02:00
|
|
|
import re
|
2015-04-11 00:16:17 +02:00
|
|
|
|
2017-02-01 00:53:19 +01:00
|
|
|
|
2019-02-04 13:46:02 +01:00
|
|
|
class BooruExtractor(SharedConfigMixin, Extractor):
    """Common base for extractors targeting booru-style image boards"""
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    # URL queried by items() for every result page
    api_url = ""
    # format string for a post's HTML page; consumed by extended_tags()
    post_url = ""
    # a page with fewer results than this ends the pagination in items()
    per_page = 50
    # value of the first page requested (see reset_page)
    page_start = 1
    # upper bound applied by skip(); a falsy value disables the check
    page_limit = None
    # whether parse_response() orders results by score and id
    sort = False

    def __init__(self, match):
        """Set up empty request parameters and the extended-tags switch"""
        super().__init__(match)
        self.params = {}

        # only consult the 'tags' option when a post URL is available;
        # otherwise keep the (falsy) post_url value itself
        enabled = self.post_url
        if enabled:
            enabled = self.config("tags", False)
        self.extags = enabled

    def skip(self, num):
        """Skip whole pages covering up to 'num' items

        Moves 'page_start' forward and returns the number of items
        that were actually skipped this way.
        """
        full_pages = num // self.per_page
        limit = self.page_limit
        if limit and self.page_start + full_pages > limit:
            # never move past the highest allowed page
            full_pages = limit - self.page_start
        self.page_start += full_pages
        return self.per_page * full_pages

    def items(self):
        """Yield version, directory and one URL message per post"""
        metadata = self.get_metadata()

        yield Message.Version, 1
        yield Message.Directory, metadata

        self.reset_page()
        while True:
            response = self.request(self.api_url, params=self.params)
            posts = self.parse_response(response)

            for post in posts:
                try:
                    url = post["file_url"]
                except KeyError:
                    # entries without a file URL are ignored
                    continue
                if url.startswith("/"):
                    url = text.urljoin(self.api_url, url)
                post.update(metadata)
                if self.extags:
                    self.extended_tags(post)
                yield Message.Url, url, text.nameext_from_url(url, post)

            # a page that is not completely filled is the last one
            if self.per_page > len(posts):
                return
            # the last post of this page determines the next request
            self.update_page(post)

    def reset_page(self):
        """Initialize params to point to the first page"""
        self.params["page"] = self.page_start

    def update_page(self, data):
        """Update params to point to the next page"""

    def parse_response(self, response):
        """Decode a JSON API response and sort it if 'sort' is set"""
        posts = response.json()
        if self.sort:
            by_score_and_id = operator.itemgetter("score", "id")
            posts.sort(key=by_score_and_id, reverse=True)
        return posts

    def get_metadata(self):
        """Collect metadata for extractor-job"""
        return dict()

    def extended_tags(self, image, page=None):
        """Add 'tags_<type>' entries to 'image', grouped by tag type

        The post's HTML page is requested via 'post_url' unless its
        content is passed in as 'page'.
        """
        if not page:
            post_page = self.post_url.format(image["id"])
            page = self.request(post_page).text

        # the first element of the extract() result is the text between
        # the two markers; a falsy result is treated as empty below
        tags_html = text.extract(page, '<ul id="tag-', '</ul>')[0]

        grouped = {}
        matches = re.findall(
            r"tag-type-([^\"' ]+).*?[?;]tags=([^\"']+)",
            tags_html or "", re.S)
        for tag_type, tag_name in matches:
            grouped.setdefault(tag_type, []).append(text.unquote(tag_name))

        for tag_type, names in grouped.items():
            image["tags_" + tag_type] = " ".join(names)
|
2018-06-29 19:38:53 +02:00
|
|
|
|
2015-04-11 00:16:17 +02:00
|
|
|
|
2018-01-03 23:52:01 +01:00
|
|
|
class XmlParserMixin():
    """Mixin that decodes API responses delivered as XML"""

    def parse_response(self, response):
        """Return the attribute dict of every child of the XML root"""
        return [
            element.attrib
            for element in ElementTree.fromstring(response.text)
        ]
|
2018-01-03 23:52:01 +01:00
|
|
|
|
|
|
|
|
|
|
|
class DanbooruPageMixin():
    """Danbooru v2 style pagination"""

    def update_page(self, data):
        """Set the 'page' parameter to 'b' followed by the id of 'data'"""
        post_id = data["id"]
        self.params["page"] = "b" + str(post_id)
|
|
|
|
|
|
|
|
|
|
|
|
class MoebooruPageMixin():
    """Moebooru and Danbooru v1 style pagination"""

    def update_page(self, data):
        """Advance to the next page

        Without a page limit the 'page' counter is incremented;
        with one, 'page' is cleared and 'before_id' is set to the
        id of 'data' instead.
        """
        if not self.page_limit:
            self.params["page"] += 1
            return

        self.params["page"] = None
        self.params["before_id"] = data["id"]
|
|
|
|
|
|
|
|
|
|
|
|
class GelbooruPageMixin():
    """Gelbooru style pagination using the 'pid' parameter"""
    # page counting begins at zero here
    page_start = 0

    def reset_page(self):
        """Point the 'pid' parameter at the first page"""
        first_page = self.page_start
        self.params["pid"] = first_page

    def update_page(self, data):
        """Move the 'pid' parameter one page forward; 'data' is unused"""
        self.params["pid"] = self.params["pid"] + 1
|
2015-11-21 00:54:29 +01:00
|
|
|
|
|
|
|
|
2018-01-03 23:52:01 +01:00
|
|
|
class TagMixin():
    """Mixin for extracting the images matching a tag search"""
    subcategory = "tag"
    directory_fmt = ("{category}", "{search_tags}")
    archive_fmt = "t_{search_tags}_{id}"

    def __init__(self, match):
        """Read the 'tags' group of 'match' and prepare the query params"""
        super().__init__(match)

        # '+' separates tags in the URL; turn it into spaces before
        # unquoting the remaining escapes
        raw_tags = match.group("tags").replace("+", " ")
        self.tags = text.unquote(raw_tags)

        self.params["tags"] = self.tags
        self.params["limit"] = self.per_page

    def get_metadata(self):
        """Return the search tags as job metadata"""
        metadata = {}
        metadata["search_tags"] = self.tags
        return metadata
|
2015-11-21 00:54:29 +01:00
|
|
|
|
|
|
|
|
2018-01-03 23:52:01 +01:00
|
|
|
class PoolMixin():
    """Mixin for extracting the images of a pool"""
    subcategory = "pool"
    directory_fmt = ("{category}", "pool", "{pool}")
    archive_fmt = "p_{pool}_{id}"

    def __init__(self, match):
        """Read the 'pool' group of 'match' and prepare the query params"""
        super().__init__(match)

        pool_id = match.group("pool")
        self.pool = pool_id

        self.params["tags"] = "pool:" + pool_id
        self.params["limit"] = self.per_page

    def get_metadata(self):
        """Return the pool id, passed through text.parse_int, as metadata"""
        pool_number = text.parse_int(self.pool)
        return {"pool": pool_number}
|
|
|
|
|
|
|
|
|
|
|
|
class GelbooruPoolMixin(PoolMixin):
    """Pool extraction for Gelbooru-like sites"""
    # each API request asks for exactly one post (see update_page)
    per_page = 1

    def get_metadata(self):
        """Fetch the pool page and collect its name and post ids

        Stores the post ids found on the page in 'self.posts'.
        Raises exception.NotFoundError if no pool name can be found.
        """
        page = self.request(self.pool_url.format(self.pool)).text

        # try the known heading variants in order until one yields a name
        headings = (
            ("<h3>Now Viewing: ", "</h3>"),
            ("<h4>Pool: ", "</h4>"),
        )
        for begin, end in headings:
            name, pos = text.extract(page, begin, end)
            if name:
                break
        else:
            raise exception.NotFoundError("pool")

        # post ids are searched for starting at the position
        # returned together with the pool name
        self.posts = list(text.extract_iter(page, 'id="p', '"', pos))

        metadata = {}
        metadata["pool"] = text.parse_int(self.pool)
        metadata["pool_name"] = text.unescape(name)
        metadata["count"] = len(self.posts)
        return metadata

    def reset_page(self):
        """Start at index 'page_start' of the post list"""
        self.index = self.page_start
        self.update_page(None)

    def update_page(self, data):
        """Select the next post id as 'tags' parameter; 'data' is unused"""
        try:
            post = self.posts[self.index]
        except IndexError:
            # no posts left: fall back to the id "0"
            post = "0"
        else:
            self.index += 1
        self.params["tags"] = "id:" + post
|
2015-11-21 00:54:29 +01:00
|
|
|
|
|
|
|
|
2018-01-03 23:52:01 +01:00
|
|
|
class PostMixin():
    """Mixin for extracting one single post"""
    subcategory = "post"
    archive_fmt = "{id}"

    def __init__(self, match):
        """Read the 'post' group of 'match' and query for that id"""
        super().__init__(match)
        post_id = match.group("post")
        self.post = post_id
        self.params["tags"] = "id:" + post_id
|
2017-08-24 21:24:51 +02:00
|
|
|
|
|
|
|
|
2018-01-03 23:52:01 +01:00
|
|
|
class PopularMixin():
    """Extraction and metadata handling for Danbooru v2"""
    subcategory = "popular"
    directory_fmt = ("{category}", "popular", "{scale}", "{date}")
    archive_fmt = "P_{scale[0]}_{date}_{id}"
    page_start = None
    # results are ordered by score and id when parsed
    sort = True

    def __init__(self, match):
        """Merge the parsed 'query' group of 'match' into the params"""
        super().__init__(match)
        self.params.update(text.parse_query(match.group("query")))

    def get_metadata(self, fmt="%Y-%m-%d"):
        """Return the normalized 'date' and 'scale' of the listing

        'date' defaults to the current UTC date and 'scale' to "day".
        For the "week" scale the date is moved back to the Monday of
        its week; for "month" its last three characters are dropped
        (the "-DD" part with the default 'fmt').
        """
        date = self.get_date()
        if not date:
            # datetime.utcnow() is deprecated since Python 3.12; take an
            # aware UTC timestamp and strip its tzinfo so the formatted
            # result stays the same for any 'fmt'
            now = datetime.datetime.now(datetime.timezone.utc)
            date = now.replace(tzinfo=None).strftime(fmt)
        scale = self.get_scale() or "day"

        if scale == "week":
            dt = datetime.datetime.strptime(date, fmt)
            # weekday() is 0 for Monday
            dt -= datetime.timedelta(days=dt.weekday())
            date = dt.strftime(fmt)
        elif scale == "month":
            date = date[:-3]

        return {"date": date, "scale": scale}

    def get_scale(self):
        """Return the 'scale' query parameter or None if it is missing"""
        if "scale" in self.params:
            return self.params["scale"]
        return None

    def get_date(self):
        """Return the first 10 characters of the 'date' query parameter

        Returns None if there is no such parameter.
        """
        if "date" in self.params:
            return self.params["date"][:10]
        return None
|
|
|
|
|
|
|
|
|
|
|
|
class MoebooruPopularMixin(PopularMixin):
    """Popular-listing handling for Moebooru and Danbooru v1"""

    def __init__(self, match):
        """Additionally remember the 'scale' group of 'match'"""
        super().__init__(match)
        self.scale = match.group("scale")

    def get_date(self):
        """Build a YYYY-MM-DD date from the year/month/day parameters

        Missing 'month' or 'day' values default to "01".
        Returns None if there is no 'year' parameter.
        """
        params = self.params
        if "year" not in params:
            return None

        year = params["year"]
        month = params.get("month", "01")
        day = params.get("day", "01")
        # right-align and zero-pad each component
        return "{:>04}-{:>02}-{:>02}".format(year, month, day)

    def get_scale(self):
        """Return the scale without a leading "by_" prefix"""
        scale = self.scale
        if not scale or not scale.startswith("by_"):
            return scale
        return scale[3:]
|