# -*- coding: utf-8 -*-

# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for *booru sites"""

from .common import BaseExtractor, Message
from .. import text
import operator


class BooruExtractor(BaseExtractor):
    """Base class for *booru extractors"""
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    page_start = 0
    per_page = 100

    def items(self):
        self.login()
        data = self.metadata()
        tags = self.config("tags", False)
        notes = self.config("notes", False)

        for post in self.posts():
            try:
                url = self._file_url(post)
                if url[0] == "/":
                    url = self.root + url
            except (KeyError, TypeError):
                self.log.debug("Unable to fetch download URL for post %s "
                               "(md5: %s)", post.get("id"), post.get("md5"))
                continue

            page_html = None
            if tags:
                page_html = self._extended_tags(post)
            if notes:
                self._notes(post, page_html)
            self._prepare(post)
            post.update(data)
            text.nameext_from_url(url, post)

            yield Message.Directory, post
            yield Message.Url, url, post
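
    # A minimal sketch (kept as a comment, since this module only defines the
    # base class) of how a site-specific subclass feeds this pipeline; the
    # "examplebooru" name, URL, and JSON layout are illustrative assumptions,
    # not an actual supported site:
    #
    #   class ExamplebooruExtractor(BooruExtractor):
    #       category = "examplebooru"
    #       root = "https://booru.example.org"
    #
    #       def posts(self):
    #           url = self.root + "/posts.json"
    #           params = {"limit": self.per_page, "page": self.page_start}
    #           while True:
    #               posts = self.request(url, params=params).json()
    #               if not posts:
    #                   return
    #               yield from posts
    #               params["page"] += 1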

    def skip(self, num):
        pages = num // self.per_page
        self.page_start += pages
        return pages * self.per_page
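
    # Example: with per_page = 100, skip(250) advances page_start by
    # 250 // 100 = 2 pages and returns 200, since only whole result
    # pages can be skipped here.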

    def login(self):
        """Login and set necessary cookies"""

    def metadata(self):
        """Return a dict with general metadata"""
        return ()
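
    # Search- or pool-based subclasses typically override this to attach
    # shared fields to every post; a hypothetical sketch:
    #
    #   def metadata(self):
    #       return {"search_tags": self.tags}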

    def posts(self):
        """Return an iterable with post objects"""
        return ()

    _file_url = operator.itemgetter("file_url")
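
    # The default simply reads the post's 'file_url' field. A site whose API
    # omits it could override _file_url(); a hypothetical sketch that rebuilds
    # the URL from 'md5' and 'file_ext' (path layout is an assumption):
    #
    #   def _file_url(self, post):
    #       md5 = post["md5"]
    #       return "{}/files/{}/{}/{}.{}".format(
    #           self.root, md5[0:2], md5[2:4], md5, post["file_ext"])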

    def _prepare(self, post):
        """Prepare the 'post's metadata"""

    def _extended_tags(self, post, page=None):
        """Generate extended tag information

        The return value of this method is passed to _notes()
        as its 'page' parameter, which makes it possible to reuse
        the same HTML page for extracting both tags and notes.
        """

    def _notes(self, post, page=None):
        """Generate information about notes"""
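
    # How the two hooks can cooperate in a hypothetical subclass:
    # _extended_tags() fetches the post's HTML page once and returns it,
    # and _notes() reuses that page instead of requesting it again
    # (HTML markers and field names below are assumptions for illustration):
    #
    #   def _extended_tags(self, post, page=None):
    #       if page is None:
    #           url = "{}/posts/{}".format(self.root, post["id"])
    #           page = self.request(url).text
    #       post["tags_artist"] = text.extract(
    #           page, 'class="artist-tags">', "</ul>")[0]
    #       return page
    #
    #   def _notes(self, post, page=None):
    #       if page is None:
    #           url = "{}/posts/{}".format(self.root, post["id"])
    #           page = self.request(url).text
    #       post["notes"] = list(text.extract_iter(
    #           page, '<article class="note">', "</article>"))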