# -*- coding: utf-8 -*-

# Copyright 2015-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for *booru sites"""

from .common import BaseExtractor, Message
from .. import text
import operator


class BooruExtractor(BaseExtractor):
    """Base class for *booru extractors"""
    basecategory = "booru"
    filename_fmt = "{category}_{id}_{md5}.{extension}"
    page_start = 0
    per_page = 100

    def items(self):
        self.login()
        data = self.metadata()
        tags = self.config("tags", False)
        notes = self.config("notes", False)

        for post in self.posts():
            try:
                url = self._file_url(post)
                if url[0] == "/":
                    url = self.root + url
            except (KeyError, TypeError):
                self.log.debug("Unable to fetch download URL for post %s "
                               "(md5: %s)", post.get("id"), post.get("md5"))
                continue

            page_html = None
            if tags:
                page_html = self._extended_tags(post)
            if notes:
                self._notes(post, page_html)
            self._prepare(post)
            post.update(data)
            text.nameext_from_url(url, post)

            yield Message.Directory, post
            yield Message.Url, url, post
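
    # A minimal sketch (kept as a comment, since this module only defines the
    # base class) of how a site-specific subclass feeds this pipeline; the
    # "examplebooru" name, URL, and JSON layout are illustrative assumptions,
    # not an actual supported site:
    #
    #   class ExamplebooruExtractor(BooruExtractor):
    #       category = "examplebooru"
    #       root = "https://booru.example.org"
    #
    #       def posts(self):
    #           url = self.root + "/posts.json"
    #           params = {"limit": self.per_page, "page": self.page_start}
    #           while True:
    #               posts = self.request(url, params=params).json()
    #               if not posts:
    #                   return
    #               yield from posts
    #               params["page"] += 1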

    def skip(self, num):
        pages = num // self.per_page
        self.page_start += pages
        return pages * self.per_page
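
    # Example: with per_page = 100, skip(250) advances page_start by
    # 250 // 100 = 2 pages and returns 200, since only whole result
    # pages can be skipped here.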

    def login(self):
        """Login and set necessary cookies"""

    def metadata(self):
        """Return a dict with general metadata"""
        return ()
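
    # Search- or pool-based subclasses typically override this to attach
    # shared fields to every post; a hypothetical sketch:
    #
    #   def metadata(self):
    #       return {"search_tags": self.tags}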

    def posts(self):
        """Return an iterable with post objects"""
        return ()

    _file_url = operator.itemgetter("file_url")
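
    # The default simply reads the post's 'file_url' field. A site whose API
    # omits it could override _file_url(); a hypothetical sketch that rebuilds
    # the URL from 'md5' and 'file_ext' (path layout is an assumption):
    #
    #   def _file_url(self, post):
    #       md5 = post["md5"]
    #       return "{}/files/{}/{}/{}.{}".format(
    #           self.root, md5[0:2], md5[2:4], md5, post["file_ext"])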

    def _prepare(self, post):
        """Prepare the 'post's metadata"""

    def _extended_tags(self, post, page=None):
        """Generate extended tag information

        The return value of this method is passed to _notes()
        as its 'page' parameter, which makes it possible to reuse
        the same HTML page for extracting both tags and notes.
        """

    def _notes(self, post, page=None):
        """Generate information about notes"""
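
    # How the two hooks can cooperate in a hypothetical subclass:
    # _extended_tags() fetches the post's HTML page once and returns it,
    # and _notes() reuses that page instead of requesting it again
    # (HTML markers and field names below are assumptions for illustration):
    #
    #   def _extended_tags(self, post, page=None):
    #       if page is None:
    #           url = "{}/posts/{}".format(self.root, post["id"])
    #           page = self.request(url).text
    #       post["tags_artist"] = text.extract(
    #           page, 'class="artist-tags">', "</ul>")[0]
    #       return page
    #
    #   def _notes(self, post, page=None):
    #       if page is None:
    #           url = "{}/posts/{}".format(self.root, post["id"])
    #           page = self.request(url).text
    #       post["notes"] = list(text.extract_iter(
    #           page, '<article class="note">', "</article>"))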