[booru] add an option to extract notes (only gelbooru for now) (#1457)

* [booru] add an option to extract notes (currently implemented only for gelbooru)
* appease linter
* [gelbooru] rename "text" to "body" in note extraction
* add a code comment about reusing return value of _extended_tags
2021-04-13 23:40:24 +02:00 · 2021-04-13 23:40:24 +02:00 · dff03a6605
commit dff03a6605
parent 78d7ee3ef4
5 changed files with 74 additions and 3 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -1747,6 +1747,16 @@ Description
    Note: This requires 1 additional HTTP request for each post.
 extractor.[booru].notes
 ----------------------
 Type
    ``bool``
 Default
    ``false``
 Description
    Extract overlay notes (position, size, and text).
    Note: This requires 1 additional HTTP request for each post,
    unless ``tags`` is enabled as well, in which case the page
    fetched for it is reused.
 extractor.[manga-extractor].chapter-reverse
 -------------------------------------------
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@ -279,7 +279,8 @@
        },
        "booru":
        {
-            "tags": false
+            "tags": false,
            "notes": false
        }
    },
--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
        self.login()
        data = self.metadata()
        tags = self.config("tags", False)
        notes = self.config("notes", False)
        for post in self.posts():
            try:
@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
                               "(md5: %s)", post.get("id"), post.get("md5"))
                continue
            page_html = None
            if tags:
-                self._extended_tags(post)
+                page_html = self._extended_tags(post)
            if notes:
                self._notes(post, page_html)
            self._prepare(post)
            post.update(data)
            text.nameext_from_url(url, post)
@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
        """Prepare the 'post's metadata"""
    def _extended_tags(self, post, page=None):
-        """Generate extended tag information"""
+        """Generate extended tag information
        The return value of this function will be
        passed to the _notes function as the page parameter.
        This makes it possible to reuse the same HTML both for
        extracting tags and notes.
        """
    def _notes(self, post, page=None):
        """Generate information about notes

        Hook for subclasses; this base implementation does nothing.
        It is only called when the "notes" option is enabled.
        'page' is the value returned by _extended_tags() when the
        "tags" option is enabled as well, and None otherwise.
        """
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
            "pattern": r"https://img\d\.gelbooru\.com/images"
                       r"/22/61/226111273615049235b001b381707bd0\.webm",
        }),
        # notes
        ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
            "options": (("notes", True),),
            "keywords": {
                "notes": [
                    {
                        "height": 553,
                        "body": "Look over this way when you talk~",
                        "width": 246,
                        "x": 35,
                        "y": 72
                    },
                    {
                        "height": 557,
                        "body": "Hey~\nAre you listening~?",
                        "width": 246,
                        "x": 1233,
                        "y": 109
                    }
                ]
            }
        }),
    )
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                tags[tag_type].append(text.unquote(tag_name))
            for key, value in tags.items():
                post["tags_" + key] = " ".join(value)
        return page
    def _notes(self, post, page=None):
        """Store the overlay notes of 'post' in post["notes"]

        'page' is the HTML of the post's view page; if it is empty or
        None, that page is requested from the site. When no non-empty
        '<section id="notes"' block is found, 'post' is left unchanged.
        """
        if not page:
            view_url = "{}/index.php?page=post&s=view&id={}".format(
                self.root, post["id"])
            page = self.request(view_url).text

        section = text.extract(page, '<section id="notes"', '</section>')[0]
        if not section:
            return

        def attr(article, marker):
            # first text.extract() result for 'marker' up to the next '"'
            return text.extract(article, marker, '"')[0]

        # one '<article' element per note
        articles = text.extract_iter(section, '<article', '</article>')
        post["notes"] = [
            {
                "width": int(attr(article, 'data-width="')),
                "height": int(attr(article, 'data-height="')),
                "x": int(attr(article, 'data-x="')),
                "y": int(attr(article, 'data-y="')),
                "body": attr(article, 'data-body="'),
            }
            for article in articles
        ]
 BASE_PATTERN = GelbooruV02Extractor.update({