[booru] add an option to extract notes (only gelbooru for now) (#1457)

* [booru] add an option to extract notes (currently implemented only for gelbooru)
* appease linter
* [gelbooru] rename "text" to "body" in note extraction
* add a code comment about reusing return value of _extended_tags
2021-04-13 23:40:24 +02:00 · 2021-04-13 23:40:24 +02:00 · dff03a6605
commit dff03a6605
parent 78d7ee3ef4
5 changed files with 74 additions and 3 deletions
--- a/docs/configuration.rst
+++ b/docs/configuration.rst
@ -1747,6 +1747,16 @@ Description

    Note: This requires 1 additional HTTP request for each post.

+extractor.[booru].notes
+-----------------------
+Type
+    ``bool``
+Default
+    ``false``
+Description
+    Extract overlay notes (position, size, and text).
+
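+    Note: This requires 1 additional HTTP request for each post,
+    unless ``tags`` is also enabled, in which case the post page
+    already fetched for tag extraction is reused.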

 extractor.[manga-extractor].chapter-reverse
 -------------------------------------------
--- a/docs/gallery-dl.conf
+++ b/docs/gallery-dl.conf
@ -279,7 +279,8 @@
        },
        "booru":
        {
-            "tags": false
+            "tags": false,
+            "notes": false
        }
    },

--- a/gallery_dl/extractor/booru.py
+++ b/gallery_dl/extractor/booru.py
@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
        self.login()
        data = self.metadata()
        tags = self.config("tags", False)
+        notes = self.config("notes", False)

        for post in self.posts():
            try:
@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
                               "(md5: %s)", post.get("id"), post.get("md5"))
                continue

+            page_html = None
            if tags:
-                self._extended_tags(post)
+                page_html = self._extended_tags(post)
+            if notes:
+                self._notes(post, page_html)
            self._prepare(post)
            post.update(data)
            text.nameext_from_url(url, post)
@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
        """Prepare the 'post's metadata"""

    def _extended_tags(self, post, page=None):
-        """Generate extended tag information"""
+        """Generate extended tag information
+
+        The return value of this function will be
+        passed to the _notes function as the page parameter.
+        This makes it possible to reuse the same HTML both for
+        extracting tags and notes.
+        """
+
+    def _notes(self, post, page=None):
+        """Generate information about notes"""
--- a/gallery_dl/extractor/gelbooru.py
+++ b/gallery_dl/extractor/gelbooru.py
@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
            "pattern": r"https://img\d\.gelbooru\.com/images"
                       r"/22/61/226111273615049235b001b381707bd0\.webm",
        }),
+        # notes
+        ("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
+            "options": (("notes", True),),
+            "keywords": {
+                "notes": [
+                    {
+                        "height": 553,
+                        "body": "Look over this way when you talk~",
+                        "width": 246,
+                        "x": 35,
+                        "y": 72
+                    },
+                    {
+                        "height": 557,
+                        "body": "Hey~\nAre you listening~?",
+                        "width": 246,
+                        "x": 1233,
+                        "y": 109
+                    }
+                ]
+            }
+        }),
    )
--- a/gallery_dl/extractor/gelbooru_v02.py
+++ b/gallery_dl/extractor/gelbooru_v02.py
@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
                tags[tag_type].append(text.unquote(tag_name))
            for key, value in tags.items():
                post["tags_" + key] = " ".join(value)
+        return page
+
+    def _notes(self, post, page=None):
+        if not page:
+            url = "{}/index.php?page=post&s=view&id={}".format(
+                self.root, post["id"])
+            page = self.request(url).text
+        notes = []
+        notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
+        if not notes_data:
+            return
+
+        note_iter = text.extract_iter(notes_data, '<article', '</article>')
+        extr = text.extract
+        for note_data in note_iter:
+            note = {
+                "width": int(extr(note_data, 'data-width="', '"')[0]),
+                "height": int(extr(note_data, 'data-height="', '"')[0]),
+                "x": int(extr(note_data, 'data-x="', '"')[0]),
+                "y": int(extr(note_data, 'data-y="', '"')[0]),
+                "body": extr(note_data, 'data-body="', '"')[0],
+            }
+            notes.append(note)
+
+        post["notes"] = notes


 BASE_PATTERN = GelbooruV02Extractor.update({