[booru] add an option to extract notes (only gelbooru for now) (#1457)

* [booru] add an option to extract notes (currently implemented only for gelbooru)

* appease linter

* [gelbooru] rename "text" to "body" in note extraction

* add a code comment about reusing return value of _extended_tags
This commit is contained in:
thatfuckingbird 2021-04-13 23:40:24 +02:00 committed by GitHub
parent 78d7ee3ef4
commit dff03a6605
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 74 additions and 3 deletions

View File

@ -1747,6 +1747,16 @@ Description
Note: This requires 1 additional HTTP request for each post.
extractor.[booru].notes
-----------------------
Type
``bool``
Default
``false``
Description
Extract overlay notes (position and text).
Note: This requires 1 additional HTTP request for each post.
extractor.[manga-extractor].chapter-reverse
-------------------------------------------

View File

@ -279,7 +279,8 @@
},
"booru":
{
"tags": false
"tags": false,
"notes": false
}
},

View File

@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
self.login()
data = self.metadata()
tags = self.config("tags", False)
notes = self.config("notes", False)
for post in self.posts():
try:
@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
"(md5: %s)", post.get("id"), post.get("md5"))
continue
page_html = None
if tags:
self._extended_tags(post)
page_html = self._extended_tags(post)
if notes:
self._notes(post, page_html)
self._prepare(post)
post.update(data)
text.nameext_from_url(url, post)
@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
"""Prepare the 'post's metadata"""
def _extended_tags(self, post, page=None):
"""Generate extended tag information"""
"""Generate extended tag information
The return value of this function will be
passed to the _notes function as the page parameter.
This makes it possible to reuse the same HTML both for
extracting tags and notes.
"""
def _notes(self, post, page=None):
    """Generate information about notes

    This base implementation is a no-op; subclasses that support
    note extraction override it.

    Parameters:
        post: metadata dict of the post being processed
        page: whatever _extended_tags() returned for this post when
            tag extraction was enabled, otherwise None
    """

View File

@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
"pattern": r"https://img\d\.gelbooru\.com/images"
r"/22/61/226111273615049235b001b381707bd0\.webm",
}),
# notes
("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
"options": (("notes", True),),
"keywords": {
"notes": [
{
"height": 553,
"body": "Look over this way when you talk~",
"width": 246,
"x": 35,
"y": 72
},
{
"height": 557,
"body": "Hey~\nAre you listening~?",
"width": 246,
"x": 1233,
"y": 109
}
]
}
}),
)

View File

@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
tags[tag_type].append(text.unquote(tag_name))
for key, value in tags.items():
post["tags_" + key] = " ".join(value)
return page
def _notes(self, post, page=None):
    """Collect the overlay notes of 'post' and store them in post["notes"]

    Parameters:
        post: metadata dict of the post; its "id" is used to build the
            view-page URL when no HTML is supplied
        page: HTML of the post's view page; when falsy, the page is
            requested from the site

    When the page has no '<section id="notes"' block, 'post' is left
    untouched. Otherwise post["notes"] becomes a list with one dict
    per '<article' element, holding the integer-converted
    data-width / data-height / data-x / data-y attributes and the
    data-body attribute as extracted.
    """
    if not page:
        # no HTML handed over by the caller: fetch the post's view page
        view_url = "{}/index.php?page=post&s=view&id={}".format(
            self.root, post["id"])
        page = self.request(view_url).text

    section = text.extract(page, '<section id="notes"', '</section>')[0]
    if not section:
        return

    find = text.extract
    # the list is built completely before it is attached to 'post'
    post["notes"] = [
        {
            "width": int(find(article, 'data-width="', '"')[0]),
            "height": int(find(article, 'data-height="', '"')[0]),
            "x": int(find(article, 'data-x="', '"')[0]),
            "y": int(find(article, 'data-y="', '"')[0]),
            "body": find(article, 'data-body="', '"')[0],
        }
        for article in text.extract_iter(section, '<article', '</article>')
    ]
BASE_PATTERN = GelbooruV02Extractor.update({