[booru] add an option to extract notes (only gelbooru for now) (#1457)
* [booru] add an option to extract notes (currently implemented only for gelbooru)
* appease linter
* [gelbooru] rename "text" to "body" in note extraction
* add a code comment about reusing return value of _extended_tags
This commit is contained in:
parent
78d7ee3ef4
commit
dff03a6605
@ -1747,6 +1747,16 @@ Description
|
||||
|
||||
Note: This requires 1 additional HTTP request for each post.
|
||||
|
||||
extractor.[booru].notes
|
||||
----------------------
|
||||
Type
|
||||
``bool``
|
||||
Default
|
||||
``false``
|
||||
Description
|
||||
Extract overlay notes (position and text).
|
||||
|
||||
Note: This requires 1 additional HTTP request for each post.
|
||||
|
||||
extractor.[manga-extractor].chapter-reverse
|
||||
-------------------------------------------
|
||||
|
@ -279,7 +279,8 @@
|
||||
},
|
||||
"booru":
|
||||
{
|
||||
"tags": false
|
||||
"tags": false,
|
||||
"notes": false
|
||||
}
|
||||
},
|
||||
|
||||
|
@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
|
||||
self.login()
|
||||
data = self.metadata()
|
||||
tags = self.config("tags", False)
|
||||
notes = self.config("notes", False)
|
||||
|
||||
for post in self.posts():
|
||||
try:
|
||||
@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
|
||||
"(md5: %s)", post.get("id"), post.get("md5"))
|
||||
continue
|
||||
|
||||
page_html = None
|
||||
if tags:
|
||||
self._extended_tags(post)
|
||||
page_html = self._extended_tags(post)
|
||||
if notes:
|
||||
self._notes(post, page_html)
|
||||
self._prepare(post)
|
||||
post.update(data)
|
||||
text.nameext_from_url(url, post)
|
||||
@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
|
||||
"""Prepare the 'post's metadata"""
|
||||
|
||||
def _extended_tags(self, post, page=None):
|
||||
"""Generate extended tag information"""
|
||||
"""Generate extended tag information
|
||||
|
||||
The return value of this function will be
|
||||
passed to the _notes function as the page parameter.
|
||||
This makes it possible to reuse the same HTML both for
|
||||
extracting tags and notes.
|
||||
"""
|
||||
|
||||
def _notes(self, post, page=None):
    """Generate information about notes

    Hook called once per post when the 'notes' option is enabled.
    'page' is whatever _extended_tags returned for the same post, or
    None when tag extraction is disabled or returned nothing.

    This base implementation does nothing.
    """
|
||||
|
@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
|
||||
"pattern": r"https://img\d\.gelbooru\.com/images"
|
||||
r"/22/61/226111273615049235b001b381707bd0\.webm",
|
||||
}),
|
||||
# notes
|
||||
("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
|
||||
"options": (("notes", True),),
|
||||
"keywords": {
|
||||
"notes": [
|
||||
{
|
||||
"height": 553,
|
||||
"body": "Look over this way when you talk~",
|
||||
"width": 246,
|
||||
"x": 35,
|
||||
"y": 72
|
||||
},
|
||||
{
|
||||
"height": 557,
|
||||
"body": "Hey~\nAre you listening~?",
|
||||
"width": 246,
|
||||
"x": 1233,
|
||||
"y": 109
|
||||
}
|
||||
]
|
||||
}
|
||||
}),
|
||||
)
|
||||
|
@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
|
||||
tags[tag_type].append(text.unquote(tag_name))
|
||||
for key, value in tags.items():
|
||||
post["tags_" + key] = " ".join(value)
|
||||
return page
|
||||
|
||||
def _notes(self, post, page=None):
    """Collect the overlay notes of 'post' and store them in post["notes"]

    'page' is the HTML of the post's view page. When it is missing or
    empty, the page is requested from the site (one extra HTTP request).

    Each note becomes a dict with integer "width", "height", "x" and "y"
    values plus the note text under "body", all read from the data-*
    attributes of an <article> element inside the notes <section>.
    'post' is left untouched when the page has no notes section.
    """
    if not page:
        # no HTML handed over by the caller; fetch the post's view page
        view_url = "{}/index.php?page=post&s=view&id={}".format(
            self.root, post["id"])
        page = self.request(view_url).text

    section = text.extract(page, '<section id="notes"', '</section>')[0]
    if not section:
        return

    find = text.extract
    post["notes"] = [
        {
            "width": int(find(article, 'data-width="', '"')[0]),
            "height": int(find(article, 'data-height="', '"')[0]),
            "x": int(find(article, 'data-x="', '"')[0]),
            "y": int(find(article, 'data-y="', '"')[0]),
            "body": find(article, 'data-body="', '"')[0],
        }
        for article in text.extract_iter(section, '<article', '</article>')
    ]
|
||||
|
||||
|
||||
BASE_PATTERN = GelbooruV02Extractor.update({
|
||||
|
Loading…
x
Reference in New Issue
Block a user