[booru] add an option to extract notes (only gelbooru for now) (#1457)
* [booru] add an option to extract notes (currently implemented only for gelbooru)
* appease linter
* [gelbooru] rename "text" to "body" in note extraction
* add a code comment about reusing return value of _extended_tags
This commit is contained in:
parent
78d7ee3ef4
commit
dff03a6605
@ -1747,6 +1747,16 @@ Description
|
|||||||
|
|
||||||
Note: This requires 1 additional HTTP request for each post.
|
Note: This requires 1 additional HTTP request for each post.
|
||||||
|
|
||||||
|
extractor.[booru].notes
|
||||||
|
----------------------
|
||||||
|
Type
|
||||||
|
``bool``
|
||||||
|
Default
|
||||||
|
``false``
|
||||||
|
Description
|
||||||
|
Extract overlay notes (position and text).
|
||||||
|
|
||||||
|
Note: This requires 1 additional HTTP request for each post.
|
||||||
|
|
||||||
extractor.[manga-extractor].chapter-reverse
|
extractor.[manga-extractor].chapter-reverse
|
||||||
-------------------------------------------
|
-------------------------------------------
|
||||||
|
@ -279,7 +279,8 @@
|
|||||||
},
|
},
|
||||||
"booru":
|
"booru":
|
||||||
{
|
{
|
||||||
"tags": false
|
"tags": false,
|
||||||
|
"notes": false
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
||||||
|
@ -24,6 +24,7 @@ class BooruExtractor(BaseExtractor):
|
|||||||
self.login()
|
self.login()
|
||||||
data = self.metadata()
|
data = self.metadata()
|
||||||
tags = self.config("tags", False)
|
tags = self.config("tags", False)
|
||||||
|
notes = self.config("notes", False)
|
||||||
|
|
||||||
for post in self.posts():
|
for post in self.posts():
|
||||||
try:
|
try:
|
||||||
@ -35,8 +36,11 @@ class BooruExtractor(BaseExtractor):
|
|||||||
"(md5: %s)", post.get("id"), post.get("md5"))
|
"(md5: %s)", post.get("id"), post.get("md5"))
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
page_html = None
|
||||||
if tags:
|
if tags:
|
||||||
self._extended_tags(post)
|
page_html = self._extended_tags(post)
|
||||||
|
if notes:
|
||||||
|
self._notes(post, page_html)
|
||||||
self._prepare(post)
|
self._prepare(post)
|
||||||
post.update(data)
|
post.update(data)
|
||||||
text.nameext_from_url(url, post)
|
text.nameext_from_url(url, post)
|
||||||
@ -66,4 +70,13 @@ class BooruExtractor(BaseExtractor):
|
|||||||
"""Prepare the 'post's metadata"""
|
"""Prepare the 'post's metadata"""
|
||||||
|
|
||||||
def _extended_tags(self, post, page=None):
|
def _extended_tags(self, post, page=None):
|
||||||
"""Generate extended tag information"""
|
"""Generate extended tag information
|
||||||
|
|
||||||
|
The return value of this function will be
|
||||||
|
passed to the _notes function as the page parameter.
|
||||||
|
This makes it possible to reuse the same HTML both for
|
||||||
|
extracting tags and notes.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def _notes(self, post, page=None):
|
||||||
|
"""Generate information about notes"""
|
||||||
|
@ -108,4 +108,26 @@ class GelbooruPostExtractor(GelbooruBase,
|
|||||||
"pattern": r"https://img\d\.gelbooru\.com/images"
|
"pattern": r"https://img\d\.gelbooru\.com/images"
|
||||||
r"/22/61/226111273615049235b001b381707bd0\.webm",
|
r"/22/61/226111273615049235b001b381707bd0\.webm",
|
||||||
}),
|
}),
|
||||||
|
# notes
|
||||||
|
("https://gelbooru.com/index.php?page=post&s=view&id=5997331", {
|
||||||
|
"options": (("notes", True),),
|
||||||
|
"keywords": {
|
||||||
|
"notes": [
|
||||||
|
{
|
||||||
|
"height": 553,
|
||||||
|
"body": "Look over this way when you talk~",
|
||||||
|
"width": 246,
|
||||||
|
"x": 35,
|
||||||
|
"y": 72
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"height": 557,
|
||||||
|
"body": "Hey~\nAre you listening~?",
|
||||||
|
"width": 246,
|
||||||
|
"x": 1233,
|
||||||
|
"y": 109
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}),
|
||||||
)
|
)
|
||||||
|
@ -57,6 +57,31 @@ class GelbooruV02Extractor(booru.BooruExtractor):
|
|||||||
tags[tag_type].append(text.unquote(tag_name))
|
tags[tag_type].append(text.unquote(tag_name))
|
||||||
for key, value in tags.items():
|
for key, value in tags.items():
|
||||||
post["tags_" + key] = " ".join(value)
|
post["tags_" + key] = " ".join(value)
|
||||||
|
return page
|
||||||
|
|
||||||
|
def _notes(self, post, page=None):
|
||||||
|
if not page:
|
||||||
|
url = "{}/index.php?page=post&s=view&id={}".format(
|
||||||
|
self.root, post["id"])
|
||||||
|
page = self.request(url).text
|
||||||
|
notes = []
|
||||||
|
notes_data = text.extract(page, '<section id="notes"', '</section>')[0]
|
||||||
|
if not notes_data:
|
||||||
|
return
|
||||||
|
|
||||||
|
note_iter = text.extract_iter(notes_data, '<article', '</article>')
|
||||||
|
extr = text.extract
|
||||||
|
for note_data in note_iter:
|
||||||
|
note = {
|
||||||
|
"width": int(extr(note_data, 'data-width="', '"')[0]),
|
||||||
|
"height": int(extr(note_data, 'data-height="', '"')[0]),
|
||||||
|
"x": int(extr(note_data, 'data-x="', '"')[0]),
|
||||||
|
"y": int(extr(note_data, 'data-y="', '"')[0]),
|
||||||
|
"body": extr(note_data, 'data-body="', '"')[0],
|
||||||
|
}
|
||||||
|
notes.append(note)
|
||||||
|
|
||||||
|
post["notes"] = notes
|
||||||
|
|
||||||
|
|
||||||
BASE_PATTERN = GelbooruV02Extractor.update({
|
BASE_PATTERN = GelbooruV02Extractor.update({
|
||||||
|
Loading…
x
Reference in New Issue
Block a user