[mememuseum] add 'tag' and 'post' extractors (closes #2264)

This commit is contained in:
Mike Fährmann 2022-02-20 02:15:38 +01:00
parent e5f6af6e32
commit 79a461a2c1
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
3 changed files with 127 additions and 0 deletions

View File

@ -469,6 +469,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Albums, Channels</td>
<td>Supported</td>
</tr>
<tr>
<td>Mememuseum</td>
<td>https://meme.museum/</td>
<td>Posts, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>My Hentai Gallery</td>
<td>https://myhentaigallery.com/</td>

View File

@ -81,6 +81,7 @@ modules = [
"mangapark",
"mangasee",
"mangoxo",
"mememuseum",
"myhentaigallery",
"myportfolio",
"naver",

View File

@ -0,0 +1,120 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://meme.museum/"""
from .common import Extractor, Message
from .. import text
class MememuseumExtractor(Extractor):
"""Base class for meme.museum extractors"""
basecategory = "booru"
category = "mememuseum"
filename_fmt = "{category}_{id}_{md5}.{extension}"
archive_fmt = "{id}"
root = "https://meme.museum"
def items(self):
data = self.metadata()
for post in self.posts():
url = post["file_url"]
for key in ("id", "width", "height"):
post[key] = text.parse_int(post[key])
post["tags"] = text.unquote(post["tags"])
post.update(data)
yield Message.Directory, post
yield Message.Url, url, text.nameext_from_url(url, post)
def metadata(self):
"""Return general metadata"""
return ()
def posts(self):
"""Return an iterable containing data of all relevant posts"""
return ()
class MememuseumTagExtractor(MememuseumExtractor):
"""Extractor for images from meme.museum by search-tags"""
subcategory = "tag"
directory_fmt = ("{category}", "{search_tags}")
pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)"
test = ("https://meme.museum/post/list/animated/1", {
"pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20",
"count": ">= 30"
})
per_page = 25
def __init__(self, match):
MememuseumExtractor.__init__(self, match)
self.tags = text.unquote(match.group(1))
def metadata(self):
return {"search_tags": self.tags}
def posts(self):
pnum = 1
while True:
url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum)
extr = text.extract_from(self.request(url).text)
while True:
mime = extr("data-mime='", "'")
if not mime:
break
pid = extr("data-post-id='", "'")
tags, dimensions, size = extr("title='", "'").split(" // ")
md5 = extr("/_thumbs/", "/")
width, _, height = dimensions.partition("x")
yield {
"file_url": "{}/_images/{}/{}%20-%20{}.{}".format(
self.root, md5, pid, text.quote(tags),
mime.rpartition("/")[2]),
"id": pid, "md5": md5, "tags": tags,
"width": width, "height": height,
"size": text.parse_bytes(size[:-1]),
}
if not extr(">Next<", ">"):
return
pnum += 1
class MememuseumPostExtractor(MememuseumExtractor):
"""Extractor for single images from meme.museum"""
subcategory = "post"
pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)"
test = ("https://meme.museum/post/view/10243", {
"pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997"
r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm"
r"an%20stallman%20tagme%20text\.jpg",
"keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e",
"content": "45565f3f141fc960a8ae1168b80e718a494c52d2",
})
def __init__(self, match):
MememuseumExtractor.__init__(self, match)
self.post_id = match.group(1)
def posts(self):
url = "{}/post/view/{}".format(self.root, self.post_id)
extr = text.extract_from(self.request(url).text)
return ({
"id" : self.post_id,
"tags" : extr(": ", "<"),
"md5" : extr("/_thumbs/", "/"),
"file_url": self.root + extr("id='main_image' src='", "'"),
"width" : extr("data-width=", " ").strip("'\""),
"height" : extr("data-height=", " ").strip("'\""),
"size" : 0,
},)