[bunkr] fix extraction (#2732)

move bunkr.is code to its own module
This commit is contained in:
Mike Fährmann 2022-07-15 12:38:30 +02:00
parent baf3815ebd
commit 46f11a3118
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 97 additions and 38 deletions

View File

@@ -103,6 +103,12 @@ Consider all sites to be NSFW unless otherwise known.
<td>Blogs, Posts, Search Results</td>
<td></td>
</tr>
<tr>
<td>Bunkr</td>
<td>https://bunkr.is/</td>
<td>Albums</td>
<td></td>
</tr>
<tr>
<td>Comic Vine</td>
<td>https://comicvine.gamespot.com/</td>
@@ -1261,12 +1267,6 @@ Consider all sites to be NSFW unless otherwise known.
<tr>
<td colspan="4"><strong>lolisafe and chibisafe</strong></td>
</tr>
<tr>
<td>Bunkr</td>
<td>https://app.bunkr.is/</td>
<td>Albums</td>
<td></td>
</tr>
<tr>
<td>ZzZz</td>
<td>https://zz.ht/</td>

View File

@@ -25,6 +25,7 @@ modules = [
"bcy",
"behance",
"blogger",
"bunkr",
"comicvine",
"cyberdrop",
"danbooru",

View File

@@ -0,0 +1,89 @@
# -*- coding: utf-8 -*-
# Copyright 2022 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for https://bunkr.is/"""
from .lolisafe import LolisafeAlbumExtractor
from .. import text
import json
class BunkrAlbumExtractor(LolisafeAlbumExtractor):
    """Extractor for bunkr.is albums

    Albums are fetched in one of two ways, chosen by 'fetch_album()':
    through the lolisafe API inherited from LolisafeAlbumExtractor, or by
    parsing the JSON embedded in the album's HTML page, with the API used
    as fallback when that parsing fails.
    """
    category = "bunkr"
    root = "https://app.bunkr.is"
    pattern = r"(?:https?://)?(?:app\.)?bunkr\.(?:is|to)/a/([^/?#]+)"
    test = (
        ("https://app.bunkr.is/a/Lktg9Keq", {
            "pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
            "keyword": {
                "album_id": "Lktg9Keq",
                "album_name": 'test テスト "&>',
                "count": 1,
                "filename": 'test-テスト-"&>-QjgneIQv',
                "id": "QjgneIQv",
                "name": 'test-テスト-"&>',
                "num": int,
            },
        }),
        # mp4 (#2239)
        ("https://bunkr.is/a/ptRHaCn2", {
            "pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
            "content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
        }),
        ("https://bunkr.to/a/Lktg9Keq"),
    )

    def fetch_album(self, album_id):
        """Return a '(files, metadata)' pair for the album 'album_id'

        A root URL containing "//app." selects the API code path;
        any other root selects the HTML page code path.
        """
        if "//app." in self.root:
            return self._fetch_album_api(album_id)
        return self._fetch_album_site(album_id)

    def _fetch_album_api(self, album_id):
        """Fetch an album via the inherited lolisafe implementation

        The file URLs returned by the base class are adjusted in place:
        '.mp4' URLs get their 'cdn.bunkr.is' host swapped for
        'media-files.bunkr.is'; every other file receives a '_fallback'
        tuple holding the same URL on the 'cdn3.' host.
        """
        files, data = LolisafeAlbumExtractor.fetch_album(self, album_id)

        for file in files:
            url = file["file"]
            if url.endswith(".mp4"):
                file["file"] = url.replace(
                    "//cdn.bunkr.is/", "//media-files.bunkr.is/", 1)
            else:
                file["_fallback"] = (url.replace("//cdn.", "//cdn3.", 1),)

        return files, data

    def _fetch_album_site(self, album_id):
        """Fetch an album by parsing its HTML page

        The page's '__NEXT_DATA__' JSON payload is expected to provide
        'props.pageProps.album' and 'props.pageProps.files'. If the
        request, the extraction, or any of these lookups fails, 'self.root'
        is rewritten to its 'app.' variant and the API is used instead.
        """
        url = self.root + "/a/" + album_id

        try:
            data = json.loads(text.extract(
                self.request(url).text,
                'id="__NEXT_DATA__" type="application/json">', '<')[0])
            props = data["props"]["pageProps"]
            album = props["album"]
            files = props["files"]
        except Exception as exc:
            # deliberate best-effort: any failure here means the page could
            # not be used, so fall back to the API. Log the exception class
            # as well, since e.g. str(KeyError("props")) is just "'props'".
            self.log.debug("%s: %s", exc.__class__.__name__, exc)
            self.root = self.root.replace("bunkr", "app.bunkr", 1)
            return self._fetch_album_api(album_id)

        for file in files:
            name = file["name"]
            if name.endswith(".mp4"):
                # mp4 files are taken from the 'media-files' host
                # instead of the per-file 'cdn' value
                file["file"] = "https://media-files.bunkr.is/" + name
            else:
                file["file"] = file["cdn"] + "/" + name

        return files, {
            "album_id"   : album_id,
            "album_name" : text.unescape(album["name"]),
            "description": text.unescape(album["description"]),
            "count"      : len(files),
        }

View File

@@ -20,10 +20,6 @@ class LolisafeExtractor(BaseExtractor):
BASE_PATTERN = LolisafeExtractor.update({
"bunkr": {
"root": "https://app.bunkr.is",
"pattern": r"(?:app\.)?bunkr\.(?:is|to)",
},
"zzzz" : {
"root": "https://zz.ht",
"pattern": r"zz\.(?:ht|fo)",
@@ -35,25 +31,6 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
subcategory = "album"
pattern = BASE_PATTERN + "/a/([^/?#]+)"
test = (
("https://app.bunkr.is/a/Lktg9Keq", {
"pattern": r"https://cdn\.bunkr\.is/test-テスト-\"&>-QjgneIQv\.png",
"content": "0c8768055e4e20e7c7259608b67799171b691140",
"keyword": {
"album_id": "Lktg9Keq",
"album_name": 'test テスト "&>',
"count": 1,
"filename": 'test-テスト-"&>-QjgneIQv',
"id": "QjgneIQv",
"name": 'test-テスト-"&>',
"num": int,
},
}),
# mp4 (#2239)
("https://bunkr.is/a/ptRHaCn2", {
"pattern": r"https://media-files\.bunkr\.is/_-RnHoW69L\.mp4",
"content": "80e61d1dbc5896ae7ef9a28734c747b28b320471",
}),
("https://bunkr.to/a/Lktg9Keq"),
("https://zz.ht/a/lop7W6EZ", {
"pattern": r"https://z\.zz\.fo/(4anuY|ih560)\.png",
"count": 2,
@@ -71,11 +48,7 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
domain = self.config("domain")
if domain is None or domain == "auto":
if self.category == "bunkr":
self.root = "https://app.bunkr.is"
else:
self.root = text.root_from_url(match.group(0))
self.root = text.root_from_url(match.group(0))
else:
self.root = text.ensure_http_scheme(domain)
@@ -89,10 +62,6 @@ class LolisafeAlbumExtractor(LolisafeExtractor):
data["_fallback"] = file["_fallback"]
text.nameext_from_url(url, data)
data["name"], sep, data["id"] = data["filename"].rpartition("-")
if data["extension"] == "mp4":
url = url.replace(
"//cdn.bunkr.is/", "//media-files.bunkr.is/", 1)
yield Message.Url, url, data
def fetch_album(self, album_id):