[foolfuuka] dynamically generate extractor classes
This commit is contained in:
parent
22d7a783d5
commit
58a9eede38
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://archive.4plebs.org/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class FourplebsThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at 4plebs.org"""

    root = "https://archive.4plebs.org"
    category = "4plebs"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?(?:archive\.)?4plebs\.org/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://archive.4plebs.org/tg/thread/54059290",
            {"url": "07452944164b602502b02b24521f8cee5c484d2a"},
        ),
    ]
|
@ -13,23 +13,17 @@ modules = [
|
||||
"2chan",
|
||||
"3dbooru",
|
||||
"4chan",
|
||||
"4plebs",
|
||||
"8chan",
|
||||
"archivedmoe",
|
||||
"archiveofsins",
|
||||
"artstation",
|
||||
"b4k",
|
||||
"behance",
|
||||
"bobx",
|
||||
"danbooru",
|
||||
"desuarchive",
|
||||
"deviantart",
|
||||
"dokireader",
|
||||
"dynastyscans",
|
||||
"e621",
|
||||
"exhentai",
|
||||
"fallenangels",
|
||||
"fireden",
|
||||
"flickr",
|
||||
"gelbooru",
|
||||
"gfycat",
|
||||
@ -66,7 +60,6 @@ modules = [
|
||||
"ngomik",
|
||||
"nhentai",
|
||||
"nijie",
|
||||
"nyafuu",
|
||||
"paheal",
|
||||
"photobucket",
|
||||
"piczel",
|
||||
@ -75,7 +68,6 @@ modules = [
|
||||
"powermanga",
|
||||
"reactor",
|
||||
"readcomiconline",
|
||||
"rebeccablacktech",
|
||||
"reddit",
|
||||
"rule34",
|
||||
"safebooru",
|
||||
@ -87,7 +79,6 @@ modules = [
|
||||
"simplyhentai",
|
||||
"slideshare",
|
||||
"smugmug",
|
||||
"thebarchive",
|
||||
"tsumino",
|
||||
"tumblr",
|
||||
"twitter",
|
||||
@ -97,6 +88,7 @@ modules = [
|
||||
"yandere",
|
||||
"xvideos",
|
||||
"yuki",
|
||||
"foolfuuka",
|
||||
"mastodon",
|
||||
"imagehosts",
|
||||
"directlink",
|
||||
|
@ -1,27 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://archived.moe/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class ArchivedmoeThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archived.moe"""

    root = "https://archived.moe"
    category = "archivedmoe"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?archived\.moe/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://archived.moe/gd/thread/309639/",
            {
                "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
                "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
            },
        ),
        (
            "https://archived.moe/a/thread/159767162/",
            {"url": "ffec05a1a1b906b5ca85992513671c9155ee9e87"},
        ),
    ]
|
@ -1,23 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://archiveofsins.com/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class ArchiveofsinsThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archiveofsins.com"""

    root = "https://archiveofsins.com"
    category = "archiveofsins"

    # an optional "www." prefix is accepted in front of the domain
    pattern = [r"(?:https?://)?(?:www\.)?archiveofsins\.com"
               r"/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://www.archiveofsins.com/h/thread/4668813/",
            {
                "url": "f612d287087e10a228ef69517cf811539db9a102",
                "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
            },
        ),
    ]
|
@ -1,24 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://arch.b4k.co/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class BfourkThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at arch.b4k.co"""

    root = "https://arch.b4k.co"
    category = "b4k"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?arch\.b4k\.co/([^/]+)/thread/(\d+)"]

    test = [
        (
            "http://arch.b4k.co/meta/thread/196/",
            {"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e"},
        ),
    ]

    def remote(self, media):
        """Return the 'remote_media_link' entry of 'media' unchanged"""
        link = media["remote_media_link"]
        return link
|
@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2015-2018 Mike Fährmann
|
||||
# Copyright 2015-2019 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@ -8,10 +8,8 @@
|
||||
|
||||
"""Base classes for extractors for different Futaba Channel-like boards"""
|
||||
|
||||
from .common import Extractor, SharedConfigExtractor, Message
|
||||
from .common import Extractor, Message
|
||||
from .. import text
|
||||
import itertools
|
||||
import operator
|
||||
|
||||
|
||||
class ChanThreadExtractor(Extractor):
|
||||
@ -61,58 +59,3 @@ class ChanThreadExtractor(Extractor):
|
||||
"""Return thread title from first post"""
|
||||
title = post["sub"] if "sub" in post else text.remove_html(post["com"])
|
||||
return text.unescape(title)[:50]
|
||||
|
||||
|
||||
class FoolfuukaThreadExtractor(SharedConfigExtractor):
    """Base extractor for FoolFuuka based boards/archives

    'root' is the base URL of the archive. It is used as Referer header
    (unless 'referer' is false), as prefix for relative media links and
    as base of the thread API endpoint.
    """
    basecategory = "foolfuuka"
    subcategory = "thread"
    directory_fmt = ["{category}", "{board[shortname]}",
                     "{thread_num}{title:? - //}"]
    filename_fmt = "{media[media]}"
    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    root = ""
    referer = True

    def __init__(self, match):
        """Store board and thread from the two groups of 'match'"""
        SharedConfigExtractor.__init__(self)
        self.board, self.thread = match.groups()
        if self.referer:
            self.session.headers["Referer"] = self.root

    def items(self):
        """Yield the messages for all posts of the thread

        Emits Message.Version, a Message.Directory for the first post
        returned by posts() and one Message.Url for every post with a
        usable media URL.
        """
        yield Message.Version, 1

        is_first = True
        for post in self.posts():
            if is_first:
                yield Message.Directory, post
                is_first = False

            media = post["media"]
            if not media:
                continue

            url = media["media_link"]
            if not url and "remote_media_link" in media:
                url = self.remote(media)
            if not url:
                # neither a direct nor a remote link could be resolved;
                # skip the post instead of failing in url.startswith()
                continue
            if url.startswith("/"):
                url = self.root + url

            post["extension"] = url.rpartition(".")[2]
            yield Message.Url, url, post

    def posts(self):
        """Return an iterator over all posts of the thread, OP first"""
        url = self.root + "/_/api/chan/thread/"
        params = {"board": self.board, "num": self.thread}
        data = self.request(url, params=params).json()[self.thread]

        # sort post-objects by their key
        posts = sorted(data.get("posts", {}).items())
        posts = map(operator.itemgetter(1), posts)

        return itertools.chain((data["op"],), posts)

    def remote(self, media):
        """Extract the refresh target from the 'remote_media_link' page"""
        needle = '<meta http-equiv="Refresh" content="0; url='
        page = self.request(media["remote_media_link"]).text
        # NOTE(review): presumably yields None when 'needle' is not found
        # in the page - confirm against text.extract()
        return text.extract(page, needle, '"')[0]
|
||||
|
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://desuarchive.org/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class DesuarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at desuarchive.org"""

    root = "https://desuarchive.org"
    category = "desuarchive"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?desuarchive\.org/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://desuarchive.org/a/thread/159542679/",
            {"url": "e7d624aded15a069194e38dc731ec23217a422fb"},
        ),
    ]
|
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://boards.fireden.net/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class FiredenThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at boards.fireden.net"""

    root = "https://boards.fireden.net"
    category = "fireden"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?boards\.fireden\.net/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://boards.fireden.net/a/thread/159803223/",
            {"url": "01b7baacfb0656a68e566368290e3072b27f86c9"},
        ),
    ]
|
185
gallery_dl/extractor/foolfuuka.py
Normal file
185
gallery_dl/extractor/foolfuuka.py
Normal file
@ -0,0 +1,185 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2019 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extractors for 4chan archives based on FoolFuuka"""
|
||||
|
||||
from .common import SharedConfigExtractor, Message
|
||||
from .. import text, config
|
||||
import itertools
|
||||
import operator
|
||||
import re
|
||||
|
||||
|
||||
class FoolfuukaThreadExtractor(SharedConfigExtractor):
    """Base extractor for FoolFuuka based boards/archives

    'root' is the base URL of the archive. It is used as Referer header,
    as prefix for relative media links and as base of the thread API
    endpoint.
    """
    basecategory = "foolfuuka"
    subcategory = "thread"
    directory_fmt = ["{category}", "{board[shortname]}",
                     "{thread_num}{title:? - //}"]
    filename_fmt = "{media[media]}"
    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    root = ""

    def __init__(self, match):
        """Store board and thread from the two groups of 'match'"""
        SharedConfigExtractor.__init__(self)
        self.board, self.thread = match.groups()
        self.session.headers["Referer"] = self.root

    def items(self):
        """Yield the messages for all posts of the thread

        Emits Message.Version, a Message.Directory for the first post
        returned by posts() and one Message.Url for every post with a
        usable media URL.
        """
        yield Message.Version, 1

        is_first = True
        for post in self.posts():
            if is_first:
                yield Message.Directory, post
                is_first = False

            media = post["media"]
            if not media:
                continue

            url = media["media_link"]
            if not url and "remote_media_link" in media:
                url = self.remote(media)
            if not url:
                # neither a direct nor a remote link could be resolved;
                # skip the post instead of failing in url.startswith()
                continue
            if url.startswith("/"):
                url = self.root + url

            post["extension"] = url.rpartition(".")[2]
            yield Message.Url, url, post

    def posts(self):
        """Return an iterator over all posts of the thread, OP first"""
        url = self.root + "/_/api/chan/thread/"
        params = {"board": self.board, "num": self.thread}
        data = self.request(url, params=params).json()[self.thread]

        # sort post-objects by key
        posts = sorted(data.get("posts", {}).items())
        posts = map(operator.itemgetter(1), posts)

        return itertools.chain((data["op"],), posts)

    def remote(self, media):
        """Extract the refresh target from the 'remote_media_link' page"""
        needle = '<meta http-equiv="Refresh" content="0; url='
        page = self.request(media["remote_media_link"]).text
        # NOTE(review): presumably yields None when 'needle' is not found
        # in the page - confirm against text.extract()
        return text.extract(page, needle, '"')[0]

    def _remote_simple(self, media):
        """Return the 'remote_media_link' entry of 'media' unchanged"""
        return media["remote_media_link"]
|
||||
|
||||
|
||||
# Per-site settings read by generate_extractors():
#   "root"    - base URL of the archive (required)
#   "pattern" - regex for the domain part; derived from "root" if absent
#   "name"    - prefix for the generated class name; category if absent
#   "remote"  - "simple" selects _remote_simple() as remote() method
#   "test"    - stored as 'test' attribute of the generated class
EXTRACTORS = {
    "4plebs": {
        "name": "fourplebs",
        "root": "https://archive.4plebs.org",
        "pattern": r"(?:archive\.)?4plebs\.org",
        "test": [
            (
                "https://archive.4plebs.org/tg/thread/54059290",
                {"url": "07452944164b602502b02b24521f8cee5c484d2a"},
            ),
        ],
    },
    "archivedmoe": {
        "root": "https://archived.moe",
        "test": [
            (
                "https://archived.moe/gd/thread/309639/",
                {
                    "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
                    "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
                },
            ),
            (
                "https://archived.moe/a/thread/159767162/",
                {"url": "ffec05a1a1b906b5ca85992513671c9155ee9e87"},
            ),
        ],
    },
    "archiveofsins": {
        "root": "https://archiveofsins.com",
        "pattern": r"(?:www\.)?archiveofsins\.com",
        "test": [
            (
                "https://archiveofsins.com/h/thread/4668813/",
                {
                    "url": "f612d287087e10a228ef69517cf811539db9a102",
                    "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
                },
            ),
        ],
    },
    "b4k": {
        "root": "https://arch.b4k.co",
        "remote": "simple",
        "test": [
            (
                "https://arch.b4k.co/meta/thread/196/",
                {"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e"},
            ),
        ],
    },
    "desuarchive": {
        "root": "https://desuarchive.org",
        "test": [
            (
                "https://desuarchive.org/a/thread/159542679/",
                {"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406"},
            ),
        ],
    },
    "fireden": {
        "root": "https://boards.fireden.net",
        "test": [
            (
                "https://boards.fireden.net/a/thread/159803223/",
                {"url": "01b7baacfb0656a68e566368290e3072b27f86c9"},
            ),
        ],
    },
    "nyafuu": {
        "root": "https://archive.nyafuu.org",
        "pattern": r"(?:archive\.)?nyafuu\.org",
        "test": [
            (
                "https://archive.nyafuu.org/c/thread/2849220/",
                {"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f"},
            ),
        ],
    },
    "rbt": {
        "root": "https://rbt.asia",
        "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
        "test": [
            (
                "https://rbt.asia/g/thread/61487650/",
                {"url": "61896d9d9a2edb556b619000a308a984307b6d30"},
            ),
            (
                "https://archive.rebeccablacktech.com/g/thread/61487650/",
                {"url": "61896d9d9a2edb556b619000a308a984307b6d30"},
            ),
        ],
    },
    "thebarchive": {
        "root": "https://thebarchive.com",
        "pattern": r"thebarchive\.com",
        "test": [
            (
                "https://thebarchive.com/b/thread/739772332/",
                {"url": "e8b18001307d130d67db31740ce57c8561b5d80c"},
            ),
        ],
    },
}
|
||||
|
||||
|
||||
def generate_extractors():
    """Dynamically generate Extractor classes for FoolFuuka instances

    Merges the value stored under the ("extractor", "foolfuuka") config
    key into EXTRACTORS, then creates one FoolfuukaThreadExtractor
    subclass per usable entry and stores it in this module's globals as
    '<Name>ThreadExtractor'.
    """
    symtable = globals()
    extractors = config.get(("extractor", "foolfuuka"))

    if extractors:
        EXTRACTORS.update(extractors)

    for _category, info in EXTRACTORS.items():

        # ignore values that do not describe a site: anything that is not
        # a dict, as well as dicts without a root URL to build from
        if not isinstance(info, dict) or not info.get("root"):
            continue

        # a trailing slash would produce an empty domain below and a
        # double slash in every URL built from 'root'
        _root = info["root"].rstrip("/")
        _domain = _root.rpartition("/")[2]
        _pattern = info.get("pattern") or re.escape(_domain)
        _name = info.get("name") or _category

        class ThreadExtractor(FoolfuukaThreadExtractor):
            category = _category
            pattern = [r"(?:https?://)?{}/([^/]+)/thread/(\d+)".format(
                _pattern)]
            test = info.get("test")
            root = _root

        if info.get("remote") == "simple":
            ThreadExtractor.remote = ThreadExtractor._remote_simple

        # keep __qualname__ in sync with __name__, so the class does not
        # present itself as 'generate_extractors.<locals>.ThreadExtractor'
        _clsname = _name.capitalize() + "ThreadExtractor"
        ThreadExtractor.__name__ = _clsname
        ThreadExtractor.__qualname__ = _clsname
        ThreadExtractor.__doc__ = "Extractor for threads on " + _domain
        symtable[_clsname] = ThreadExtractor


generate_extractors()
|
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://archive.nyafuu.org/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class NyafuuThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at nyafuu.org"""

    root = "https://archive.nyafuu.org"
    category = "nyafuu"

    # the "archive." subdomain is optional in matched URLs
    pattern = [r"(?:https?://)?(?:archive\.)?nyafuu\.org/([^/]+)/thread/(\d+)"]

    test = [
        (
            "http://archive.nyafuu.org/c/thread/2849220/",
            {"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f"},
        ),
    ]
|
@ -1,27 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://rbt.asia/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class RebeccablacktechThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at rbt.asia"""

    root = "https://rbt.asia"
    category = "rbt"

    # matches both the rebeccablacktech.com and the rbt.asia domain
    pattern = [r"(?:https?://)?(?:(?:archive\.)?rebeccablacktech\.com"
               r"|rbt\.asia)/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://rbt.asia/g/thread/61487650/",
            {"url": "fadd274b25150a1bdf03a40c58db320fa3b617c4"},
        ),
        (
            "https://archive.rebeccablacktech.com/g/thread/61487650/",
            {"url": "fadd274b25150a1bdf03a40c58db320fa3b617c4"},
        ),
    ]
|
@ -1,21 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2017 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
# published by the Free Software Foundation.
|
||||
|
||||
"""Extract images from https://thebarchive.com/"""
|
||||
|
||||
from . import chan
|
||||
|
||||
|
||||
class ThebarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at thebarchive.com"""

    root = "https://thebarchive.com"
    category = "thebarchive"

    # the two capture groups are consumed by the base class constructor
    pattern = [r"(?:https?://)?thebarchive\.com/([^/]+)/thread/(\d+)"]

    test = [
        (
            "https://thebarchive.com/b/thread/739772332/",
            {"url": "e8b18001307d130d67db31740ce57c8561b5d80c"},
        ),
    ]
|
Loading…
x
Reference in New Issue
Block a user