[foolfuuka] dynamically generate extractor classes

This commit is contained in:
Mike Fährmann 2019-02-03 00:40:12 +01:00
parent 22d7a783d5
commit 58a9eede38
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
12 changed files with 188 additions and 274 deletions

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archive.4plebs.org/"""
from . import chan
class FourplebsThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archive.4plebs.org"""

    category = "4plebs"
    root = "https://archive.4plebs.org"

    # the "archive." subdomain is optional in accepted URLs;
    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?(?:archive\.)?4plebs\.org"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("https://archive.4plebs.org/tg/thread/54059290", {
            "url": "07452944164b602502b02b24521f8cee5c484d2a",
        }),
    ]

View File

@ -13,23 +13,17 @@ modules = [
"2chan",
"3dbooru",
"4chan",
"4plebs",
"8chan",
"archivedmoe",
"archiveofsins",
"artstation",
"b4k",
"behance",
"bobx",
"danbooru",
"desuarchive",
"deviantart",
"dokireader",
"dynastyscans",
"e621",
"exhentai",
"fallenangels",
"fireden",
"flickr",
"gelbooru",
"gfycat",
@ -66,7 +60,6 @@ modules = [
"ngomik",
"nhentai",
"nijie",
"nyafuu",
"paheal",
"photobucket",
"piczel",
@ -75,7 +68,6 @@ modules = [
"powermanga",
"reactor",
"readcomiconline",
"rebeccablacktech",
"reddit",
"rule34",
"safebooru",
@ -87,7 +79,6 @@ modules = [
"simplyhentai",
"slideshare",
"smugmug",
"thebarchive",
"tsumino",
"tumblr",
"twitter",
@ -97,6 +88,7 @@ modules = [
"yandere",
"xvideos",
"yuki",
"foolfuuka",
"mastodon",
"imagehosts",
"directlink",

View File

@ -1,27 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archived.moe/"""
from . import chan
class ArchivedmoeThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archived.moe"""

    category = "archivedmoe"
    root = "https://archived.moe"

    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?archived\.moe"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        (
            "https://archived.moe/gd/thread/309639/",
            {
                "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
                "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
            },
        ),
        (
            "https://archived.moe/a/thread/159767162/",
            {
                "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
            },
        ),
    ]

View File

@ -1,23 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archiveofsins.com/"""
from . import chan
class ArchiveofsinsThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archiveofsins.com"""

    category = "archiveofsins"
    root = "https://archiveofsins.com"

    # a leading "www." is accepted but not required;
    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?(?:www\.)?"
        r"archiveofsins\.com/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("https://www.archiveofsins.com/h/thread/4668813/", {
            "url": "f612d287087e10a228ef69517cf811539db9a102",
            "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
        }),
    ]

View File

@ -1,24 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://arch.b4k.co/"""
from . import chan
class BfourkThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at arch.b4k.co"""

    category = "b4k"
    root = "https://arch.b4k.co"

    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?arch\.b4k\.co"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("http://arch.b4k.co/meta/thread/196/", {
            "url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
        }),
    ]

    def remote(self, media):
        """Return the 'remote_media_link' entry of 'media' unchanged"""
        link = media["remote_media_link"]
        return link

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2015-2018 Mike Fährmann
# Copyright 2015-2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -8,10 +8,8 @@
"""Base classes for extractors for different Futaba Channel-like boards"""
from .common import Extractor, SharedConfigExtractor, Message
from .common import Extractor, Message
from .. import text
import itertools
import operator
class ChanThreadExtractor(Extractor):
@ -61,58 +59,3 @@ class ChanThreadExtractor(Extractor):
"""Return thread title from first post"""
title = post["sub"] if "sub" in post else text.remove_html(post["com"])
return text.unescape(title)[:50]
class FoolfuukaThreadExtractor(SharedConfigExtractor):
    """Shared implementation for thread extractors of FoolFuuka archives

    Subclasses set 'category', 'root' and 'pattern'; the pattern has to
    capture the board name and the thread number, in this order.
    """
    basecategory = "foolfuuka"
    subcategory = "thread"
    directory_fmt = [
        "{category}",
        "{board[shortname]}",
        "{thread_num}{title:? - //}",
    ]
    filename_fmt = "{media[media]}"
    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    root = ""
    # subclasses may set this to False to suppress the Referer header
    referer = True

    def __init__(self, match):
        """Take board and thread number from the groups of 'match'"""
        SharedConfigExtractor.__init__(self)
        self.board, self.thread = match.groups()
        if self.referer:
            self.session.headers["Referer"] = self.root

    def items(self):
        """Yield version, directory and URL messages for one thread"""
        yield Message.Version, 1

        first = True
        for post in self.posts():
            # the first post (the OP) supplies the directory metadata
            if first:
                first = False
                yield Message.Directory, post

            media = post["media"]
            if not media:
                continue

            link = media["media_link"]
            if not link and "remote_media_link" in media:
                link = self.remote(media)
            # links starting with "/" are relative to the archive's root
            if link.startswith("/"):
                link = self.root + link

            post["extension"] = link.rpartition(".")[2]
            yield Message.Url, link, post

    def posts(self):
        """Return an iterable with the OP followed by all other posts"""
        endpoint = self.root + "/_/api/chan/thread/"
        query = {"board": self.board, "num": self.thread}
        thread = self.request(endpoint, params=query).json()[self.thread]

        # order the remaining post-objects by their key
        replies = [
            post
            for _, post in sorted(thread.get("posts", {}).items())
        ]
        return itertools.chain((thread["op"],), replies)

    def remote(self, media):
        """Fetch the remote-media page and return its redirect target"""
        page = self.request(media["remote_media_link"]).text
        return text.extract(
            page, '<meta http-equiv="Refresh" content="0; url=', '"')[0]

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://desuarchive.org/"""
from . import chan
class DesuarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at desuarchive.org"""

    category = "desuarchive"
    root = "https://desuarchive.org"

    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?desuarchive\.org"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("https://desuarchive.org/a/thread/159542679/", {
            "url": "e7d624aded15a069194e38dc731ec23217a422fb",
        }),
    ]

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://boards.fireden.net/"""
from . import chan
class FiredenThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at boards.fireden.net"""

    category = "fireden"
    root = "https://boards.fireden.net"

    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?boards\.fireden\.net"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("https://boards.fireden.net/a/thread/159803223/", {
            "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
        }),
    ]

View File

@ -0,0 +1,185 @@
# -*- coding: utf-8 -*-
# Copyright 2019 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extractors for 4chan archives based on FoolFuuka"""
from .common import SharedConfigExtractor, Message
from .. import text, config
import itertools
import operator
import re
class FoolfuukaThreadExtractor(SharedConfigExtractor):
    """Base extractor for FoolFuuka based boards/archives

    Concrete extractor classes provide 'category', 'root' and 'pattern'.
    The pattern has to capture the board name and the thread number,
    in this order.
    """
    basecategory = "foolfuuka"
    subcategory = "thread"
    directory_fmt = ["{category}", "{board[shortname]}",
                     "{thread_num}{title:? - //}"]
    filename_fmt = "{media[media]}"
    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
    root = ""

    def __init__(self, match):
        """Store board and thread number from 'match'; set the Referer"""
        SharedConfigExtractor.__init__(self)
        self.board, self.thread = match.groups()
        self.session.headers["Referer"] = self.root

    def items(self):
        """Yield version, directory and URL messages for one thread

        The first post returned by posts() is used as directory metadata.
        Posts without media, or whose media has no usable link, do not
        produce a URL message.
        """
        op = True

        yield Message.Version, 1
        for post in self.posts():
            if op:
                yield Message.Directory, post
                op = False

            media = post["media"]
            if not media:
                continue

            url = media["media_link"]
            if not url and "remote_media_link" in media:
                url = self.remote(media)
            if not url:
                # neither a direct nor a remote link could be obtained;
                # skip this post instead of failing on 'url.startswith'
                continue
            if url.startswith("/"):
                # links starting with "/" are relative to the root URL
                url = self.root + url

            post["extension"] = url.rpartition(".")[2]
            yield Message.Url, url, post

    def posts(self):
        """Return an iterable with the OP followed by all other posts"""
        url = self.root + "/_/api/chan/thread/"
        params = {"board": self.board, "num": self.thread}
        data = self.request(url, params=params).json()[self.thread]

        # sort post-objects by key
        posts = sorted(data.get("posts", {}).items())
        posts = map(operator.itemgetter(1), posts)

        return itertools.chain((data["op"],), posts)

    def remote(self, media):
        """Fetch the remote-media page and return its redirect target

        The target is the text between the meta-refresh marker and the
        next double quote. NOTE(review): the result is presumably None
        when the marker is missing -- confirm against text.extract();
        items() skips falsy results either way.
        """
        needle = '<meta http-equiv="Refresh" content="0; url='
        page = self.request(media["remote_media_link"]).text
        return text.extract(page, needle, '"')[0]

    def _remote_simple(self, media):
        """Alternative to remote(): return 'remote_media_link' unchanged"""
        return media["remote_media_link"]
# Table of known FoolFuuka instances, keyed by extractor category.
# generate_extractors() reads the following keys from each entry:
#   "root"    - base URL of the instance (required)
#   "pattern" - regex for the domain part of matching URLs (optional;
#               the escaped domain of "root" is used when absent)
#   "name"    - prefix for the generated class name (optional;
#               the category is used when absent)
#   "remote"  - "simple" selects _remote_simple() as remote() (optional)
#   "test"    - test URLs and their expected results (optional)
EXTRACTORS = {
    "4plebs": {
        "name": "fourplebs",
        "root": "https://archive.4plebs.org",
        "pattern": r"(?:archive\.)?4plebs\.org",
        "test": [
            ("https://archive.4plebs.org/tg/thread/54059290", {
                "url": "07452944164b602502b02b24521f8cee5c484d2a",
            }),
        ],
    },
    "archivedmoe": {
        "root": "https://archived.moe",
        "test": [
            ("https://archived.moe/gd/thread/309639/", {
                "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
                "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
            }),
            ("https://archived.moe/a/thread/159767162/", {
                "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
            }),
        ],
    },
    "archiveofsins": {
        "root": "https://archiveofsins.com",
        "pattern": r"(?:www\.)?archiveofsins\.com",
        "test": [
            ("https://archiveofsins.com/h/thread/4668813/", {
                "url": "f612d287087e10a228ef69517cf811539db9a102",
                "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
            }),
        ],
    },
    "b4k": {
        "root": "https://arch.b4k.co",
        "remote": "simple",
        "test": [
            ("https://arch.b4k.co/meta/thread/196/", {
                "url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
            }),
        ],
    },
    "desuarchive": {
        "root": "https://desuarchive.org",
        "test": [
            ("https://desuarchive.org/a/thread/159542679/", {
                "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
            }),
        ],
    },
    "fireden": {
        "root": "https://boards.fireden.net",
        "test": [
            ("https://boards.fireden.net/a/thread/159803223/", {
                "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
            }),
        ],
    },
    "nyafuu": {
        "root": "https://archive.nyafuu.org",
        "pattern": r"(?:archive\.)?nyafuu\.org",
        "test": [
            ("https://archive.nyafuu.org/c/thread/2849220/", {
                "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
            }),
        ],
    },
    "rbt": {
        "root": "https://rbt.asia",
        "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
        "test": [
            ("https://rbt.asia/g/thread/61487650/", {
                "url": "61896d9d9a2edb556b619000a308a984307b6d30",
            }),
            ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
                "url": "61896d9d9a2edb556b619000a308a984307b6d30",
            }),
        ],
    },
    "thebarchive": {
        "root": "https://thebarchive.com",
        "pattern": r"thebarchive\.com",
        "test": [
            ("https://thebarchive.com/b/thread/739772332/", {
                "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
            }),
        ],
    },
}
def generate_extractors():
    """Dynamically generate Extractor classes for FoolFuuka instances

    The value of the ("extractor", "foolfuuka") config entry, if any, is
    merged into EXTRACTORS. For every entry that is a dict with a "root"
    string, a FoolfuukaThreadExtractor subclass is created and stored in
    this module's global namespace as "<Name>ThreadExtractor".

    Entries that are not dicts, or that lack a usable "root", are
    skipped, so such values in the config section do not raise here.
    """
    symtable = globals()
    extractors = config.get(("extractor", "foolfuuka"))

    if extractors:
        EXTRACTORS.update(extractors)

    # Names used inside the class body below carry a leading underscore:
    # a name assigned in a class body is not looked up in the enclosing
    # function, so 'root = root' would fail to find the local variable.
    for _category, info in EXTRACTORS.items():

        if not isinstance(info, dict):
            continue

        _root = info.get("root")
        if not _root or not isinstance(_root, str):
            # dict-valued entry that does not describe an instance
            continue

        # a trailing slash would leave the domain empty and produce
        # a pattern without any host part
        _root = _root.rstrip("/")
        _domain = _root.rpartition("/")[2]
        _pattern = info.get("pattern") or re.escape(_domain)
        _name = info.get("name") or _category

        class ThreadExtractor(FoolfuukaThreadExtractor):
            category = _category
            pattern = [r"(?:https?://)?{}/([^/]+)/thread/(\d+)".format(
                _pattern)]
            test = info.get("test")
            root = _root

        if info.get("remote") == "simple":
            ThreadExtractor.remote = ThreadExtractor._remote_simple

        # keep __qualname__ in sync so the class does not show up as
        # 'generate_extractors.<locals>.ThreadExtractor'
        clsname = _name.capitalize() + "ThreadExtractor"
        ThreadExtractor.__name__ = clsname
        ThreadExtractor.__qualname__ = clsname
        ThreadExtractor.__doc__ = "Extractor for threads on " + _domain
        symtable[clsname] = ThreadExtractor


generate_extractors()

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://archive.nyafuu.org/"""
from . import chan
class NyafuuThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at archive.nyafuu.org"""

    category = "nyafuu"
    root = "https://archive.nyafuu.org"

    # the "archive." subdomain is optional in accepted URLs;
    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?(?:archive\.)?nyafuu\.org"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("http://archive.nyafuu.org/c/thread/2849220/", {
            "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
        }),
    ]

View File

@ -1,27 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://rbt.asia/"""
from . import chan
class RebeccablacktechThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at rbt.asia"""

    category = "rbt"
    root = "https://rbt.asia"

    # matches rbt.asia as well as (archive.)rebeccablacktech.com;
    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?"
        r"(?:(?:archive\.)?rebeccablacktech\.com|rbt\.asia)"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        (
            "https://rbt.asia/g/thread/61487650/",
            {
                "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4",
            },
        ),
        (
            "https://archive.rebeccablacktech.com/g/thread/61487650/",
            {
                "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4",
            },
        ),
    ]

View File

@ -1,21 +0,0 @@
# -*- coding: utf-8 -*-
# Copyright 2017 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
"""Extract images from https://thebarchive.com/"""
from . import chan
class ThebarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
    """Thread extractor for the FoolFuuka archive at thebarchive.com"""

    category = "thebarchive"
    root = "https://thebarchive.com"

    # the two groups capture the board name and the thread number
    pattern = [
        r"(?:https?://)?thebarchive\.com"
        r"/([^/]+)/thread/(\d+)",
    ]
    test = [
        ("https://thebarchive.com/b/thread/739772332/", {
            "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
        }),
    ]