[foolfuuka] dynamically generate extractor classes

2019-02-03 00:40:12 +01:00 · 2019-02-03 00:40:12 +01:00 · 58a9eede38
commit 58a9eede38
parent 22d7a783d5
12 changed files with 188 additions and 274 deletions
--- a/gallery_dl/extractor/4plebs.py
+++ b/gallery_dl/extractor/4plebs.py
@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://archive.4plebs.org/"""
-
-from . import chan
-
-
-class FourplebsThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on 4plebs.org"""
-    category = "4plebs"
-    root = "https://archive.4plebs.org"
-    pattern = [r"(?:https?://)?(?:archive\.)?4plebs\.org/([^/]+)/thread/(\d+)"]
-    test = [("https://archive.4plebs.org/tg/thread/54059290", {
-        "url": "07452944164b602502b02b24521f8cee5c484d2a",
-    })]
--- a/gallery_dl/extractor/__init__.py
+++ b/gallery_dl/extractor/__init__.py
@ -13,23 +13,17 @@ modules = [
    "2chan",
    "3dbooru",
    "4chan",
-    "4plebs",
    "8chan",
-    "archivedmoe",
-    "archiveofsins",
    "artstation",
-    "b4k",
    "behance",
    "bobx",
    "danbooru",
-    "desuarchive",
    "deviantart",
    "dokireader",
    "dynastyscans",
    "e621",
    "exhentai",
    "fallenangels",
-    "fireden",
    "flickr",
    "gelbooru",
    "gfycat",
@ -66,7 +60,6 @@ modules = [
    "ngomik",
    "nhentai",
    "nijie",
-    "nyafuu",
    "paheal",
    "photobucket",
    "piczel",
@ -75,7 +68,6 @@ modules = [
    "powermanga",
    "reactor",
    "readcomiconline",
-    "rebeccablacktech",
    "reddit",
    "rule34",
    "safebooru",
@ -87,7 +79,6 @@ modules = [
    "simplyhentai",
    "slideshare",
    "smugmug",
-    "thebarchive",
    "tsumino",
    "tumblr",
    "twitter",
@ -97,6 +88,7 @@ modules = [
    "yandere",
    "xvideos",
    "yuki",
+    "foolfuuka",
    "mastodon",
    "imagehosts",
    "directlink",
--- a/gallery_dl/extractor/archivedmoe.py
+++ b/gallery_dl/extractor/archivedmoe.py
@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://archived.moe/"""
-
-from . import chan
-
-
-class ArchivedmoeThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on archived.moe"""
-    category = "archivedmoe"
-    root = "https://archived.moe"
-    pattern = [r"(?:https?://)?archived\.moe/([^/]+)/thread/(\d+)"]
-    test = [
-        ("https://archived.moe/gd/thread/309639/", {
-            "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
-            "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
-        }),
-        ("https://archived.moe/a/thread/159767162/", {
-            "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
-        }),
-    ]
--- a/gallery_dl/extractor/archiveofsins.py
+++ b/gallery_dl/extractor/archiveofsins.py
@ -1,23 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://archiveofsins.com/"""
-
-from . import chan
-
-
-class ArchiveofsinsThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on archiveofsins.com"""
-    category = "archiveofsins"
-    root = "https://archiveofsins.com"
-    pattern = [r"(?:https?://)?(?:www\.)?archiveofsins\.com"
-               r"/([^/]+)/thread/(\d+)"]
-    test = [("https://www.archiveofsins.com/h/thread/4668813/", {
-        "url": "f612d287087e10a228ef69517cf811539db9a102",
-        "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
-    })]
--- a/gallery_dl/extractor/b4k.py
+++ b/gallery_dl/extractor/b4k.py
@ -1,24 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://arch.b4k.co/"""
-
-from . import chan
-
-
-class BfourkThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on arch.b4k.co"""
-    category = "b4k"
-    root = "https://arch.b4k.co"
-    pattern = [r"(?:https?://)?arch\.b4k\.co/([^/]+)/thread/(\d+)"]
-    test = [("http://arch.b4k.co/meta/thread/196/", {
-        "url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
-    })]
-
-    def remote(self, media):
-        return media["remote_media_link"]
--- a/gallery_dl/extractor/chan.py
+++ b/gallery_dl/extractor/chan.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -8,10 +8,8 @@

 """Base classes for extractors for different Futaba Channel-like boards"""

-from .common import Extractor, SharedConfigExtractor, Message
+from .common import Extractor, Message
 from .. import text
-import itertools
-import operator


 class ChanThreadExtractor(Extractor):
@ -61,58 +59,3 @@ class ChanThreadExtractor(Extractor):
        """Return thread title from first post"""
        title = post["sub"] if "sub" in post else text.remove_html(post["com"])
        return text.unescape(title)[:50]
-
-
-class FoolfuukaThreadExtractor(SharedConfigExtractor):
-    """Base extractor for FoolFuuka based boards/archives"""
-    basecategory = "foolfuuka"
-    subcategory = "thread"
-    directory_fmt = ["{category}", "{board[shortname]}",
-                     "{thread_num}{title:? - //}"]
-    filename_fmt = "{media[media]}"
-    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
-    root = ""
-    referer = True
-
-    def __init__(self, match):
-        SharedConfigExtractor.__init__(self)
-        self.board, self.thread = match.groups()
-        if self.referer:
-            self.session.headers["Referer"] = self.root
-
-    def items(self):
-        op = True
-        yield Message.Version, 1
-        for post in self.posts():
-            if op:
-                yield Message.Directory, post
-                op = False
-            if not post["media"]:
-                continue
-
-            media = post["media"]
-            url = media["media_link"]
-
-            if not url and "remote_media_link" in media:
-                url = self.remote(media)
-            if url.startswith("/"):
-                url = self.root + url
-
-            post["extension"] = url.rpartition(".")[2]
-            yield Message.Url, url, post
-
-    def posts(self):
-        url = self.root + "/_/api/chan/thread/"
-        params = {"board": self.board, "num": self.thread}
-        data = self.request(url, params=params).json()[self.thread]
-
-        # sort post-objects by their key
-        posts = sorted(data.get("posts", {}).items())
-        posts = map(operator.itemgetter(1), posts)
-
-        return itertools.chain((data["op"],), posts)
-
-    def remote(self, media):
-        needle = '<meta http-equiv="Refresh" content="0; url='
-        page = self.request(media["remote_media_link"]).text
-        return text.extract(page, needle, '"')[0]
--- a/gallery_dl/extractor/desuarchive.py
+++ b/gallery_dl/extractor/desuarchive.py
@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://desuarchive.org/"""
-
-from . import chan
-
-
-class DesuarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on desuarchive.org"""
-    category = "desuarchive"
-    root = "https://desuarchive.org"
-    pattern = [r"(?:https?://)?desuarchive\.org/([^/]+)/thread/(\d+)"]
-    test = [("https://desuarchive.org/a/thread/159542679/", {
-        "url": "e7d624aded15a069194e38dc731ec23217a422fb",
-    })]
--- a/gallery_dl/extractor/fireden.py
+++ b/gallery_dl/extractor/fireden.py
@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://boards.fireden.net/"""
-
-from . import chan
-
-
-class FiredenThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on boards.fireden.net"""
-    category = "fireden"
-    root = "https://boards.fireden.net"
-    pattern = [r"(?:https?://)?boards\.fireden\.net/([^/]+)/thread/(\d+)"]
-    test = [("https://boards.fireden.net/a/thread/159803223/", {
-        "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
-    })]
--- a/gallery_dl/extractor/foolfuuka.py
+++ b/gallery_dl/extractor/foolfuuka.py
@ -0,0 +1,185 @@
+# -*- coding: utf-8 -*-
+
+# Copyright 2019 Mike Fährmann
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2 as
+# published by the Free Software Foundation.
+
+"""Extractors for 4chan archives based on FoolFuuka"""
+
+from .common import SharedConfigExtractor, Message
+from .. import text, config
+import itertools
+import operator
+import re
+
+
+class FoolfuukaThreadExtractor(SharedConfigExtractor):
+    """Base extractor for threads on FoolFuuka based boards/archives
+
+    The match object passed to the constructor must provide two groups:
+    the board name and the thread number, in that order.
+    """
+    basecategory = "foolfuuka"
+    subcategory = "thread"
+    directory_fmt = ["{category}", "{board[shortname]}",
+                     "{thread_num}{title:? - //}"]
+    filename_fmt = "{media[media]}"
+    archive_fmt = "{board[shortname]}_{num}_{timestamp}"
+    root = ""
+
+    def __init__(self, match):
+        SharedConfigExtractor.__init__(self)
+        self.board, self.thread = match.groups()
+        self.session.headers["Referer"] = self.root
+
+    def items(self):
+        """Yield one Directory message and an Url message per media file
+
+        The Directory message carries the first post returned by posts().
+        Posts without media, or without any usable media URL, are skipped.
+        """
+        op = True
+        yield Message.Version, 1
+        for post in self.posts():
+            if op:
+                yield Message.Directory, post
+                op = False
+            if not post["media"]:
+                continue
+
+            media = post["media"]
+            url = media["media_link"]
+
+            if not url and "remote_media_link" in media:
+                url = self.remote(media)
+            if not url:
+                # neither a direct nor a remote link is available;
+                # skip the post instead of failing on the str calls below
+                continue
+            if url.startswith("/"):
+                # relative link: make it absolute
+                url = self.root + url
+
+            post["extension"] = url.rpartition(".")[2]
+            yield Message.Url, url, post
+
+    def posts(self):
+        """Return an iterable of all post objects, starting with the OP"""
+        url = self.root + "/_/api/chan/thread/"
+        params = {"board": self.board, "num": self.thread}
+        data = self.request(url, params=params).json()[self.thread]
+
+        # sort post-objects by key
+        posts = sorted(data.get("posts", {}).items())
+        posts = map(operator.itemgetter(1), posts)
+
+        return itertools.chain((data["op"],), posts)
+
+    def remote(self, media):
+        """Return the URL from the meta refresh of the remote media page"""
+        needle = '<meta http-equiv="Refresh" content="0; url='
+        page = self.request(media["remote_media_link"]).text
+        return text.extract(page, needle, '"')[0]
+
+    def _remote_simple(self, media):
+        """Return the remote media link as is, without an extra request"""
+        return media["remote_media_link"]
+
+
+# Built-in FoolFuuka instances, keyed by extractor category.
+# Fields read by generate_extractors():
+#   "root"    - base URL of the instance
+#   "pattern" - regex for the host part of thread URLs
+#               (default: the escaped host name taken from "root")
+#   "name"    - prefix of the generated class name (default: the key)
+#   "remote"  - "simple" replaces remote() with _remote_simple()
+#   "test"    - test cases assigned to the generated class
+EXTRACTORS = {
+    "4plebs": {
+        "name": "fourplebs",
+        "root": "https://archive.4plebs.org",
+        "pattern": r"(?:archive\.)?4plebs\.org",
+        "test": [("https://archive.4plebs.org/tg/thread/54059290", {
+            "url": "07452944164b602502b02b24521f8cee5c484d2a",
+        })],
+    },
+    "archivedmoe": {
+        "root": "https://archived.moe",
+        "test": [
+            ("https://archived.moe/gd/thread/309639/", {
+                "url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
+                "content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
+            }),
+            ("https://archived.moe/a/thread/159767162/", {
+                "url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
+            }),
+        ],
+    },
+    "archiveofsins": {
+        "root": "https://archiveofsins.com",
+        "pattern": r"(?:www\.)?archiveofsins\.com",
+        "test": [("https://archiveofsins.com/h/thread/4668813/", {
+            "url": "f612d287087e10a228ef69517cf811539db9a102",
+            "content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
+        })],
+    },
+    "b4k": {
+        "root": "https://arch.b4k.co",
+        "remote": "simple",
+        "test": [("https://arch.b4k.co/meta/thread/196/", {
+            "url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
+        })],
+    },
+    "desuarchive": {
+        "root": "https://desuarchive.org",
+        "test": [("https://desuarchive.org/a/thread/159542679/", {
+            "url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
+        })],
+    },
+    "fireden": {
+        "root": "https://boards.fireden.net",
+        "test": [("https://boards.fireden.net/a/thread/159803223/", {
+            "url": "01b7baacfb0656a68e566368290e3072b27f86c9",
+        })],
+    },
+    "nyafuu": {
+        "root": "https://archive.nyafuu.org",
+        "pattern": r"(?:archive\.)?nyafuu\.org",
+        "test": [("https://archive.nyafuu.org/c/thread/2849220/", {
+            "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
+        })],
+    },
+    "rbt": {
+        "root": "https://rbt.asia",
+        "pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
+        "test": [
+            ("https://rbt.asia/g/thread/61487650/", {
+                "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+            }),
+            ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
+                "url": "61896d9d9a2edb556b619000a308a984307b6d30",
+            }),
+        ],
+    },
+    "thebarchive": {
+        "root": "https://thebarchive.com",
+        "pattern": r"thebarchive\.com",
+        "test": [("https://thebarchive.com/b/thread/739772332/", {
+            "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
+        })],
+    },
+}
+
+
+def generate_extractors():
+    """Dynamically generate Extractor classes for FoolFuuka instances
+
+    Entries found under the ("extractor", "foolfuuka") config value are
+    merged into EXTRACTORS. For every entry describing an instance, a
+    FoolfuukaThreadExtractor subclass is created and stored in this
+    module's global namespace as "<Name>ThreadExtractor".
+    """
+
+    symtable = globals()
+    extractors = config.get(("extractor", "foolfuuka"))
+
+    if extractors:
+        EXTRACTORS.update(extractors)
+
+    for _category, info in EXTRACTORS.items():
+
+        # merged config values are not guaranteed to describe an instance
+        # (plain values, dicts without a "root"); ignore those instead of
+        # raising a KeyError while this module is being imported
+        if not isinstance(info, dict) or "root" not in info:
+            continue
+
+        _root = info["root"]
+        # host part of the root URL, e.g. "archived.moe"
+        _domain = _root.rpartition("/")[2]
+        _pattern = info.get("pattern") or re.escape(_domain)
+        _name = info.get("name") or _category
+
+        # a name assigned inside a class body cannot be read from the
+        # enclosing function under that same name, hence the
+        # underscore-prefixed locals above
+        class ThreadExtractor(FoolfuukaThreadExtractor):
+            category = _category
+            pattern = [r"(?:https?://)?{}/([^/]+)/thread/(\d+)".format(
+                _pattern)]
+            test = info.get("test")
+            root = _root
+
+        if info.get("remote") == "simple":
+            ThreadExtractor.remote = ThreadExtractor._remote_simple
+
+        ThreadExtractor.__name__ = _name.capitalize() + "ThreadExtractor"
+        ThreadExtractor.__doc__ = "Extractor for threads on " + _domain
+        symtable[ThreadExtractor.__name__] = ThreadExtractor
+
+
+generate_extractors()
--- a/gallery_dl/extractor/nyafuu.py
+++ b/gallery_dl/extractor/nyafuu.py
@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://archive.nyafuu.org/"""
-
-from . import chan
-
-
-class NyafuuThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on nyafuu.org"""
-    category = "nyafuu"
-    root = "https://archive.nyafuu.org"
-    pattern = [r"(?:https?://)?(?:archive\.)?nyafuu\.org/([^/]+)/thread/(\d+)"]
-    test = [("http://archive.nyafuu.org/c/thread/2849220/", {
-        "url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
-    })]
--- a/gallery_dl/extractor/rebeccablacktech.py
+++ b/gallery_dl/extractor/rebeccablacktech.py
@ -1,27 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://rbt.asia/"""
-
-from . import chan
-
-
-class RebeccablacktechThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on rbt.asia"""
-    category = "rbt"
-    root = "https://rbt.asia"
-    pattern = [r"(?:https?://)?(?:(?:archive\.)?rebeccablacktech\.com"
-               r"|rbt\.asia)/([^/]+)/thread/(\d+)"]
-    test = [
-        ("https://rbt.asia/g/thread/61487650/", {
-            "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4",
-        }),
-        ("https://archive.rebeccablacktech.com/g/thread/61487650/", {
-            "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4",
-        }),
-    ]
--- a/gallery_dl/extractor/thebarchive.py
+++ b/gallery_dl/extractor/thebarchive.py
@ -1,21 +0,0 @@
-# -*- coding: utf-8 -*-
-
-# Copyright 2017 Mike Fährmann
-#
-# This program is free software; you can redistribute it and/or modify
-# it under the terms of the GNU General Public License version 2 as
-# published by the Free Software Foundation.
-
-"""Extract images from https://thebarchive.com/"""
-
-from . import chan
-
-
-class ThebarchiveThreadExtractor(chan.FoolfuukaThreadExtractor):
-    """Extractor for images from threads on thebarchive.com"""
-    category = "thebarchive"
-    root = "https://thebarchive.com"
-    pattern = [r"(?:https?://)?thebarchive\.com/([^/]+)/thread/(\d+)"]
-    test = [("https://thebarchive.com/b/thread/739772332/", {
-        "url": "e8b18001307d130d67db31740ce57c8561b5d80c",
-    })]