generalize extractor creation code

This commit is contained in:
Mike Fährmann 2019-03-07 22:55:26 +01:00
parent 8dc6be246b
commit 09d872a2b1
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 97 additions and 154 deletions

View File

@ -337,6 +337,48 @@ class SharedConfigMixin():
return value
def generate_extractors(extractor_data, symtable, classes):
    """Dynamically generate Extractor classes

    For every instance described in 'extractor_data', one subclass of each
    base class in 'classes' is created and stored in 'symtable' under its
    generated class name (<Name><Subcategory>Extractor).

    Arguments:
        extractor_data: dict mapping a category name to an info-dict.
            Info keys read here are "root" (required), "pattern", "name",
            "extra" and "test-<subcategory>". Values that are not dicts
            (e.g. "_ckey") are metadata and do not produce classes.
            The dict is updated in place with whatever
            config.get(("extractor", <basecategory>)) returns, if truthy.
        symtable: mapping that receives the generated classes.
        classes: sequence of base classes; each must provide 'subcategory'
            and 'pattern_fmt', and the first one 'basecategory'.

    If extractor_data contains "_ckey", every class generated after the
    first one of an instance gets an attribute of that name which refers
    to the class generated just before it for the same instance.
    """
    extractors = config.get(("extractor", classes[0].basecategory))
    ckey = extractor_data.get("_ckey")

    if extractors:
        extractor_data.update(extractors)

    for category, info in extractor_data.items():

        # Skip metadata such as "_ckey" as well as dict-valued entries
        # that do not describe an instance (no "root" URL) instead of
        # failing with a KeyError.
        if not isinstance(info, dict) or "root" not in info:
            continue

        root = info["root"]
        # drop "<scheme>://" from the root URL
        domain = root[root.index(":") + 3:]
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()
        extra = info.get("extra")

        # Start a fresh chain per instance, so that the first class of
        # this instance is never linked to a class of the previous one.
        prev = None

        for cls in classes:

            class Extr(cls):
                pass
            # make the generated class look like it was defined
            # next to its base class
            Extr.__module__ = cls.__module__
            Extr.__name__ = Extr.__qualname__ = \
                name + cls.subcategory.capitalize() + "Extractor"
            Extr.__doc__ = \
                "Extractor for " + cls.subcategory + "s from " + domain
            Extr.category = category
            Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
            Extr.test = info.get("test-" + cls.subcategory)
            Extr.root = root

            # per-instance overrides of class attributes
            if extra:
                for key, value in extra.items():
                    setattr(Extr, key, value)
            if prev and ckey:
                setattr(Extr, ckey, prev)

            symtable[Extr.__name__] = prev = Extr
# Reduce strictness of the expected magic string in cookiejar files.
# (This allows the use of Wget-generated cookiejars without modification)

View File

@ -8,11 +8,10 @@
"""Extractors for 4chan archives based on FoolFuuka"""
from .common import Extractor, Message, SharedConfigMixin
from .. import text, config
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import itertools
import operator
import re
class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
@ -23,12 +22,16 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
"{thread_num}{title:? - //}")
filename_fmt = "{media[media]}"
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
pattern_fmt = r"/([^/]+)/thread/(\d+)"
resolve = "default"
root = ""
def __init__(self, match):
Extractor.__init__(self, match)
self.board, self.thread = match.groups()
self.session.headers["Referer"] = self.root
if self.resolve == "direct":
self.remote = self._remote_direct
def items(self):
op = True
@ -52,6 +55,7 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
yield Message.Url, url, post
def posts(self):
"""Return an iterable with all posts in this thread"""
url = self.root + "/_/api/chan/thread/"
params = {"board": self.board, "num": self.thread}
data = self.request(url, params=params).json()[self.thread]
@ -63,59 +67,28 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
return itertools.chain((data["op"],), posts)
def remote(self, media):
"""Resolve a remote media link"""
needle = '<meta http-equiv="Refresh" content="0; url='
page = self.request(media["remote_media_link"]).text
return text.extract(page, needle, '"')[0]
def _remote_simple(self, media):
@staticmethod
def _remote_direct(media):
return media["remote_media_link"]
def generate_extractors():
    """Build a thread extractor class for each FoolFuuka instance

    Entries found under the ("extractor", "foolfuuka") config path are
    merged into EXTRACTORS first. Every dict-valued entry then yields a
    FoolfuukaThreadExtractor subclass, which is stored in this module's
    globals under its generated class name.
    """
    user_instances = config.get(("extractor", "foolfuuka"))
    if user_instances:
        EXTRACTORS.update(user_instances)

    module_ns = globals()
    for category, info in EXTRACTORS.items():
        if isinstance(info, dict):
            root = info["root"]
            # everything after "<scheme>://"
            scheme_end = root.index(":") + 3
            domain = root[scheme_end:]

            pattern = info.get("pattern")
            if not pattern:
                pattern = re.escape(domain)

            label = info.get("name")
            if not label:
                label = category
            clsname = label.capitalize() + "ThreadExtractor"

            class ThreadExtr(FoolfuukaThreadExtractor):
                pass
            ThreadExtr.__name__ = ThreadExtr.__qualname__ = clsname
            ThreadExtr.__doc__ = "Extractor for threads on " + domain
            ThreadExtr.category = category
            ThreadExtr.pattern = (
                r"(?:https?://)?" + pattern + r"/([^/]+)/thread/(\d+)")
            ThreadExtr.test = info.get("test")
            ThreadExtr.root = root

            # instances marked as "simple" use the alternative resolver
            if info.get("remote") == "simple":
                ThreadExtr.remote = ThreadExtr._remote_simple

            module_ns[ThreadExtr.__name__] = ThreadExtr
EXTRACTORS = {
"4plebs": {
"name": "fourplebs",
"root": "https://archive.4plebs.org",
"pattern": r"(?:archive\.)?4plebs\.org",
"test": ("https://archive.4plebs.org/tg/thread/54059290", {
"test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
"url": "07452944164b602502b02b24521f8cee5c484d2a",
}),
},
"archivedmoe": {
"root": "https://archived.moe",
"test": (
"test-thread": (
("https://archived.moe/gd/thread/309639/", {
"url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
"content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
@ -128,41 +101,41 @@ EXTRACTORS = {
"archiveofsins": {
"root": "https://archiveofsins.com",
"pattern": r"(?:www\.)?archiveofsins\.com",
"test": ("https://archiveofsins.com/h/thread/4668813/", {
"test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
"url": "f612d287087e10a228ef69517cf811539db9a102",
"content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
}),
},
"b4k": {
"root": "https://arch.b4k.co",
"remote": "simple",
"test": ("https://arch.b4k.co/meta/thread/196/", {
"extra": {"resolve": "direct"},
"test-thread": ("https://arch.b4k.co/meta/thread/196/", {
"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
}),
},
"desuarchive": {
"root": "https://desuarchive.org",
"test": ("https://desuarchive.org/a/thread/159542679/", {
"test-thread": ("https://desuarchive.org/a/thread/159542679/", {
"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
}),
},
"fireden": {
"root": "https://boards.fireden.net",
"test": ("https://boards.fireden.net/a/thread/159803223/", {
"test-thread": ("https://boards.fireden.net/a/thread/159803223/", {
"url": "01b7baacfb0656a68e566368290e3072b27f86c9",
}),
},
"nyafuu": {
"root": "https://archive.nyafuu.org",
"pattern": r"(?:archive\.)?nyafuu\.org",
"test": ("https://archive.nyafuu.org/c/thread/2849220/", {
"test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
}),
},
"rbt": {
"root": "https://rbt.asia",
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
"test": (
"test-thread": (
("https://rbt.asia/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
@ -174,11 +147,12 @@ EXTRACTORS = {
"thebarchive": {
"root": "https://thebarchive.com",
"pattern": r"thebarchive\.com",
"test": ("https://thebarchive.com/b/thread/739772332/", {
"test-thread": ("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
}),
},
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
FoolfuukaThreadExtractor,
))

View File

@ -9,11 +9,16 @@
"""Extractors for FoOlSlide based sites"""
from .common import (
Extractor, ChapterExtractor, MangaExtractor, Message, SharedConfigMixin)
from .. import text, util, config
Extractor,
ChapterExtractor,
MangaExtractor,
SharedConfigMixin,
Message,
generate_extractors,
)
from .. import text, util
import base64
import json
import re
class FoolslideBase(SharedConfigMixin):
@ -41,6 +46,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
directory_fmt = (
"{category}", "{manga}", "{chapter_string}")
archive_fmt = "{id}"
pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
decode = "default"
def items(self):
@ -92,6 +98,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
"""Base class for manga extractors for FoOlSlide based sites"""
pattern_fmt = r"(/series/[^/?&#]+)"
def chapters(self, page):
manga , pos = text.extract(page, '<h1 class="title">', '</h1>')
@ -116,52 +123,6 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
})))
def generate_extractors():
    """Build chapter and manga extractor classes per FoOlSlide instance

    Entries found under the ("extractor", "foolslide") config path are
    merged into EXTRACTORS first. Every dict-valued entry then yields one
    FoolslideChapterExtractor subclass and one FoolslideMangaExtractor
    subclass; both are stored in this module's globals under their
    generated class names.
    """
    user_instances = config.get(("extractor", "foolslide"))
    if user_instances:
        EXTRACTORS.update(user_instances)

    module_ns = globals()
    for category, info in EXTRACTORS.items():
        if isinstance(info, dict):
            root = info["root"]
            # everything after "<scheme>://"
            scheme_end = root.index(":") + 3
            domain = root[scheme_end:]

            pattern = info.get("pattern")
            if not pattern:
                pattern = re.escape(domain)

            label = info.get("name")
            if not label:
                label = category
            label = label.capitalize()

            # --- chapter extractor ---
            class Chapter(FoolslideChapterExtractor):
                pass
            Chapter.__name__ = Chapter.__qualname__ = \
                label + "ChapterExtractor"
            Chapter.__doc__ = "Extractor for manga-chapters from " + domain
            Chapter.category = category
            Chapter.pattern = (
                r"(?:https?://)?" + pattern +
                r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
            Chapter.test = info.get("test-chapter")
            Chapter.root = root
            if "decode" in info:
                Chapter.decode = info["decode"]
            module_ns[Chapter.__name__] = Chapter

            # --- manga extractor ---
            class Manga(FoolslideMangaExtractor):
                pass
            Manga.__name__ = Manga.__qualname__ = label + "MangaExtractor"
            Manga.__doc__ = "Extractor for manga from " + domain
            Manga.category = category
            Manga.pattern = (
                r"(?:https?://)?" + pattern + r"(/series/[^/?&#]+)")
            Manga.test = info.get("test-manga")
            Manga.root = root
            # refer to the chapter class generated for the same instance
            Manga.chapterclass = Chapter
            module_ns[Manga.__name__] = Manga
EXTRACTORS = {
"dokireader": {
"root": "https://kobato.hologfx.com/reader",
@ -180,7 +141,7 @@ EXTRACTORS = {
"jaiminisbox": {
"root": "https://jaiminisbox.com/reader",
"pattern": r"(?:www\.)?jaiminisbox\.com/reader",
"decode": "base64",
"extra": {"decode": "base64"},
"test-chapter": (
("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", {
"keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673",
@ -290,7 +251,10 @@ EXTRACTORS = {
"keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120",
}),
},
"_ckey": "chapterclass",
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
FoolslideChapterExtractor,
FoolslideMangaExtractor,
))

View File

@ -8,8 +8,8 @@
"""Extractors for Shopify instances"""
from .common import Extractor, Message, SharedConfigMixin
from .. import text, config
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
from .. import text
import time
import re
@ -63,13 +63,13 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
def products(self):
"""Return an iterable with all relevant product URLs"""
return ()
class ShopifyCollectionExtractor(ShopifyExtractor):
"""Base class for collection extractors for Shopify based sites"""
subcategory = "collection"
directory_fmt = ("{category}", "{collection[title]}")
pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
def __init__(self, match):
ShopifyExtractor.__init__(self, match)
@ -98,58 +98,23 @@ class ShopifyProductExtractor(ShopifyExtractor):
"""Base class for product extractors for Shopify based sites"""
subcategory = "product"
directory_fmt = ("{category}", "Products")
pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
def products(self):
return (self.item_url,)
def generate_extractors():
    """Build collection and product extractor classes per Shopify instance

    Entries found under the ("extractor", "shopify") config path are
    merged into EXTRACTORS first. Every dict-valued entry then yields one
    ShopifyCollectionExtractor subclass and one ShopifyProductExtractor
    subclass, in that order; both are stored in this module's globals
    under their generated class names.
    """
    user_instances = config.get(("extractor", "shopify"))
    if user_instances:
        EXTRACTORS.update(user_instances)

    module_ns = globals()
    for category, info in EXTRACTORS.items():
        if isinstance(info, dict):
            root = info["root"]
            # everything after "<scheme>://"
            scheme_end = root.index(":") + 3
            domain = root[scheme_end:]

            pattern = info.get("pattern")
            if not pattern:
                pattern = re.escape(domain)

            label = info.get("name")
            if not label:
                label = category
            label = label.capitalize()

            # (base class, class name suffix, docstring prefix,
            #  URL path pattern, key of the test data)
            variants = (
                (ShopifyCollectionExtractor,
                 "CollectionExtractor",
                 "Extractor for product collections from ",
                 r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)",
                 "test-collection"),
                (ShopifyProductExtractor,
                 "ProductExtractor",
                 "Extractor for individual products from ",
                 r"((?:/collections/[\w-]+)?/products/[\w-]+)",
                 "test-product"),
            )

            for base, suffix, doc, path, test_key in variants:
                class Extr(base):
                    pass
                Extr.__name__ = Extr.__qualname__ = label + suffix
                Extr.__doc__ = doc + domain
                Extr.category = category
                Extr.pattern = r"(?:https?://)?" + pattern + path
                Extr.test = info.get(test_key)
                Extr.root = root
                module_ns[Extr.__name__] = Extr
EXTRACTORS = {
"fashionnova": {
"root": "https://www.fashionnova.com",
"pattern": r"(?:www\.)?fashionnova\.com",
"test-product": (
("https://www.fashionnova.com/products/essential-slide-red", {
"pattern": r"https?://cdn\.shopify.com/",
"count": 3,
}),
("https://www.fashionnova.com/collections/flats/products/name"),
),
"test-collection": (
("https://www.fashionnova.com/collections/mini-dresses", {
"range": "1-20",
@ -158,13 +123,11 @@ EXTRACTORS = {
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
),
"test-product": (
("https://www.fashionnova.com/products"
"/only-here-tonight-cut-out-dress-black"),
("https://www.fashionnova.com/collections/mini-dresses/products"
"/only-here-tonight-cut-out-dress-black"),
)
},
}
generate_extractors()
generate_extractors(EXTRACTORS, globals(), (
ShopifyProductExtractor,
ShopifyCollectionExtractor,
))