generalize extractor creation code
This commit is contained in:
parent
8dc6be246b
commit
09d872a2b1
@ -337,6 +337,48 @@ class SharedConfigMixin():
|
||||
return value
|
||||
|
||||
|
||||
def generate_extractors(extractor_data, symtable, classes):
    """Dynamically generate Extractor classes.

    For every category entry in 'extractor_data' (optionally extended by
    user configuration found under ("extractor", <basecategory>)), build
    one subclass of each base class in 'classes' and register it in
    'symtable' (usually the calling module's globals()).

    extractor_data: mapping of category name -> info dict; recognized
        info keys are "root" (required), "pattern", "name",
        "test-<subcategory>", and "extra" (attribute overrides).
        A special top-level "_ckey" entry names an attribute used to
        link the generated classes of one category to each other.
    symtable: dict receiving the generated classes, keyed by class name
    classes : sequence of Extractor base classes; each must provide
        'basecategory', 'subcategory', and 'pattern_fmt'
    """
    extractors = config.get(("extractor", classes[0].basecategory))
    ckey = extractor_data.get("_ckey")

    if extractors:
        extractor_data.update(extractors)

    for category, info in extractor_data.items():

        # skip non-category entries such as "_ckey"
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]  # strip "<scheme>://"
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()

        # Reset for every category: previously 'prev' carried over, so
        # the first class of a category got linked (via ckey) to the
        # last class of the *previous* category.
        prev = None

        for cls in classes:

            class Extr(cls):
                pass
            Extr.__module__ = cls.__module__
            Extr.__name__ = Extr.__qualname__ = \
                name + cls.subcategory.capitalize() + "Extractor"
            Extr.__doc__ = \
                "Extractor for " + cls.subcategory + "s from " + domain
            Extr.category = category
            Extr.pattern = r"(?:https?://)?" + pattern + cls.pattern_fmt
            Extr.test = info.get("test-" + cls.subcategory)
            Extr.root = root

            # apply per-category attribute overrides
            if "extra" in info:
                for key, value in info["extra"].items():
                    setattr(Extr, key, value)
            # link this class to the previously generated one,
            # e.g. MangaExtractor.chapterclass = ChapterExtractor
            if prev and ckey:
                setattr(Extr, ckey, prev)

            symtable[Extr.__name__] = prev = Extr
|
||||
|
||||
|
||||
# Reduce strictness of the expected magic string in cookiejar files.
|
||||
# (This allows the use of Wget-generated cookiejars without modification)
|
||||
|
||||
|
@ -8,11 +8,10 @@
|
||||
|
||||
"""Extractors for 4chan archives based on FoolFuuka"""
|
||||
|
||||
from .common import Extractor, Message, SharedConfigMixin
|
||||
from .. import text, config
|
||||
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
|
||||
from .. import text
|
||||
import itertools
|
||||
import operator
|
||||
import re
|
||||
|
||||
|
||||
class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
|
||||
@ -23,12 +22,16 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
|
||||
"{thread_num}{title:? - //}")
|
||||
filename_fmt = "{media[media]}"
|
||||
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
|
||||
pattern_fmt = r"/([^/]+)/thread/(\d+)"
|
||||
resolve = "default"
|
||||
root = ""
|
||||
|
||||
def __init__(self, match):
|
||||
Extractor.__init__(self, match)
|
||||
self.board, self.thread = match.groups()
|
||||
self.session.headers["Referer"] = self.root
|
||||
if self.resolve == "direct":
|
||||
self.remote = self._remote_direct
|
||||
|
||||
def items(self):
|
||||
op = True
|
||||
@ -52,6 +55,7 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
|
||||
yield Message.Url, url, post
|
||||
|
||||
def posts(self):
|
||||
"""Return an iterable with all posts in this thread"""
|
||||
url = self.root + "/_/api/chan/thread/"
|
||||
params = {"board": self.board, "num": self.thread}
|
||||
data = self.request(url, params=params).json()[self.thread]
|
||||
@ -63,59 +67,28 @@ class FoolfuukaThreadExtractor(SharedConfigMixin, Extractor):
|
||||
return itertools.chain((data["op"],), posts)
|
||||
|
||||
def remote(self, media):
|
||||
"""Resolve a remote media link"""
|
||||
needle = '<meta http-equiv="Refresh" content="0; url='
|
||||
page = self.request(media["remote_media_link"]).text
|
||||
return text.extract(page, needle, '"')[0]
|
||||
|
||||
def _remote_simple(self, media):
|
||||
@staticmethod
|
||||
def _remote_direct(media):
|
||||
return media["remote_media_link"]
|
||||
|
||||
|
||||
def generate_extractors():
    """Dynamically generate Extractor classes for FoolFuuka instances.

    Iterates over the module-level EXTRACTORS mapping (optionally
    extended by user configuration under ("extractor", "foolfuuka")) and
    registers one FoolfuukaThreadExtractor subclass per instance in this
    module's global namespace.
    """
    symtable = globals()
    extractors = config.get(("extractor", "foolfuuka"))

    # merge user-configured instances into the built-in table
    if extractors:
        EXTRACTORS.update(extractors)

    for category, info in EXTRACTORS.items():

        # skip non-instance entries
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]  # strip "<scheme>://"
        # fall back to the literal domain when no URL pattern is given
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()

        class Extr(FoolfuukaThreadExtractor):
            pass

        Extr.__name__ = Extr.__qualname__ = name + "ThreadExtractor"
        Extr.__doc__ = "Extractor for threads on " + domain
        Extr.category = category
        Extr.pattern = r"(?:https?://)?" + pattern + r"/([^/]+)/thread/(\d+)"
        Extr.test = info.get("test")
        Extr.root = root
        # some instances need the simpler remote-media resolver
        if info.get("remote") == "simple":
            Extr.remote = Extr._remote_simple
        symtable[Extr.__name__] = Extr
|
||||
|
||||
|
||||
EXTRACTORS = {
|
||||
"4plebs": {
|
||||
"name": "fourplebs",
|
||||
"root": "https://archive.4plebs.org",
|
||||
"pattern": r"(?:archive\.)?4plebs\.org",
|
||||
"test": ("https://archive.4plebs.org/tg/thread/54059290", {
|
||||
"test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
|
||||
"url": "07452944164b602502b02b24521f8cee5c484d2a",
|
||||
}),
|
||||
},
|
||||
"archivedmoe": {
|
||||
"root": "https://archived.moe",
|
||||
"test": (
|
||||
"test-thread": (
|
||||
("https://archived.moe/gd/thread/309639/", {
|
||||
"url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
|
||||
"content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
|
||||
@ -128,41 +101,41 @@ EXTRACTORS = {
|
||||
"archiveofsins": {
|
||||
"root": "https://archiveofsins.com",
|
||||
"pattern": r"(?:www\.)?archiveofsins\.com",
|
||||
"test": ("https://archiveofsins.com/h/thread/4668813/", {
|
||||
"test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
|
||||
"url": "f612d287087e10a228ef69517cf811539db9a102",
|
||||
"content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
|
||||
}),
|
||||
},
|
||||
"b4k": {
|
||||
"root": "https://arch.b4k.co",
|
||||
"remote": "simple",
|
||||
"test": ("https://arch.b4k.co/meta/thread/196/", {
|
||||
"extra": {"resolve": "direct"},
|
||||
"test-thread": ("https://arch.b4k.co/meta/thread/196/", {
|
||||
"url": "cdd4931ac1cd00264b0b54e2e3b0d8f6ae48957e",
|
||||
}),
|
||||
},
|
||||
"desuarchive": {
|
||||
"root": "https://desuarchive.org",
|
||||
"test": ("https://desuarchive.org/a/thread/159542679/", {
|
||||
"test-thread": ("https://desuarchive.org/a/thread/159542679/", {
|
||||
"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
|
||||
}),
|
||||
},
|
||||
"fireden": {
|
||||
"root": "https://boards.fireden.net",
|
||||
"test": ("https://boards.fireden.net/a/thread/159803223/", {
|
||||
"test-thread": ("https://boards.fireden.net/a/thread/159803223/", {
|
||||
"url": "01b7baacfb0656a68e566368290e3072b27f86c9",
|
||||
}),
|
||||
},
|
||||
"nyafuu": {
|
||||
"root": "https://archive.nyafuu.org",
|
||||
"pattern": r"(?:archive\.)?nyafuu\.org",
|
||||
"test": ("https://archive.nyafuu.org/c/thread/2849220/", {
|
||||
"test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
|
||||
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
|
||||
}),
|
||||
},
|
||||
"rbt": {
|
||||
"root": "https://rbt.asia",
|
||||
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
|
||||
"test": (
|
||||
"test-thread": (
|
||||
("https://rbt.asia/g/thread/61487650/", {
|
||||
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
|
||||
}),
|
||||
@ -174,11 +147,12 @@ EXTRACTORS = {
|
||||
"thebarchive": {
|
||||
"root": "https://thebarchive.com",
|
||||
"pattern": r"thebarchive\.com",
|
||||
"test": ("https://thebarchive.com/b/thread/739772332/", {
|
||||
"test-thread": ("https://thebarchive.com/b/thread/739772332/", {
|
||||
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
|
||||
}),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
generate_extractors()
|
||||
generate_extractors(EXTRACTORS, globals(), (
|
||||
FoolfuukaThreadExtractor,
|
||||
))
|
||||
|
@ -9,11 +9,16 @@
|
||||
"""Extractors for FoOlSlide based sites"""
|
||||
|
||||
from .common import (
|
||||
Extractor, ChapterExtractor, MangaExtractor, Message, SharedConfigMixin)
|
||||
from .. import text, util, config
|
||||
Extractor,
|
||||
ChapterExtractor,
|
||||
MangaExtractor,
|
||||
SharedConfigMixin,
|
||||
Message,
|
||||
generate_extractors,
|
||||
)
|
||||
from .. import text, util
|
||||
import base64
|
||||
import json
|
||||
import re
|
||||
|
||||
|
||||
class FoolslideBase(SharedConfigMixin):
|
||||
@ -41,6 +46,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
|
||||
directory_fmt = (
|
||||
"{category}", "{manga}", "{chapter_string}")
|
||||
archive_fmt = "{id}"
|
||||
pattern_fmt = r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)"
|
||||
decode = "default"
|
||||
|
||||
def items(self):
|
||||
@ -92,6 +98,7 @@ class FoolslideChapterExtractor(FoolslideBase, ChapterExtractor):
|
||||
|
||||
class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
|
||||
"""Base class for manga extractors for FoOlSlide based sites"""
|
||||
pattern_fmt = r"(/series/[^/?&#]+)"
|
||||
|
||||
def chapters(self, page):
|
||||
manga , pos = text.extract(page, '<h1 class="title">', '</h1>')
|
||||
@ -116,52 +123,6 @@ class FoolslideMangaExtractor(FoolslideBase, MangaExtractor):
|
||||
})))
|
||||
|
||||
|
||||
def generate_extractors():
    """Dynamically generate Extractor classes for FoOlSlide instances.

    Iterates over the module-level EXTRACTORS mapping (optionally
    extended by user configuration under ("extractor", "foolslide")) and
    registers one chapter- and one manga-extractor subclass per instance
    in this module's global namespace.
    """

    symtable = globals()
    extractors = config.get(("extractor", "foolslide"))

    # merge user-configured instances into the built-in table
    if extractors:
        EXTRACTORS.update(extractors)

    for category, info in EXTRACTORS.items():

        # skip non-instance entries
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]  # strip "<scheme>://"
        # fall back to the literal domain when no URL pattern is given
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()

        class ChExtr(FoolslideChapterExtractor):
            pass

        ChExtr.__name__ = ChExtr.__qualname__ = name + "ChapterExtractor"
        ChExtr.__doc__ = "Extractor for manga-chapters from " + domain
        ChExtr.category = category
        ChExtr.pattern = (r"(?:https?://)?" + pattern +
                          r"(/read/[^/?&#]+/[a-z-]+/\d+/\d+(?:/\d+)?)")
        ChExtr.test = info.get("test-chapter")
        ChExtr.root = root
        # per-instance override for image-data decoding
        if "decode" in info:
            ChExtr.decode = info["decode"]
        symtable[ChExtr.__name__] = ChExtr

        class MaExtr(FoolslideMangaExtractor):
            pass

        MaExtr.__name__ = MaExtr.__qualname__ = name + "MangaExtractor"
        MaExtr.__doc__ = "Extractor for manga from " + domain
        MaExtr.category = category
        MaExtr.pattern = r"(?:https?://)?" + pattern + r"(/series/[^/?&#]+)"
        MaExtr.test = info.get("test-manga")
        MaExtr.root = root
        # link the manga extractor to its chapter counterpart
        MaExtr.chapterclass = ChExtr
        symtable[MaExtr.__name__] = MaExtr
|
||||
|
||||
|
||||
EXTRACTORS = {
|
||||
"dokireader": {
|
||||
"root": "https://kobato.hologfx.com/reader",
|
||||
@ -180,7 +141,7 @@ EXTRACTORS = {
|
||||
"jaiminisbox": {
|
||||
"root": "https://jaiminisbox.com/reader",
|
||||
"pattern": r"(?:www\.)?jaiminisbox\.com/reader",
|
||||
"decode": "base64",
|
||||
"extra": {"decode": "base64"},
|
||||
"test-chapter": (
|
||||
("https://jaiminisbox.com/reader/read/uratarou/en/0/1/", {
|
||||
"keyword": "6009af77cc9c05528ab1fdda47b1ad9d4811c673",
|
||||
@ -290,7 +251,10 @@ EXTRACTORS = {
|
||||
"keyword": "3a24f1088b4d7f3b798a96163f21ca251293a120",
|
||||
}),
|
||||
},
|
||||
"_ckey": "chapterclass",
|
||||
}
|
||||
|
||||
|
||||
generate_extractors()
|
||||
generate_extractors(EXTRACTORS, globals(), (
|
||||
FoolslideChapterExtractor,
|
||||
FoolslideMangaExtractor,
|
||||
))
|
||||
|
@ -8,8 +8,8 @@
|
||||
|
||||
"""Extractors for Shopify instances"""
|
||||
|
||||
from .common import Extractor, Message, SharedConfigMixin
|
||||
from .. import text, config
|
||||
from .common import Extractor, Message, SharedConfigMixin, generate_extractors
|
||||
from .. import text
|
||||
import time
|
||||
import re
|
||||
|
||||
@ -63,13 +63,13 @@ class ShopifyExtractor(SharedConfigMixin, Extractor):
|
||||
|
||||
def products(self):
|
||||
"""Return an iterable with all relevant product URLs"""
|
||||
return ()
|
||||
|
||||
|
||||
class ShopifyCollectionExtractor(ShopifyExtractor):
|
||||
"""Base class for collection extractors for Shopify based sites"""
|
||||
subcategory = "collection"
|
||||
directory_fmt = ("{category}", "{collection[title]}")
|
||||
pattern_fmt = r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)"
|
||||
|
||||
def __init__(self, match):
|
||||
ShopifyExtractor.__init__(self, match)
|
||||
@ -98,58 +98,23 @@ class ShopifyProductExtractor(ShopifyExtractor):
|
||||
"""Base class for product extractors for Shopify based sites"""
|
||||
subcategory = "product"
|
||||
directory_fmt = ("{category}", "Products")
|
||||
pattern_fmt = r"((?:/collections/[\w-]+)?/products/[\w-]+)"
|
||||
|
||||
def products(self):
|
||||
return (self.item_url,)
|
||||
|
||||
|
||||
def generate_extractors():
    """Dynamically generate Extractor classes for Shopify instances.

    Iterates over the module-level EXTRACTORS mapping (optionally
    extended by user configuration under ("extractor", "shopify")) and
    registers one collection- and one product-extractor subclass per
    instance in this module's global namespace.
    """
    symtable = globals()
    extractors = config.get(("extractor", "shopify"))

    # merge user-configured instances into the built-in table
    if extractors:
        EXTRACTORS.update(extractors)

    for category, info in EXTRACTORS.items():

        # skip non-instance entries
        if not isinstance(info, dict):
            continue

        root = info["root"]
        domain = root[root.index(":") + 3:]  # strip "<scheme>://"
        # fall back to the literal domain when no URL pattern is given
        pattern = info.get("pattern") or re.escape(domain)
        name = (info.get("name") or category).capitalize()

        class CoExtr(ShopifyCollectionExtractor):
            pass

        CoExtr.__name__ = CoExtr.__qualname__ = name + "CollectionExtractor"
        CoExtr.__doc__ = "Extractor for product collections from " + domain
        CoExtr.category = category
        CoExtr.pattern = (r"(?:https?://)?" + pattern +
                          r"(/collections/[\w-]+)/?(?:\?([^#]+))?(?:$|#)")
        CoExtr.test = info.get("test-collection")
        CoExtr.root = root
        symtable[CoExtr.__name__] = CoExtr

        class PrExtr(ShopifyProductExtractor):
            pass

        PrExtr.__name__ = PrExtr.__qualname__ = name + "ProductExtractor"
        PrExtr.__doc__ = "Extractor for individual products from " + domain
        PrExtr.category = category
        PrExtr.pattern = (r"(?:https?://)?" + pattern +
                          r"((?:/collections/[\w-]+)?/products/[\w-]+)")
        PrExtr.test = info.get("test-product")
        PrExtr.root = root
        symtable[PrExtr.__name__] = PrExtr
|
||||
|
||||
|
||||
EXTRACTORS = {
|
||||
"fashionnova": {
|
||||
"root": "https://www.fashionnova.com",
|
||||
"pattern": r"(?:www\.)?fashionnova\.com",
|
||||
"test-product": (
|
||||
("https://www.fashionnova.com/products/essential-slide-red", {
|
||||
"pattern": r"https?://cdn\.shopify.com/",
|
||||
"count": 3,
|
||||
}),
|
||||
("https://www.fashionnova.com/collections/flats/products/name"),
|
||||
),
|
||||
"test-collection": (
|
||||
("https://www.fashionnova.com/collections/mini-dresses", {
|
||||
"range": "1-20",
|
||||
@ -158,13 +123,11 @@ EXTRACTORS = {
|
||||
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
|
||||
("https://www.fashionnova.com/collections/mini-dresses#1"),
|
||||
),
|
||||
"test-product": (
|
||||
("https://www.fashionnova.com/products"
|
||||
"/only-here-tonight-cut-out-dress-black"),
|
||||
("https://www.fashionnova.com/collections/mini-dresses/products"
|
||||
"/only-here-tonight-cut-out-dress-black"),
|
||||
)
|
||||
|
||||
},
|
||||
}
|
||||
|
||||
generate_extractors()
|
||||
generate_extractors(EXTRACTORS, globals(), (
|
||||
ShopifyProductExtractor,
|
||||
ShopifyCollectionExtractor,
|
||||
))
|
||||
|
Loading…
x
Reference in New Issue
Block a user