[foolfuuka] use BaseExtractor

This commit is contained in:
Mike Fährmann 2021-01-26 03:49:34 +01:00
parent 745a114c61
commit 671a95cae5
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@ -8,21 +8,21 @@
"""Extractors for 4chan archives based on FoolFuuka"""
from .common import Extractor, Message, generate_extractors
from .common import BaseExtractor, Message
from .. import text
import itertools
class FoolfuukaExtractor(Extractor):
class FoolfuukaExtractor(BaseExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
basecategory = "foolfuuka"
archive_fmt = "{board[shortname]}_{num}_{timestamp}"
external = "default"
def __init__(self, match):
Extractor.__init__(self, match)
BaseExtractor.__init__(self, match)
self.session.headers["Referer"] = self.root
if self.external == "direct":
if self.category == "b4k":
self.remote = self._remote_direct
def items(self):
@ -43,7 +43,7 @@ class FoolfuukaExtractor(Extractor):
yield Message.Url, url, post
def metadata(self):
""" """
"""Return general metadata"""
def posts(self):
"""Return an iterable with all relevant posts"""
@ -59,16 +59,90 @@ class FoolfuukaExtractor(Extractor):
return media["remote_media_link"]
# Table of supported FoolFuuka sites, keyed by site name.  The value returned
# by FoolfuukaExtractor.update() is stored as BASE_PATTERN and is prepended
# to the path regex of the thread, board and search extractors.
#
# Per-site keys:
#   "root"    - base URL of the site
#   "pattern" - (optional) regex for the host names accepted for the site
#
# NOTE(review): entries without "pattern" presumably get a host pattern
# derived from "root" inside BaseExtractor.update() - confirm in .common.
BASE_PATTERN = FoolfuukaExtractor.update({
"4plebs": {
"root": "https://archive.4plebs.org",
"pattern": r"(?:archive\.)?4plebs\.org",
},
"archivedmoe": {
"root": "https://archived.moe",
},
"archiveofsins": {
"root": "https://archiveofsins.com",
"pattern": r"(?:www\.)?archiveofsins\.com",
},
# "b4k" is the one site name that FoolfuukaExtractor.__init__ special-cases
# (it switches self.remote to self._remote_direct).
"b4k": {
"root": "https://arch.b4k.co",
},
"desuarchive": {
"root": "https://desuarchive.org",
},
"fireden": {
"root": "https://boards.fireden.net",
},
"nyafuu": {
"root": "https://archive.nyafuu.org",
"pattern": r"(?:archive\.)?nyafuu\.org",
},
# Two domains are accepted for this site: rbt.asia and (archive.)rebeccablacktech.com.
"rbt": {
"root": "https://rbt.asia",
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
},
"thebarchive": {
"root": "https://thebarchive.com",
"pattern": r"thebarchive\.com",
},
})
class FoolfuukaThreadExtractor(FoolfuukaExtractor):
"""Base extractor for threads on FoolFuuka based boards/archives"""
subcategory = "thread"
directory_fmt = ("{category}", "{board[shortname]}",
"{thread_num}{title:? - //}")
pattern_fmt = r"/([^/?#]+)/thread/(\d+)"
pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)"
test = (
("https://archive.4plebs.org/tg/thread/54059290", {
"url": "07452944164b602502b02b24521f8cee5c484d2a",
}),
("https://archived.moe/gd/thread/309639/", {
"url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
"content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
}),
("https://archived.moe/a/thread/159767162/", {
"url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
}),
("https://archiveofsins.com/h/thread/4668813/", {
"url": "f612d287087e10a228ef69517cf811539db9a102",
"content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
}),
("https://arch.b4k.co/meta/thread/196/", {
"url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
}),
("https://desuarchive.org/a/thread/159542679/", {
"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
}),
("https://boards.fireden.net/sci/thread/11264294/", {
"url": "3adfe181ee86a8c23021c705f623b3657a9b0a43",
}),
("https://archive.nyafuu.org/c/thread/2849220/", {
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
}),
("https://rbt.asia/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
("https://archive.rebeccablacktech.com/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
}),
)
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
self.board, self.thread = match.groups()
self.board = match.group(match.lastindex-1)
self.thread = match.group(match.lastindex)
self.data = None
def metadata(self):
@ -78,23 +152,34 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
return self.data["op"]
def posts(self):
op = (self.data["op"],)
posts = self.data.get("posts")
if posts:
posts = list(posts.values())
posts.sort(key=lambda p: p["timestamp"])
else:
posts = ()
return itertools.chain((self.data["op"],), posts)
return itertools.chain(op, posts)
return op
class FoolfuukaBoardExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka based boards/archives"""
subcategory = "board"
pattern_fmt = r"/([^/?#]+)/\d*$"
pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$"
test = (
("https://archive.4plebs.org/tg/"),
("https://archived.moe/gd/"),
("https://archiveofsins.com/h/"),
("https://arch.b4k.co/meta/"),
("https://desuarchive.org/a/"),
("https://boards.fireden.net/sci/"),
("https://archive.nyafuu.org/c/"),
("https://rbt.asia/g/"),
("https://thebarchive.com/b/"),
)
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
self.board = match.group(1)
self.board = match.group(match.lastindex)
def items(self):
index_base = "{}/_/api/chan/index/?board={}&page=".format(
@ -113,7 +198,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
for num, thread in threads.items():
thread["url"] = thread_base + format(num)
thread["_extractor"] = self.childclass
thread["_extractor"] = FoolfuukaThreadExtractor
yield Message.Queue, thread["url"], thread
@ -121,15 +206,24 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
"""Base extractor for search results on FoolFuuka based boards/archives"""
subcategory = "search"
directory_fmt = ("{category}", "search", "{search}")
pattern_fmt = r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
pattern = BASE_PATTERN + r"/([^/?#]+)/search((?:/[^/?#]+/[^/?#]+)+)"
request_interval = 1.0
# Search test URLs: one per supported site, listed in the same site order
# as the thread and board test lists.  The fourth entry is the b4k search
# URL; it previously repeated the archiveofsins URL, which left b4k search
# without a test.
test = (
("https://archive.4plebs.org/_/search/text/test/"),
("https://archived.moe/_/search/text/test/"),
("https://archiveofsins.com/_/search/text/test/"),
("https://arch.b4k.co/_/search/text/test/"),
("https://desuarchive.org/_/search/text/test/"),
("https://boards.fireden.net/_/search/text/test/"),
("https://archive.nyafuu.org/_/search/text/test/"),
("https://rbt.asia/_/search/text/test/"),
("https://thebarchive.com/_/search/text/test/"),
)
def __init__(self, match):
FoolfuukaExtractor.__init__(self, match)
board, search = match.groups()
self.params = params = {}
args = search.split("/")
args = match.group(match.lastindex).split("/")
key = None
for arg in args:
@ -138,6 +232,8 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
key = None
else:
key = arg
board = match.group(match.lastindex-1)
if board != "_":
params["boards"] = board
@ -170,105 +266,3 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
if len(posts) <= 3:
return
params["page"] += 1
EXTRACTORS = {
"4plebs": {
"name": "_4plebs",
"root": "https://archive.4plebs.org",
"pattern": r"(?:archive\.)?4plebs\.org",
"test-thread": ("https://archive.4plebs.org/tg/thread/54059290", {
"url": "07452944164b602502b02b24521f8cee5c484d2a",
}),
"test-board": ("https://archive.4plebs.org/tg/",),
"test-search": ("https://archive.4plebs.org/_/search/text/test/",),
},
"archivedmoe": {
"root": "https://archived.moe",
"test-thread": (
("https://archived.moe/gd/thread/309639/", {
"url": "fdd533840e2d535abd162c02d6dfadbc12e2dcd8",
"content": "c27e2a7be3bc989b5dd859f7789cc854db3f5573",
}),
("https://archived.moe/a/thread/159767162/", {
"url": "ffec05a1a1b906b5ca85992513671c9155ee9e87",
}),
),
"test-board": ("https://archived.moe/gd/",),
"test-search": ("https://archived.moe/_/search/text/test/",),
},
"archiveofsins": {
"root": "https://archiveofsins.com",
"pattern": r"(?:www\.)?archiveofsins\.com",
"test-thread": ("https://archiveofsins.com/h/thread/4668813/", {
"url": "f612d287087e10a228ef69517cf811539db9a102",
"content": "0dd92d0d8a7bf6e2f7d1f5ac8954c1bcf18c22a4",
}),
"test-board": ("https://archiveofsins.com/h/",),
"test-search": ("https://archiveofsins.com/_/search/text/test/",),
},
"b4k": {
"root": "https://arch.b4k.co",
"extra": {"external": "direct"},
"test-thread": ("https://arch.b4k.co/meta/thread/196/", {
"url": "d309713d2f838797096b3e9cb44fe514a9c9d07a",
}),
"test-board": ("https://arch.b4k.co/meta/",),
"test-search": ("https://arch.b4k.co/_/search/text/test/",),
},
"desuarchive": {
"root": "https://desuarchive.org",
"test-thread": ("https://desuarchive.org/a/thread/159542679/", {
"url": "3ae1473f6916ac831efe5cc4d4e7d3298ce79406",
}),
"test-board": ("https://desuarchive.org/a/",),
"test-search": ("https://desuarchive.org/_/search/text/test/",),
},
"fireden": {
"root": "https://boards.fireden.net",
"test-thread": ("https://boards.fireden.net/sci/thread/11264294/", {
"url": "3adfe181ee86a8c23021c705f623b3657a9b0a43",
}),
"test-board": ("https://boards.fireden.net/sci/",),
"test-search": ("https://boards.fireden.net/_/search/text/test/",),
},
"nyafuu": {
"root": "https://archive.nyafuu.org",
"pattern": r"(?:archive\.)?nyafuu\.org",
"test-thread": ("https://archive.nyafuu.org/c/thread/2849220/", {
"url": "bbe6f82944a45e359f5c8daf53f565913dc13e4f",
}),
"test-board": ("https://archive.nyafuu.org/c/",),
"test-search": ("https://archive.nyafuu.org/_/search/text/test/",),
},
"rbt": {
"root": "https://rbt.asia",
"pattern": r"(?:rbt\.asia|(?:archive\.)?rebeccablacktech\.com)",
"test-thread": (
("https://rbt.asia/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
("https://archive.rebeccablacktech.com/g/thread/61487650/", {
"url": "61896d9d9a2edb556b619000a308a984307b6d30",
}),
),
"test-board": ("https://rbt.asia/g/",),
"test-search": ("https://rbt.asia/_/search/text/test/",),
},
"thebarchive": {
"root": "https://thebarchive.com",
"pattern": r"thebarchive\.com",
"test-thread": ("https://thebarchive.com/b/thread/739772332/", {
"url": "e8b18001307d130d67db31740ce57c8561b5d80c",
}),
"test-board": ("https://thebarchive.com/b/",),
"test-search": ("https://thebarchive.com/_/search/text/test/",),
},
"_ckey": "childclass",
}
generate_extractors(EXTRACTORS, globals(), (
FoolfuukaThreadExtractor,
FoolfuukaBoardExtractor,
FoolfuukaSearchExtractor,
))