[simplyhentai] fix extraction

This commit is contained in:
Mike Fährmann 2021-04-25 01:47:51 +02:00
parent ba8180b5e6
commit d900edfcfb
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-
-# Copyright 2018-2019 Mike Fährmann
+# Copyright 2018-2021 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@@ -16,9 +16,9 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 """Extractor for image galleries from simply-hentai.com"""
 category = "simplyhentai"
 archive_fmt = "{image_id}"
-pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.simply-hentai\.com"
+pattern = (r"(?:https?://)?(?!videos\.)([\w-]+\.)?simply-hentai\.com"
 r"(?!/(?:album|gifs?|images?|series)(?:/|$))"
-r"(?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)")
+r"((?:/(?!(?:page|all-pages)(?:/|\.|$))[^/?#]+)+)")
 test = (
 (("https://original-work.simply-hentai.com"
 "/amazon-no-hiyaku-amazon-elixir"), {
@@ -35,7 +35,10 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 )
 def __init__(self, match):
-url = "https://" + match.group(1)
+subdomain, path = match.groups()
+if subdomain and subdomain not in ("www.", "old."):
+path = "/" + subdomain.rstrip(".") + path
+url = "https://old.simply-hentai.com" + path
 GalleryExtractor.__init__(self, match, url)
 self.session.headers["Referer"] = url
@@ -43,7 +46,6 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor):
 extr = text.extract_from(page)
 split = text.split_html
-self.gallery_url = extr('<link rel="canonical" href="', '"')
 title = extr('<meta property="og:title" content="', '"')
 image = extr('<meta property="og:image" content="', '"')
 if not title:
@@ -99,7 +101,7 @@ class SimplyhentaiImageExtractor(Extractor):
 def __init__(self, match):
 Extractor.__init__(self, match)
-self.page_url = "https://www." + match.group(1)
+self.page_url = "https://old." + match.group(1)
 self.type = match.group(2)
 def items(self):