[kissmanga][readcomiconline] add 'captcha' option (#279)

to configure how to handle CAPTCHA page redirects:
- either interactively wait for the user to solve the CAPTCHA
- or raise StopExtraction like before
This commit is contained in:
Mike Fährmann 2019-05-27 22:24:48 +02:00
parent e30ada162d
commit 4465a3ea68
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 60 additions and 22 deletions

View File

@ -593,6 +593,18 @@ Description Controls whether to choose the GIF or MP4 version of an animation.
=========== =====
extractor.kissmanga.captcha
---------------------------
=========== =====
Type ``string``
Default ``"stop"``
Description Controls how to handle redirects to CAPTCHA pages.
* ``"stop"``: Stop the current extractor run.
* ``"wait"``: Ask the user to solve the CAPTCHA and wait.
=========== =====
extractor.oauth.browser
-----------------------
=========== =====
@ -646,6 +658,18 @@ Description Minimum and maximum wait time in seconds between HTTP requests
=========== =====
extractor.readcomiconline.captcha
---------------------------------
=========== =====
Type ``string``
Default ``"stop"``
Description Controls how to handle redirects to CAPTCHA pages.
* ``"stop"``: Stop the current extractor run.
* ``"wait"``: Ask the user to solve the CAPTCHA and wait.
=========== =====
extractor.recursive.blacklist
-----------------------------
=========== =====

View File

@ -62,6 +62,10 @@
{
"mp4": true
},
"kissmanga":
{
"captcha": "stop"
},
"nijie":
{
"username": null,
@ -82,6 +86,10 @@
"wait-min": 3.0,
"wait-max": 6.0
},
"readcomiconline":
{
"captcha": "stop"
},
"recursive":
{
"blacklist": ["directlink", "oauth", "recursive", "test"]

View File

@ -8,7 +8,7 @@
"""Extract manga-chapters and entire manga from https://kissmanga.com/"""
from .common import ChapterExtractor, MangaExtractor
from .common import ChapterExtractor, MangaExtractor, Extractor
from .. import text, aes, exception
from ..cache import cache
import hashlib
@ -16,21 +16,35 @@ import ast
import re
class KissmangaBase():
class RedirectMixin():
    """Detect and handle redirects to CAPTCHA pages

    Mixin for extractor classes. Its request() wraps Extractor.request()
    and reacts to responses that were redirected to an '/AreYouHuman' URL
    according to the extractor's 'captcha' option:

    - "stop" (default): log an error and raise StopExtraction
    - "wait": ask the user to solve the CAPTCHA, wait for ENTER on
      standard input, and then retry the same request
    """

    def request(self, url):
        """Request 'url' and return its response.

        The request is repeated for as long as it gets redirected to a
        CAPTCHA page and the user confirms having solved it ("wait" mode).

        Raises exception.StopExtraction if a CAPTCHA redirect occurs and
        'captcha' is not "wait", or if it is "wait" but standard input
        cannot be read.
        """
        while True:
            response = Extractor.request(self, url)

            # regular response: either no redirect took place
            # or the final URL is not the CAPTCHA page
            if not response.history or "/AreYouHuman" not in response.url:
                return response

            if self.config("captcha", "stop") == "wait":
                self.log.warning(
                    "Redirect to \n%s\nVisit this URL in your browser, solve "
                    "the CAPTCHA, and press ENTER to continue", response.url)
                try:
                    input()
                    continue  # user confirmed; retry the request
                except (EOFError, OSError):
                    # Without a readable stdin there is no way to wait for
                    # the user. Retrying right away would re-request the
                    # CAPTCHA-protected URL in an endless loop, so fall back
                    # to stopping the extractor run.
                    self.log.error(
                        "Unable to read from standard input; "
                        "cannot wait for the CAPTCHA to be solved")
            else:
                self.log.error(
                    "Redirect to \n%s\nVisit this URL in your browser and "
                    "solve the CAPTCHA to continue", response.url)

            raise exception.StopExtraction()
class KissmangaBase(RedirectMixin):
"""Base class for kissmanga extractors"""
category = "kissmanga"
archive_fmt = "{chapter_id}_{page}"
root = "https://kissmanga.com"
def request(self, url):
response = super().request(url)
if response.history and "/AreYouHuman" in response.url:
self.log.error("Redirect to \n%s\n"
"Visit this URL in your browser and solve "
"the CAPTCHA to continue.", response.url)
raise exception.StopExtraction()
return response
@staticmethod
def parse_chapter_string(data):
"""Parse 'chapter_string' value contained in 'data'"""

View File

@ -9,11 +9,12 @@
"""Extract comic-issues and entire comics from https://readcomiconline.to/"""
from .common import ChapterExtractor, MangaExtractor
from .. import text, exception
from .kissmanga import RedirectMixin
from .. import text
import re
class ReadcomiconlineBase():
class ReadcomiconlineBase(RedirectMixin):
"""Base class for readcomiconline extractors"""
category = "readcomiconline"
directory_fmt = ("{category}", "{comic}", "{issue:>03}")
@ -21,15 +22,6 @@ class ReadcomiconlineBase():
archive_fmt = "{issue_id}_{page}"
root = "https://readcomiconline.to"
def request(self, url):
response = super().request(url)
if response.history and "/AreYouHuman" in response.url:
self.log.error("Redirect to \n%s\n"
"Visit this URL in your browser and solve "
"the CAPTCHA to continue.", response.url)
raise exception.StopExtraction()
return response
class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
"""Extractor for comic-issues from readcomiconline.to"""