add 'browser' option (#1117)
- change default user agent to Firefox ESR 78 on Windows 10
- remove 'ciphers' option
This commit is contained in:
parent
92071d02f4
commit
cf5fa75d4c
@ -380,7 +380,7 @@ extractor.*.user-agent
|
||||
Type
|
||||
``string``
|
||||
Default
|
||||
``"Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"``
|
||||
``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"``
|
||||
Description
|
||||
User-Agent header value to be used for HTTP requests.
|
||||
|
||||
@ -388,6 +388,24 @@ Description
|
||||
as these need specific values to function correctly.
|
||||
|
||||
|
||||
extractor.*.browser
|
||||
-------------------
|
||||
Type
|
||||
``string``
|
||||
Example
|
||||
* ``"firefox"``
|
||||
* ``"chrome:macos"``
|
||||
Description
|
||||
Try to emulate a real browser (``firefox`` or ``chrome``)
|
||||
by using their default HTTP headers and TLS ciphers for HTTP requests.
|
||||
|
||||
Optionally, the operating system used in the ``User-Agent`` header can be
|
||||
specified after a ``:`` (``windows``, ``linux``, or ``macos``).
|
||||
|
||||
Note: ``requests`` and ``urllib3`` only support HTTP/1.1, while a real
|
||||
browser would use HTTP/2.
|
||||
|
||||
|
||||
extractor.*.keywords
|
||||
--------------------
|
||||
Type
|
||||
@ -2457,20 +2475,6 @@ Description
|
||||
this cache.
|
||||
|
||||
|
||||
ciphers
|
||||
-------
|
||||
Type
|
||||
``bool`` or ``string``
|
||||
Default
|
||||
``true``
|
||||
Description
|
||||
* ``true``: Update urllib3's default cipher list
|
||||
* ``false``: Leave the default cipher list as is
|
||||
* Any ``string``: Replace urllib3's default ciphers with these
|
||||
(See `SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__
|
||||
for details)
|
||||
|
||||
|
||||
pyopenssl
|
||||
---------
|
||||
Type
|
||||
|
@ -1,6 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
# Copyright 2014-2020 Mike Fährmann
|
||||
# Copyright 2014-2021 Mike Fährmann
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License version 2 as
|
||||
@ -94,7 +94,7 @@ class HttpDownloader(DownloaderBase):
|
||||
time.sleep(tries)
|
||||
|
||||
tries += 1
|
||||
headers = {}
|
||||
headers = {"Accept": "*/*"}
|
||||
file_header = None
|
||||
|
||||
# check for .part file
|
||||
|
@ -9,6 +9,7 @@
|
||||
"""Common classes and constants used by extractor modules."""
|
||||
|
||||
import re
|
||||
import ssl
|
||||
import time
|
||||
import netrc
|
||||
import queue
|
||||
@ -16,6 +17,7 @@ import logging
|
||||
import datetime
|
||||
import requests
|
||||
import threading
|
||||
from requests.adapters import HTTPAdapter
|
||||
from .message import Message
|
||||
from .. import config, text, util, exception
|
||||
|
||||
@ -38,9 +40,10 @@ class Extractor():
|
||||
|
||||
def __init__(self, match):
|
||||
self.session = requests.Session()
|
||||
self.session.headers.clear()
|
||||
|
||||
self.log = logging.getLogger(self.category)
|
||||
self.url = match.string
|
||||
|
||||
self._cookiefile = None
|
||||
self._cookiejar = self.session.cookies
|
||||
self._parentdir = ""
|
||||
@ -62,6 +65,10 @@ class Extractor():
|
||||
self.config = self._config_shared
|
||||
self.config_accumulate = self._config_shared_accumulate
|
||||
|
||||
browser = self.config("browser")
|
||||
if browser:
|
||||
self._emulate_browser(browser)
|
||||
else:
|
||||
self._init_headers()
|
||||
self._init_cookies()
|
||||
self._init_proxies()
|
||||
@ -211,19 +218,33 @@ class Extractor():
|
||||
|
||||
return username, password
|
||||
|
||||
def _init_headers(self):
|
||||
"""Initialize HTTP headers for the 'session' object"""
|
||||
headers = self.session.headers
|
||||
headers.clear()
|
||||
def _emulate_browser(self, browser):
    """Set up 'session' to imitate a real web browser

    'browser' is the value of the 'browser' option: a browser name,
    optionally followed by ':' and a platform name
    ('windows', 'linux', 'macos', or 'auto').

    'chrome' selects the Chrome profile; every other browser name
    selects the Firefox profile.
    A missing, 'auto', or unsupported platform is replaced by
    'windows' or 'linux', depending on util.WINDOWS.
    """
    # platform name -> platform token used inside the User-Agent header
    platforms = {
        "windows": "Windows NT 10.0; Win64; x64",
        "linux": "X11; Linux x86_64",
        "macos": "Macintosh; Intel Mac OS X 11.2",
    }

    browser, _, platform = browser.lower().partition(":")
    browser = browser.strip()
    platform = platform.strip()

    if platform not in platforms:
        if platform and platform != "auto":
            # An unknown name must not be forwarded as is: it would be
            # inserted verbatim into the User-Agent header value.
            self.log.warning(
                "Unsupported platform '%s' for 'browser' option", platform)
        platform = "windows" if util.WINDOWS else "linux"

    if browser == "chrome":
        _emulate_browser_chrome(self.session, platforms[platform])
    else:
        _emulate_browser_firefox(self.session, platforms[platform])
|
||||
|
||||
def _init_headers(self):
|
||||
"""Initialize HTTP headers for 'session'"""
|
||||
headers = self.session.headers
|
||||
headers["User-Agent"] = self.config(
|
||||
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
|
||||
"Gecko/20100101 Firefox/68.0"))
|
||||
"user-agent", ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
|
||||
"rv:78.0) Gecko/20100101 Firefox/78.0"))
|
||||
headers["Accept"] = "*/*"
|
||||
headers["Accept-Language"] = "en-US,en;q=0.5"
|
||||
headers["Accept-Encoding"] = "gzip, deflate"
|
||||
headers["Connection"] = "keep-alive"
|
||||
headers["Upgrade-Insecure-Requests"] = "1"
|
||||
|
||||
def _init_proxies(self):
|
||||
"""Update the session's proxy map"""
|
||||
@ -554,24 +575,39 @@ class BaseExtractor(Extractor):
|
||||
return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
|
||||
|
||||
|
||||
# Undo automatic pyOpenSSL injection by requests
|
||||
pyopenssl = config.get((), "pyopenssl", False)
|
||||
if not pyopenssl:
|
||||
try:
|
||||
from requests.packages.urllib3.contrib import pyopenssl # noqa
|
||||
pyopenssl.extract_from_urllib3()
|
||||
except ImportError:
|
||||
pass
|
||||
del pyopenssl
|
||||
class HTTPSAdapter(HTTPAdapter):
    """HTTPAdapter that hands its own SSLContext to the connection pools

    The context is built from ssl.create_default_context(), with SSLv2,
    SSLv3, TLSv1 and TLSv1.1 switched off, the ECDH curve set to
    'prime256v1', and the cipher list set to 'ciphers'.
    """

    def __init__(self, ciphers):
        """Build the SSLContext for the OpenSSL cipher string 'ciphers'"""
        # Bound to the instance first, same as the original ordering:
        # the attribute is read by init_poolmanager() / proxy_manager_for().
        self.ssl_context = ctx = ssl.create_default_context()

        disabled_protocols = (
            ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 |
            ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1
        )
        ctx.options |= disabled_protocols
        ctx.set_ecdh_curve("prime256v1")
        ctx.set_ciphers(ciphers)

        super().__init__()

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with this adapter's SSLContext"""
        kwargs["ssl_context"] = self.ssl_context
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Create a proxy manager with this adapter's SSLContext"""
        kwargs["ssl_context"] = self.ssl_context
        return super().proxy_manager_for(*args, **kwargs)
|
||||
|
||||
|
||||
# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs
|
||||
ciphers = config.get((), "ciphers", True)
|
||||
if ciphers:
|
||||
def _emulate_browser_firefox(session, platform):
|
||||
headers = session.headers
|
||||
|
||||
if ciphers is True:
|
||||
ciphers = (
|
||||
# Firefox's list
|
||||
headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) "
|
||||
"Gecko/20100101 Firefox/78.0")
|
||||
headers["Accept"] = ("text/html,application/xhtml+xml,"
|
||||
"application/xml;q=0.9,image/webp,*/*;q=0.8")
|
||||
headers["Accept-Language"] = "en-US,en;q=0.5"
|
||||
headers["Accept-Encoding"] = "gzip, deflate"
|
||||
headers["Referer"] = None
|
||||
headers["Upgrade-Insecure-Requests"] = "1"
|
||||
headers["Cookie"] = None
|
||||
|
||||
session.mount("https://", HTTPSAdapter(
|
||||
"TLS_AES_128_GCM_SHA256:"
|
||||
"TLS_CHACHA20_POLY1305_SHA256:"
|
||||
"TLS_AES_256_GCM_SHA384:"
|
||||
@ -590,11 +626,51 @@ if ciphers:
|
||||
"AES128-SHA:"
|
||||
"AES256-SHA:"
|
||||
"DES-CBC3-SHA"
|
||||
)
|
||||
elif isinstance(ciphers, list):
|
||||
ciphers = ":".join(ciphers)
|
||||
))
|
||||
|
||||
from requests.packages.urllib3.util import ssl_ # noqa
|
||||
ssl_.DEFAULT_CIPHERS = ciphers
|
||||
del ssl_
|
||||
del ciphers
|
||||
|
||||
def _emulate_browser_chrome(session, platform):
    """Give 'session' the HTTP headers and TLS ciphers of Chrome 88

    'platform' is the platform token placed inside the User-Agent
    header, e.g. "Windows NT 10.0; Win64; x64".
    """
    if platform.startswith("Macintosh"):
        # rewrite the version as "11_2_0" instead of "11.2"
        platform = platform.replace(".", "_") + "_0"

    user_agent = (
        "Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36")
    accept = ("text/html,application/xhtml+xml,application/xml;"
              "q=0.9,image/webp,image/apng,*/*;q=0.8")

    # Assigned one by one so that the insertion order stays exactly as
    # listed; "Referer" and "Cookie" are stored with a value of None.
    ordered_headers = (
        ("Upgrade-Insecure-Requests", "1"),
        ("User-Agent", user_agent),
        ("Accept", accept),
        ("Referer", None),
        ("Accept-Encoding", "gzip, deflate"),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Cookie", None),
    )
    target = session.headers
    for name, value in ordered_headers:
        target[name] = value

    ciphers = (
        "TLS_AES_128_GCM_SHA256:"
        "TLS_AES_256_GCM_SHA384:"
        "TLS_CHACHA20_POLY1305_SHA256:"
        "ECDHE-ECDSA-AES128-GCM-SHA256:"
        "ECDHE-RSA-AES128-GCM-SHA256:"
        "ECDHE-ECDSA-AES256-GCM-SHA384:"
        "ECDHE-RSA-AES256-GCM-SHA384:"
        "ECDHE-ECDSA-CHACHA20-POLY1305:"
        "ECDHE-RSA-CHACHA20-POLY1305:"
        "ECDHE-RSA-AES128-SHA:"
        "ECDHE-RSA-AES256-SHA:"
        "AES128-GCM-SHA256:"
        "AES256-GCM-SHA384:"
        "AES128-SHA:"
        "AES256-SHA:"
        "DES-CBC3-SHA"
    )
    session.mount("https://", HTTPSAdapter(ciphers))
|
||||
|
||||
|
||||
# Undo automatic pyOpenSSL injection by requests
|
||||
pyopenssl = config.get((), "pyopenssl", False)
|
||||
if not pyopenssl:
|
||||
try:
|
||||
from requests.packages.urllib3.contrib import pyopenssl # noqa
|
||||
pyopenssl.extract_from_urllib3()
|
||||
except ImportError:
|
||||
pass
|
||||
del pyopenssl
|
||||
|
@ -63,17 +63,18 @@ class HentainexusGalleryExtractor(GalleryExtractor):
|
||||
data = json.loads(self._decode(text.extract(
|
||||
page, 'initReader("', '"')[0]))
|
||||
|
||||
headers = None
|
||||
if not self.config("original", True):
|
||||
self.session.headers["Accept"] = "image/webp,*/*"
|
||||
headers = {"_http_headers": {"Accept": "image/webp,*/*"}}
|
||||
|
||||
pages = data.get("pages")
|
||||
if pages:
|
||||
return [(page, None) for page in pages]
|
||||
return [(page, headers) for page in pages]
|
||||
|
||||
base = data["b"] + data["r"]
|
||||
gid = data["i"]
|
||||
return [
|
||||
("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None)
|
||||
("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers)
|
||||
for page in data["f"]
|
||||
]
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user