add 'browser' option (#1117)

- change default user agent to Firefox ESR 78 on Windows 10
- remove 'ciphers' option
This commit is contained in:
Mike Fährmann 2021-02-25 23:39:34 +01:00
parent 92071d02f4
commit cf5fa75d4c
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 146 additions and 65 deletions

View File

@ -380,7 +380,7 @@ extractor.*.user-agent
Type
``string``
Default
``"Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"``
``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"``
Description
User-Agent header value to be used for HTTP requests.
@ -388,6 +388,24 @@ Description
as these need specific values to function correctly.
extractor.*.browser
-------------------
Type
``string``
Example
* ``"firefox"``
* ``"chrome:macos"``
Description
Try to emulate a real browser (``firefox`` or ``chrome``)
by using their default HTTP headers and TLS ciphers for HTTP requests.
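Optionally, the operating system used in the ``User-Agent`` header can be
specified after a ``:`` (``windows``, ``linux``, or ``macos``).
If it is omitted or set to ``auto``, ``windows`` is used when running on
Windows and ``linux`` otherwise.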
Note: ``requests`` and ``urllib3`` only support HTTP/1.1, while a real
browser would use HTTP/2.
extractor.*.keywords
--------------------
Type
@ -2457,20 +2475,6 @@ Description
this cache.
ciphers
-------
Type
``bool`` or ``string``
Default
``true``
Description
* ``true``: Update urllib3's default cipher list
* ``false``: Leave the default cipher list as is
* Any ``string``: Replace urllib3's default ciphers with these
(See `SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__
for details)
pyopenssl
---------
Type

View File

@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
# Copyright 2014-2020 Mike Fährmann
# Copyright 2014-2021 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
@ -94,7 +94,7 @@ class HttpDownloader(DownloaderBase):
time.sleep(tries)
tries += 1
headers = {}
headers = {"Accept": "*/*"}
file_header = None
# check for .part file

View File

@ -9,6 +9,7 @@
"""Common classes and constants used by extractor modules."""
import re
import ssl
import time
import netrc
import queue
@ -16,6 +17,7 @@ import logging
import datetime
import requests
import threading
from requests.adapters import HTTPAdapter
from .message import Message
from .. import config, text, util, exception
@ -38,9 +40,10 @@ class Extractor():
def __init__(self, match):
self.session = requests.Session()
self.session.headers.clear()
self.log = logging.getLogger(self.category)
self.url = match.string
self._cookiefile = None
self._cookiejar = self.session.cookies
self._parentdir = ""
@ -62,6 +65,10 @@ class Extractor():
self.config = self._config_shared
self.config_accumulate = self._config_shared_accumulate
browser = self.config("browser")
if browser:
self._emulate_browser(browser)
else:
self._init_headers()
self._init_cookies()
self._init_proxies()
@ -211,19 +218,33 @@ class Extractor():
return username, password
def _init_headers(self):
"""Initialize HTTP headers for the 'session' object"""
headers = self.session.headers
headers.clear()
def _emulate_browser(self, browser):
    """Set up 'session' to imitate the browser described by 'browser'

    'browser' has the form "<name>" or "<name>:<platform>" and is
    matched case-insensitively. A missing platform or "auto" selects
    "windows" or "linux" depending on util.WINDOWS. Platform names other
    than "windows", "linux", and "macos" are passed on unchanged.
    Any name other than "chrome" results in Firefox emulation.
    """
    name, _, system = browser.lower().partition(":")

    if system in ("", "auto"):
        system = "windows" if util.WINDOWS else "linux"

    # map the short platform name to the token used inside a User-Agent;
    # unknown names fall through as-is
    platform = {
        "windows": "Windows NT 10.0; Win64; x64",
        "linux"  : "X11; Linux x86_64",
        "macos"  : "Macintosh; Intel Mac OS X 11.2",
    }.get(system, system)

    emulate = (_emulate_browser_chrome if name == "chrome" else
               _emulate_browser_firefox)
    emulate(self.session, platform)
def _init_headers(self):
"""Initialize HTTP headers for 'session'"""
headers = self.session.headers
headers["User-Agent"] = self.config(
"user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) "
"Gecko/20100101 Firefox/68.0"))
"user-agent", ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; "
"rv:78.0) Gecko/20100101 Firefox/78.0"))
headers["Accept"] = "*/*"
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
headers["Connection"] = "keep-alive"
headers["Upgrade-Insecure-Requests"] = "1"
def _init_proxies(self):
"""Update the session's proxy map"""
@ -554,24 +575,39 @@ class BaseExtractor(Extractor):
return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")"
# Undo automatic pyOpenSSL injection by requests
pyopenssl = config.get((), "pyopenssl", False)
if not pyopenssl:
try:
from requests.packages.urllib3.contrib import pyopenssl # noqa
pyopenssl.extract_from_urllib3()
except ImportError:
pass
del pyopenssl
class HTTPSAdapter(HTTPAdapter):
    """HTTPAdapter that supplies its own SSLContext to every connection

    The context is built from ssl.create_default_context() with
    SSLv2, SSLv3, TLSv1.0, and TLSv1.1 disabled, the ECDH curve set to
    "prime256v1", and the cipher list set to 'ciphers'
    (a string as accepted by SSLContext.set_ciphers()).
    """

    def __init__(self, ciphers):
        legacy_protocols = (
            ssl.OP_NO_SSLv2 |
            ssl.OP_NO_SSLv3 |
            ssl.OP_NO_TLSv1 |
            ssl.OP_NO_TLSv1_1
        )

        ctx = ssl.create_default_context()
        ctx.options |= legacy_protocols
        ctx.set_ecdh_curve("prime256v1")
        ctx.set_ciphers(ciphers)

        # store the context before running the base initializer,
        # since that one already calls init_poolmanager()
        self.ssl_context = ctx
        super().__init__()

    def init_poolmanager(self, *args, **kwargs):
        """Create the pool manager with this adapter's SSL context"""
        kwargs["ssl_context"] = self.ssl_context
        return super().init_poolmanager(*args, **kwargs)

    def proxy_manager_for(self, *args, **kwargs):
        """Create a proxy manager with this adapter's SSL context"""
        kwargs["ssl_context"] = self.ssl_context
        return super().proxy_manager_for(*args, **kwargs)
# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs
ciphers = config.get((), "ciphers", True)
if ciphers:
def _emulate_browser_firefox(session, platform):
headers = session.headers
if ciphers is True:
ciphers = (
# Firefox's list
headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) "
"Gecko/20100101 Firefox/78.0")
headers["Accept"] = ("text/html,application/xhtml+xml,"
"application/xml;q=0.9,image/webp,*/*;q=0.8")
headers["Accept-Language"] = "en-US,en;q=0.5"
headers["Accept-Encoding"] = "gzip, deflate"
headers["Referer"] = None
headers["Upgrade-Insecure-Requests"] = "1"
headers["Cookie"] = None
session.mount("https://", HTTPSAdapter(
"TLS_AES_128_GCM_SHA256:"
"TLS_CHACHA20_POLY1305_SHA256:"
"TLS_AES_256_GCM_SHA384:"
@ -590,11 +626,51 @@ if ciphers:
"AES128-SHA:"
"AES256-SHA:"
"DES-CBC3-SHA"
)
elif isinstance(ciphers, list):
ciphers = ":".join(ciphers)
))
from requests.packages.urllib3.util import ssl_ # noqa
ssl_.DEFAULT_CIPHERS = ciphers
del ssl_
del ciphers
def _emulate_browser_chrome(session, platform):
    """Give 'session' Chrome 88's default headers and TLS cipher list

    'platform' is the operating system token placed inside the
    User-Agent string. For "Macintosh..." values, dots are replaced
    with underscores and "_0" is appended.
    Headers are assigned in a fixed order, and all "https://" requests
    are routed through an HTTPSAdapter using the cipher list below.
    """
    if platform.startswith("Macintosh"):
        platform = platform.replace(".", "_") + "_0"

    user_agent = (
        "Mozilla/5.0 ({}) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
    ).format(platform)

    # assignment order is kept exactly as listed here
    chrome_headers = (
        ("Upgrade-Insecure-Requests", "1"),
        ("User-Agent", user_agent),
        ("Accept", "text/html,application/xhtml+xml,application/xml;"
                   "q=0.9,image/webp,image/apng,*/*;q=0.8"),
        ("Referer", None),
        ("Accept-Encoding", "gzip, deflate"),
        ("Accept-Language", "en-US,en;q=0.9"),
        ("Cookie", None),
    )
    headers = session.headers
    for key, value in chrome_headers:
        headers[key] = value

    chrome_ciphers = ":".join((
        "TLS_AES_128_GCM_SHA256",
        "TLS_AES_256_GCM_SHA384",
        "TLS_CHACHA20_POLY1305_SHA256",
        "ECDHE-ECDSA-AES128-GCM-SHA256",
        "ECDHE-RSA-AES128-GCM-SHA256",
        "ECDHE-ECDSA-AES256-GCM-SHA384",
        "ECDHE-RSA-AES256-GCM-SHA384",
        "ECDHE-ECDSA-CHACHA20-POLY1305",
        "ECDHE-RSA-CHACHA20-POLY1305",
        "ECDHE-RSA-AES128-SHA",
        "ECDHE-RSA-AES256-SHA",
        "AES128-GCM-SHA256",
        "AES256-GCM-SHA384",
        "AES128-SHA",
        "AES256-SHA",
        "DES-CBC3-SHA",
    ))
    session.mount("https://", HTTPSAdapter(chrome_ciphers))
# Undo automatic pyOpenSSL injection by requests
pyopenssl = config.get((), "pyopenssl", False)
if not pyopenssl:
try:
from requests.packages.urllib3.contrib import pyopenssl # noqa
pyopenssl.extract_from_urllib3()
except ImportError:
pass
del pyopenssl

View File

@ -63,17 +63,18 @@ class HentainexusGalleryExtractor(GalleryExtractor):
data = json.loads(self._decode(text.extract(
page, 'initReader("', '"')[0]))
headers = None
if not self.config("original", True):
self.session.headers["Accept"] = "image/webp,*/*"
headers = {"_http_headers": {"Accept": "image/webp,*/*"}}
pages = data.get("pages")
if pages:
return [(page, None) for page in pages]
return [(page, headers) for page in pages]
base = data["b"] + data["r"]
gid = data["i"]
return [
("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None)
("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers)
for page in data["f"]
]