From cf5fa75d4c463a5b8d916147ca32a5ed8b27d691 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Thu, 25 Feb 2021 23:39:34 +0100 Subject: [PATCH] add 'browser' option (#1117) - change default user agent to Firefox ESR 78 on Windows 10 - remove 'ciphers' option --- docs/configuration.rst | 34 +++--- gallery_dl/downloader/http.py | 4 +- gallery_dl/extractor/common.py | 166 ++++++++++++++++++++-------- gallery_dl/extractor/hentainexus.py | 7 +- 4 files changed, 146 insertions(+), 65 deletions(-) diff --git a/docs/configuration.rst b/docs/configuration.rst index a070b472..8a4ff62a 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -380,7 +380,7 @@ extractor.*.user-agent Type ``string`` Default - ``"Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0"`` + ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0"`` Description User-Agent header value to be used for HTTP requests. @@ -388,6 +388,24 @@ Description as these need specific values to function correctly. +extractor.*.browser +------------------- +Type + ``string`` +Example + * ``"firefox"`` + * ``"chrome:macos"`` +Description + Try to emulate a real browser (``firefox`` or ``chrome``) + by using their default HTTP headers and TLS ciphers for HTTP requests. + + Optionally, the operating system used in the ``User-Agent`` header can be + specified after a ``:`` (``windows``, ``linux``, or ``macos``). + + Note: ``requests`` and ``urllib3`` only support HTTP/1.1, while a real + browser would use HTTP/2. + + extractor.*.keywords -------------------- Type @@ -2457,20 +2475,6 @@ Description this cache. 
-ciphers -------- -Type - ``bool`` or ``string`` -Default - ``true`` -Description - * ``true``: Update urllib3's default cipher list - * ``false``: Leave the default cipher list as is - * Any ``string``: Replace urllib3's default ciphers with these - (See `SSLContext.set_ciphers() <https://docs.python.org/3/library/ssl.html#ssl.SSLContext.set_ciphers>`__ - for details) - - pyopenssl --------- Type diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index b08aae15..bc42d7cc 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2020 Mike Fährmann +# Copyright 2014-2021 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -94,7 +94,7 @@ class HttpDownloader(DownloaderBase): time.sleep(tries) tries += 1 - headers = {} + headers = {"Accept": "*/*"} file_header = None # check for .part file diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index 6a8c6586..e7a42522 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -9,6 +9,7 @@ """Common classes and constants used by extractor modules.""" import re +import ssl import time import netrc import queue @@ -16,6 +17,7 @@ import logging import datetime import requests import threading +from requests.adapters import HTTPAdapter from .message import Message from ..
import config, text, util, exception @@ -38,9 +40,10 @@ class Extractor(): def __init__(self, match): self.session = requests.Session() + self.session.headers.clear() + self.log = logging.getLogger(self.category) self.url = match.string - self._cookiefile = None self._cookiejar = self.session.cookies self._parentdir = "" @@ -62,7 +65,11 @@ class Extractor(): self.config = self._config_shared self.config_accumulate = self._config_shared_accumulate - self._init_headers() + browser = self.config("browser") + if browser: + self._emulate_browser(browser) + else: + self._init_headers() self._init_cookies() self._init_proxies() @@ -211,19 +218,33 @@ class Extractor(): return username, password - def _init_headers(self): - """Initialize HTTP headers for the 'session' object""" - headers = self.session.headers - headers.clear() + def _emulate_browser(self, browser): + browser, _, platform = browser.lower().partition(":") + if not platform or platform == "auto": + platform = "windows" if util.WINDOWS else "linux" + + if platform == "windows": + platform = "Windows NT 10.0; Win64; x64" + elif platform == "linux": + platform = "X11; Linux x86_64" + elif platform == "macos": + platform = "Macintosh; Intel Mac OS X 11.2" + + if browser == "chrome": + _emulate_browser_chrome(self.session, platform) + else: + _emulate_browser_firefox(self.session, platform) + + def _init_headers(self): + """Initialize HTTP headers for 'session'""" + headers = self.session.headers headers["User-Agent"] = self.config( - "user-agent", ("Mozilla/5.0 (X11; Linux x86_64; rv:68.0) " - "Gecko/20100101 Firefox/68.0")) + "user-agent", ("Mozilla/5.0 (Windows NT 10.0; Win64; x64; " + "rv:78.0) Gecko/20100101 Firefox/78.0")) headers["Accept"] = "*/*" headers["Accept-Language"] = "en-US,en;q=0.5" headers["Accept-Encoding"] = "gzip, deflate" - headers["Connection"] = "keep-alive" - headers["Upgrade-Insecure-Requests"] = "1" def _init_proxies(self): """Update the session's proxy map""" @@ -554,6 +575,96 @@ class 
BaseExtractor(Extractor): return r"(?:https?://)?(?:" + "|".join(pattern_list) + r")" +class HTTPSAdapter(HTTPAdapter): + + def __init__(self, ciphers): + context = self.ssl_context = ssl.create_default_context() + context.options |= (ssl.OP_NO_SSLv2 | ssl.OP_NO_SSLv3 | + ssl.OP_NO_TLSv1 | ssl.OP_NO_TLSv1_1) + context.set_ecdh_curve("prime256v1") + context.set_ciphers(ciphers) + HTTPAdapter.__init__(self) + + def init_poolmanager(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.init_poolmanager(self, *args, **kwargs) + + def proxy_manager_for(self, *args, **kwargs): + kwargs["ssl_context"] = self.ssl_context + return HTTPAdapter.proxy_manager_for(self, *args, **kwargs) + + +def _emulate_browser_firefox(session, platform): + headers = session.headers + + headers["User-Agent"] = ("Mozilla/5.0 (" + platform + "; rv:78.0) " + "Gecko/20100101 Firefox/78.0") + headers["Accept"] = ("text/html,application/xhtml+xml," + "application/xml;q=0.9,image/webp,*/*;q=0.8") + headers["Accept-Language"] = "en-US,en;q=0.5" + headers["Accept-Encoding"] = "gzip, deflate" + headers["Referer"] = None + headers["Upgrade-Insecure-Requests"] = "1" + headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + "TLS_CHACHA20_POLY1305_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-AES256-SHA:" + "ECDHE-ECDSA-AES128-SHA:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "DHE-RSA-AES128-SHA:" + "DHE-RSA-AES256-SHA:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) + + +def _emulate_browser_chrome(session, platform): + headers = session.headers + if platform.startswith("Macintosh"): + platform = platform.replace(".", "_") + "_0" + + headers["Upgrade-Insecure-Requests"] = "1" + headers["User-Agent"] = ( + 
"Mozilla/5.0 (" + platform + ") AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36") + headers["Accept"] = ("text/html,application/xhtml+xml,application/xml;" + "q=0.9,image/webp,image/apng,*/*;q=0.8") + headers["Referer"] = None + headers["Accept-Encoding"] = "gzip, deflate" + headers["Accept-Language"] = "en-US,en;q=0.9" + headers["Cookie"] = None + + session.mount("https://", HTTPSAdapter( + "TLS_AES_128_GCM_SHA256:" + "TLS_AES_256_GCM_SHA384:" + "TLS_CHACHA20_POLY1305_SHA256:" + "ECDHE-ECDSA-AES128-GCM-SHA256:" + "ECDHE-RSA-AES128-GCM-SHA256:" + "ECDHE-ECDSA-AES256-GCM-SHA384:" + "ECDHE-RSA-AES256-GCM-SHA384:" + "ECDHE-ECDSA-CHACHA20-POLY1305:" + "ECDHE-RSA-CHACHA20-POLY1305:" + "ECDHE-RSA-AES128-SHA:" + "ECDHE-RSA-AES256-SHA:" + "AES128-GCM-SHA256:" + "AES256-GCM-SHA384:" + "AES128-SHA:" + "AES256-SHA:" + "DES-CBC3-SHA" + )) + + # Undo automatic pyOpenSSL injection by requests pyopenssl = config.get((), "pyopenssl", False) if not pyopenssl: @@ -563,38 +674,3 @@ if not pyopenssl: except ImportError: pass del pyopenssl - - -# Replace urllib3's default cipher list to avoid Cloudflare CAPTCHAs -ciphers = config.get((), "ciphers", True) -if ciphers: - - if ciphers is True: - ciphers = ( - # Firefox's list - "TLS_AES_128_GCM_SHA256:" - "TLS_CHACHA20_POLY1305_SHA256:" - "TLS_AES_256_GCM_SHA384:" - "ECDHE-ECDSA-AES128-GCM-SHA256:" - "ECDHE-RSA-AES128-GCM-SHA256:" - "ECDHE-ECDSA-CHACHA20-POLY1305:" - "ECDHE-RSA-CHACHA20-POLY1305:" - "ECDHE-ECDSA-AES256-GCM-SHA384:" - "ECDHE-RSA-AES256-GCM-SHA384:" - "ECDHE-ECDSA-AES256-SHA:" - "ECDHE-ECDSA-AES128-SHA:" - "ECDHE-RSA-AES128-SHA:" - "ECDHE-RSA-AES256-SHA:" - "DHE-RSA-AES128-SHA:" - "DHE-RSA-AES256-SHA:" - "AES128-SHA:" - "AES256-SHA:" - "DES-CBC3-SHA" - ) - elif isinstance(ciphers, list): - ciphers = ":".join(ciphers) - - from requests.packages.urllib3.util import ssl_ # noqa - ssl_.DEFAULT_CIPHERS = ciphers - del ssl_ -del ciphers diff --git a/gallery_dl/extractor/hentainexus.py 
b/gallery_dl/extractor/hentainexus.py index dbe576db..6c1879c6 100644 --- a/gallery_dl/extractor/hentainexus.py +++ b/gallery_dl/extractor/hentainexus.py @@ -63,17 +63,18 @@ class HentainexusGalleryExtractor(GalleryExtractor): data = json.loads(self._decode(text.extract( page, 'initReader("', '"')[0])) + headers = None if not self.config("original", True): - self.session.headers["Accept"] = "image/webp,*/*" + headers = {"_http_headers": {"Accept": "image/webp,*/*"}} pages = data.get("pages") if pages: - return [(page, None) for page in pages] + return [(page, headers) for page in pages] base = data["b"] + data["r"] gid = data["i"] return [ - ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), None) + ("{}{}/{}/{}".format(base, page["h"], gid, page["p"]), headers) for page in data["f"] ]