Merge pull request #503 from ArjixWasTaken/patch-16

added a cache system for SeleScrape and Improve SeleScrape generally
2021-08-20 16:17:26 +03:00 · 2021-08-20 16:17:26 +03:00 · 0fc9613400
parent 7b6d9e71ab 4e184b3f6c
commit 0fc9613400
2 changed files with 217 additions and 113 deletions
--- a/anime_downloader/sites/helpers/request.py
+++ b/anime_downloader/sites/helpers/request.py
@ -46,6 +46,8 @@ def setup(func):
        cf : bool
            cf if True performs the request through cfscrape.
            For cloudflare protected sites.
        sel : bool
            sel if True performs the request through selescrape (selenium).
        referer : str
            a url sent as referer in request headers
        '''
@ -57,6 +59,7 @@ def setup(func):
                from selenium import webdriver
                from anime_downloader.sites.helpers import selescrape
                sess = selescrape
                sess.cache = cache
            except ImportError:
                sess = cf_session
                logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run:  'pip install selenium' .")
@ -107,6 +110,8 @@ def get(url: str,
    cf : bool
        cf if True performs the request through cfscrape.
        For cloudflare protected sites.
    sel : bool
        sel if True performs the request through selescrape (selenium).
    referer : str
        a url sent as referer in request headers
    '''
--- a/anime_downloader/sites/helpers/selescrape.py
+++ b/anime_downloader/sites/helpers/selescrape.py
@ -1,31 +1,14 @@
 from selenium.webdriver.support import expected_conditions as EC
 from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
 from selenium.webdriver.support.ui import WebDriverWait
 from anime_downloader.const import get_random_header
 from selenium.webdriver.common.by import By
 from urllib.parse import urlencode
 from urllib.parse import urlsplit
 from selenium import webdriver
 from bs4 import BeautifulSoup
 from logging import exception
 from sys import platform
-import requests
+import tempfile
 import os
 import logging
 import click
 import time
 import json
-serverLogger.setLevel(logging.ERROR)
+import os
 logger = logging.getLogger(__name__)
 def get_data_dir():
    '''
    Return the directory in which selescrape keeps its data
    (cookies, browser extensions and logs).

    The path is the 'data' sub-folder of the click application
    directory for the 'anime downloader' app.
    '''
    app_dir = click.get_app_dir('anime downloader')
    return os.path.join(app_dir, 'data')
 def open_config():
@ -33,8 +16,24 @@ def open_config():
    return Config
 cache = False
 serverLogger.setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
 TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
 data = open_config()
 if not os.path.isdir(TEMP_FOLDER):
    os.makedirs(TEMP_FOLDER)
 def get_data_dir():
    '''
    Return the directory in which selescrape keeps its data
    (cookies, browser extensions and logs).

    The path is the 'data' sub-folder of the click application
    directory for the 'anime downloader' app.
    '''
    app_dir = click.get_app_dir('anime downloader')
    return os.path.join(app_dir, 'data')
 def get_browser_config():
    '''
@ -50,148 +49,248 @@ def get_browser_config():
            browser = os_browser[a]
        else:
            browser = 'chrome'
    value = data['dl']['selescrape_browser']
    value = value.lower() if value else value
    if value in ['chrome', 'firefox']:
        browser = value
    return browser
 def get_browser_executable():
    value = data['dl']['selescrape_browser_executable_path']
    executable_value = value.lower() if value else value
-    return executable_value
+    if executable_value:
        return executable_value
 def get_driver_binary():
    value = data['dl']['selescrape_driver_binary_path']
-    binary_path = value.lower() if value else value
+    if value:
-    return binary_path
+        return value
-def add_url_params(url, params):
+def cache_request(sele_response):
-    return url if not params else url + '?' + urlencode(params)
+    """
    This function saves the response from a Selenium request in a json.
    It uses timestamps to know whether the cache has expired or not.
    """
    if not cache:
        return
    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
        with open(file, 'r') as f:
            tmp_cache = json.load(f)
    else:
        tmp_cache = {}
    data = sele_response.__dict__
    url = data['url']
    url = (url[:-1] if url and url[-1] == '/' else url)
    tmp_cache[url] = {
        'data': data['text'],
        'expiry': time.time(),
        'method': data['method'],
        'cookies': data['cookies'],
        'user_agent': data['user_agent']
    }
    with open(file, 'w') as f:
        json.dump(tmp_cache, f, indent=4)
 def check_cache(url):
    """
    Look up a cached Selenium response for ``url``.

    The lookup key is ``url`` with a single trailing slash removed, so a
    url with or without the final '/' resolves to the same entry.

    Returns the cached entry (the dict stored in the JSON cache file) when
    caching is enabled, an entry for the url exists, and its timestamp is
    at most 1800 seconds (30 minutes) old.

    Returns None when caching is disabled, the cache file is missing or
    unreadable as JSON, there is no entry for the url, or the entry has
    expired. An expired entry is removed from the cache file before
    returning.
    """
    if not cache:
        return None

    cache_path = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if not os.path.isfile(cache_path):
        return None

    try:
        with open(cache_path, 'r') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # A truncated or corrupted cache file is treated as a cache miss
        # rather than an error that would break every later request.
        return None

    # Entries are keyed by the url without its trailing slash.
    clean_url = url[:-1] if url and url[-1] == '/' else url
    if clean_url not in data:
        return None

    entry = data[clean_url]
    # 'expiry' holds the time the entry was written, not a deadline.
    if time.time() - entry['expiry'] <= 1800:
        return entry

    # The entry is stale: drop it and persist the pruned cache.
    del data[clean_url]
    with open(cache_path, 'w') as f:
        json.dump(data, f, indent=4)
    return None
 def driver_select():
    '''
-    it configures what each browser should do 
+    This configures what each browser should do
-    and gives the driver variable that is used 
+    and returns the corresponding driver.
    to perform any actions below this function.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
-    driver_binary = get_driver_binary()
+    binary = get_driver_binary()
-    binary = None if not driver_binary else driver_binary
+
    if browser == 'firefox':
-        fireFoxOptions = webdriver.FirefoxOptions()
+        fireFox_Options = webdriver.FirefoxOptions()
-        fireFoxOptions.headless = True
+        ops = [
-        fireFoxOptions.add_argument('--log fatal')
+            "--width=1920", "--height=1080",
-        if binary == None:
+            "-headless", "--log fatal"
-            driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull)
+        ]
-        else:
+
-            try:
+        for option in ops:
-                driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull)
+            fireFox_Options.add_argument(option)
-            except:
+
-                driver = webdriver.Firefox(executable_path=binary, options=fireFoxOptions, service_log_path=os.path.devnull)
+        fireFox_Profile = webdriver.FirefoxProfile()
        fireFox_Profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )
        driver = webdriver.Firefox(
            # sets user-agent
            firefox_profile=fireFox_Profile,
            # sets various firefox settings
            options=fireFox_Options,
            # by default it will be None, if a binary location is in the config then it will use that
            firefox_binary=None if not executable else executable,
            # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    elif browser == 'chrome':
        from selenium.webdriver.chrome.options import Options
-        chrome_options = Options()
+
        chrome_options.add_argument("--headless")
        chrome_options.add_argument("--disable-gpu")
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
-        log_path = os.path.join(data_dir, 'chromedriver.log')
+        chrome_options = Options()
-        chrome_options.add_argument('--log-level=OFF')
+
-        chrome_options.add_argument(f"--user-data-dir={profile_path}")
+        ops = [
-        chrome_options.add_argument("--no-sandbox")
+            "--headless", "--disable-gpu", '--log-level=OFF',
-        chrome_options.add_argument("--window-size=1920,1080")
+            f"--user-data-dir={profile_path}", "--no-sandbox",
-        chrome_options.add_argument(f'user-agent={get_random_header()}')
+            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
-        if binary == None:
+        ]
-            if executable == None:
+
-                driver = webdriver.Chrome(options=chrome_options)
+        for option in ops:
-            else:
+            chrome_options.add_argument(option)
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+
-                cap = DesiredCapabilities.CHROME
+        cap = None
-                cap['binary_location'] = executable
+
-                driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options)
+        if executable:
-        else:
+            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-            if executable == None:
+
-                driver = webdriver.Chrome(options=chrome_options)
+            cap = DesiredCapabilities.CHROME
-            else:
+            cap['binary_location'] = executable
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+
-                cap = DesiredCapabilities.CHROME
+        driver = webdriver.Chrome(
-                cap['binary_location'] = executable
+            # sets user-agent, and various chrome settings
-                driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, service_log_path=os.path.devnull)
+            options=chrome_options,
            # by default it will be "chromedriver", if a chromedriver location is in the config then it will use that
            executable_path=(binary if binary else "chromedriver"),
            # by default it will be None, if a binary location is in the config then it will use that
            desired_capabilities=cap,
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    return driver
 def status_select(driver, url, status='hide'):
    '''
    Load ``url`` in ``driver``, optionally probing it with a HEAD request first.

    When ``status`` is 'show', a ``requests.head`` call is made before the
    page is loaded: a 503 response raises RuntimeError, otherwise the page
    is loaded and the HTTP status code is returned.
    For any other ``status`` value (including the default 'hide') the page
    is simply loaded and None is returned.

    A ``requests.ConnectionError`` raised along the way is re-raised as
    RuntimeError.
    '''
    try:
        status_code = None
        if status == 'show':
            head_response = requests.head(url)
            if head_response.status_code == 503:
                raise RuntimeError("This website's sevice is unavailable or has cloudflare on.")
            status_code = head_response.status_code
        # Every mode ends by loading the page in the browser.
        driver.get(url)
        return status_code
    except requests.ConnectionError:
        raise RuntimeError("Failed to establish a connection using the requests library.")
 def cloudflare_wait(driver):
    '''
    It waits until cloudflare has gone away before doing any further actions.
-    The way it works is by getting the title of the page 
+    The way it works is by getting the title of the page
    and as long as it is "Just a moment..." it will keep waiting.
-    This part of the code won't make the code execute slower 
+    This part of the code won't make the code execute slower
-    if the target website has not a Cloudflare redirection.
+    if the target website has no Cloudflare redirection.
-    At most it will sleep 1 second as a precaution. 
+    At most it will sleep 1 second as a precaution.
-    Also, i have made it time out after 30 seconds, useful if the target website is not responsive 
+    Also, i have made it time out after 50 seconds, useful if the target website is not responsive
    and to stop it from running infinitely.
    '''
-    abort_after = 30
+    abort_after = 50  # seconds
    start = time.time()
    title = driver.title  # title = "Just a moment..."
-    while title == "Just a moment...":
+    while "Just a moment" in title:
-        time.sleep(0.25)
+        time.sleep(0.35)
        delta = time.time() - start
        if delta >= abort_after:
-            logger.error(f'Timeout:\nCouldnt bypass cloudflare. \
+            logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
-            See the screenshot for more info:\n{get_data_dir()}/screenshot.png')
+            See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return 1
        title = driver.title
-        if not title == "Just a moment...":
+        if not "Just a moment" in title:
            break
-    time.sleep(1)  # This is necessary to make sure everything has loaded fine.
+    time.sleep(2)  # This is necessary to make sure everything has loaded fine.
    return 0
 def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    params = kwargs.get('params', {})
-    new_url = add_url_params(url, params)
+
-    driver = driver_select()
+    url = url if not params else url + '?' + urlencode(params)
-    status = status_select(driver, new_url, 'hide')
+    cached_data = check_cache(url)
-    try:
+
-        cloudflare_wait(driver)
+    if cached_data:
-        user_agent = driver.execute_script("return navigator.userAgent;")  # dirty, but allows for all sorts of things above
+        text = cached_data['data']
-        cookies = driver.get_cookies()
+        user_agent = cached_data['user_agent']
-        text = driver.page_source
+        request_type = cached_data['method']
-        driver.close()
+        cookies = cached_data['cookies']
        return SeleResponse(url, request_type, text, cookies, user_agent)
-    except:
+
-        driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
+    else:
-        driver.close()
+        driver = driver_select()
-        logger.error(f'There was a problem getting the page: {new_url}. \
+        driver.get(url)
-        See the screenshot for more info:\n{get_data_dir()}/screenshot.png')
+
        try:
            exit_code = cloudflare_wait(driver)
            user_agent = driver.execute_script("return navigator.userAgent;")
            cookies = driver.get_cookies()
            text = driver.page_source
            driver.close()
            if exit_code != 0:
                return SeleResponse(url, request_type, None, cookies, user_agent)
            seleResponse = SeleResponse(
                url, request_type,
                text, cookies,
                user_agent
            )
            cache_request(seleResponse)
            return seleResponse
        except:
            driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
            driver.close()
            logger.error(f'There was a problem getting the page: {url}.' +
                         '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return
 class SeleResponse:
@ -224,5 +323,5 @@ class SeleResponse:
        return self.text
    def __repr__(self):
-        return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES {} USERAGENT {}>'.format(
+        return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
            self.url, self.method, self.text, self.cookies, self.user_agent)