Merge pull request #503 from ArjixWasTaken/patch-16

Added a cache system for SeleScrape and improved SeleScrape generally
master
Arjix 2021-08-20 16:17:26 +03:00 committed by GitHub
commit 0fc9613400
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 217 additions and 113 deletions

View File

@ -46,6 +46,8 @@ def setup(func):
cf : bool
cf if True performs the request through cfscrape.
For cloudflare protected sites.
sel : bool
sel if True performs the request through selescrape (selenium).
referer : str
a url sent as referer in request headers
'''
@ -57,6 +59,7 @@ def setup(func):
from selenium import webdriver
from anime_downloader.sites.helpers import selescrape
sess = selescrape
sess.cache = cache
except ImportError:
sess = cf_session
logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .")
@ -107,6 +110,8 @@ def get(url: str,
cf : bool
cf if True performs the request through cfscrape.
For cloudflare protected sites.
sel : bool
sel if True performs the request through selescrape (selenium).
referer : str
a url sent as referer in request headers
'''

View File

@ -1,31 +1,14 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
from selenium.webdriver.support.ui import WebDriverWait
from anime_downloader.const import get_random_header
from selenium.webdriver.common.by import By
from urllib.parse import urlencode
from urllib.parse import urlsplit
from selenium import webdriver
from bs4 import BeautifulSoup
from logging import exception
from sys import platform
import requests
import os
import tempfile
import logging
import click
import time
import json
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
def get_data_dir():
    '''
    Return the folder where selescrape stores its data,
    such as cookies, browser extensions and logs.
    '''
    APP_NAME = 'anime downloader'
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
import os
def open_config():
@ -33,8 +16,24 @@ def open_config():
return Config
# Module-level state, configured at import time.
cache = False  # toggled by the caller (see sites.helpers.request) to enable on-disk caching
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Folder holding the cached Selenium responses, shared across runs.
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
# NOTE(review): reads the user config at import time — side effect on import.
data = open_config()

if not os.path.isdir(TEMP_FOLDER):
    os.makedirs(TEMP_FOLDER)
def get_data_dir():
    '''
    Return the folder where selescrape stores its data,
    such as cookies, browser extensions and logs.
    '''
    APP_NAME = 'anime downloader'
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
def get_browser_config():
'''
@ -50,148 +49,248 @@ def get_browser_config():
browser = os_browser[a]
else:
browser = 'chrome'
value = data['dl']['selescrape_browser']
value = value.lower() if value else value
if value in ['chrome', 'firefox']:
browser = value
return browser
def get_browser_executable():
    '''
    Return the browser executable path from the user config,
    or None when the option is unset.
    '''
    value = data['dl']['selescrape_browser_executable_path']
    # NOTE(review): lower-casing a filesystem path is suspect on
    # case-sensitive systems — confirm this is intended.
    executable_value = value.lower() if value else value
    if executable_value:
        return executable_value
def get_driver_binary():
    '''
    Return the webdriver binary path from the user config,
    or None when the option is unset.
    '''
    value = data['dl']['selescrape_driver_binary_path']
    if value:
        return value
def add_url_params(url, params):
    '''
    Return `url` with `params` appended as a URL-encoded query string.

    The url is returned unchanged when `params` is empty or None.
    '''
    return url if not params else url + '?' + urlencode(params)
def cache_request(sele_response):
    """
    Save the response from a Selenium request in a JSON cache file.

    A timestamp is stored with each entry so check_cache() can tell
    whether the cached response has expired. Does nothing when the
    module-level `cache` flag is False.
    """
    if not cache:
        return
    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
        with open(file, 'r') as f:
            tmp_cache = json.load(f)
    else:
        tmp_cache = {}
    # renamed from `data` to avoid shadowing the module-level config dict
    response = sele_response.__dict__
    url = response['url']
    # Normalise the URL (strip one trailing slash) so check_cache()
    # hits regardless of whether the caller appended a "/".
    url = url[:-1] if url and url[-1] == '/' else url
    tmp_cache[url] = {
        'data': response['text'],
        'expiry': time.time(),
        'method': response['method'],
        'cookies': response['cookies'],
        'user_agent': response['user_agent']
    }
    with open(file, 'w') as f:
        json.dump(tmp_cache, f, indent=4)
def check_cache(url):
    """
    Look up `url` in the JSON cache file.

    Returns the cached entry if it is at most 30 minutes old.
    An expired entry is deleted from the file and None is returned.
    Also returns None when caching is disabled, the cache file does
    not exist, or the url has no entry.
    """
    if not cache:
        return
    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if not os.path.isfile(file):
        return
    with open(file, 'r') as f:
        # renamed from `data` to avoid shadowing the module-level config dict
        cached = json.load(f)
    # Normalise the URL the same way cache_request() does, so a
    # trailing slash does not cause a cache miss.
    clean_url = url[:-1] if url and url[-1] == '/' else url
    if clean_url not in cached:
        return
    timestamp = cached[clean_url]['expiry']
    if time.time() - timestamp <= 1800:  # 30 minutes
        return cached[clean_url]
    # Entry expired: drop it and rewrite the cache file.
    cached.pop(clean_url, None)
    with open(file, 'w') as f:
        json.dump(cached, f, indent=4)
def driver_select():
    '''
    Configure what each supported browser (firefox/chrome) should do
    and return the corresponding Selenium webdriver.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
    binary = get_driver_binary()
    if browser == 'firefox':
        fireFox_Options = webdriver.FirefoxOptions()
        ops = [
            "--width=1920", "--height=1080",
            "-headless", "--log fatal"
        ]
        for option in ops:
            fireFox_Options.add_argument(option)
        fireFox_Profile = webdriver.FirefoxProfile()
        fireFox_Profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )
        driver = webdriver.Firefox(
            # sets user-agent
            firefox_profile=fireFox_Profile,
            # sets various firefox settings
            options=fireFox_Options,
            # by default it will be None, if a binary location is in the config then it will use that
            firefox_binary=None if not executable else executable,
            # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    elif browser == 'chrome':
        from selenium.webdriver.chrome.options import Options
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = Options()
        ops = [
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
        ]
        for option in ops:
            chrome_options.add_argument(option)
        cap = None
        if executable:
            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
            cap = DesiredCapabilities.CHROME
            cap['binary_location'] = executable
        driver = webdriver.Chrome(
            # sets user-agent, and various chrome settings
            options=chrome_options,
            # by default it will be "chromedriver", if a chromedriver location is in the config then it will use that
            executable_path=(binary if binary else "chromedriver"),
            # by default it will be None, if a binary location is in the config then it will use that
            desired_capabilities=cap,
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    return driver
def status_select(driver, url, status='hide'):
    '''
    Navigate `driver` to `url`.

    For now it doesn't do what its name suggests: a status reporter of
    the HTTP response code is planned. With status='show' a HEAD request
    is issued first and its status code returned; any other value simply
    loads the page. This function is kept because it is part of the core.

    Raises RuntimeError on a 503 response or on a connection failure.
    '''
    try:
        if status == 'hide':
            driver.get(url)
        elif status == 'show':
            r = requests.head(url)
            if r.status_code == 503:
                # fixed typo: "sevice" -> "service"
                raise RuntimeError("This website's service is unavailable or has cloudflare on.")
            driver.get(url)
            return r.status_code
        else:
            driver.get(url)
    except requests.ConnectionError:
        raise RuntimeError("Failed to establish a connection using the requests library.")
def cloudflare_wait(driver):
    '''
    Wait until the Cloudflare challenge has gone away before doing any
    further actions.

    It works by polling the page title: as long as it contains
    "Just a moment" it keeps waiting. If the target website has no
    Cloudflare redirection this only costs the final settle sleep.
    Times out after 50 seconds, useful if the target website is not
    responsive, and to stop it from running infinitely.

    Returns 0 on success and 1 on timeout.
    '''
    abort_after = 50  # seconds
    start = time.time()
    title = driver.title  # title = "Just a moment..."
    while "Just a moment" in title:
        time.sleep(0.35)
        if time.time() - start >= abort_after:
            logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return 1
        title = driver.title
    time.sleep(2)  # This is necessary to make sure everything has loaded fine.
    return 0
def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    '''
    Perform a request through selenium, consulting the on-disk cache first.

    Returns a SeleResponse; its `text` is None when the Cloudflare wait
    timed out, and the return is None when the page load itself failed
    (a screenshot is saved for debugging in that case).
    '''
    params = kwargs.get('params', {})
    # consistency: reuse the add_url_params helper instead of inlining urlencode
    url = add_url_params(url, params)

    cached_data = check_cache(url)
    if cached_data:
        # Serve the whole response (body, cookies, UA, method) from cache.
        text = cached_data['data']
        user_agent = cached_data['user_agent']
        request_type = cached_data['method']
        cookies = cached_data['cookies']
        return SeleResponse(url, request_type, text, cookies, user_agent)

    driver = driver_select()
    driver.get(url)
    try:
        exit_code = cloudflare_wait(driver)
        # dirty, but allows for all sorts of things above
        user_agent = driver.execute_script("return navigator.userAgent;")
        cookies = driver.get_cookies()
        text = driver.page_source
        driver.close()
        if exit_code != 0:
            # Cloudflare wait timed out: return a response with no body.
            return SeleResponse(url, request_type, None, cookies, user_agent)
        seleResponse = SeleResponse(
            url, request_type,
            text, cookies,
            user_agent
        )
        cache_request(seleResponse)
        return seleResponse
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
        driver.close()
        # fixed: second line was not an f-string, so {get_data_dir()} was printed literally
        logger.error(f'There was a problem getting the page: {url}.'
                     f'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
        return
class SeleResponse:
@ -224,5 +323,5 @@ class SeleResponse:
return self.text
def __repr__(self):
    # Debug representation exposing all response fields.
    return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
        self.url, self.method, self.text, self.cookies, self.user_agent)