reworked some logic and also made the code more readable

2021-05-25 22:13:14 +03:00 · 2021-05-25 22:13:14 +03:00 · 1f9a7dd35f
parent 4f21f6600f
commit 1f9a7dd35f
1 changed files with 113 additions and 69 deletions
--- a/anime_downloader/sites/helpers/selescrape.py
+++ b/anime_downloader/sites/helpers/selescrape.py
@ -4,14 +4,25 @@ from urllib.parse import urlencode
 from selenium import webdriver
 from sys import platform
 import tempfile
 import os
 import logging
 import click
 import time
 import json
 import os
 def open_config():
    from anime_downloader.config import Config
    return Config
 serverLogger.setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
 TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
 data = open_config()
 if not os.path.isdir(TEMP_FOLDER):
    os.makedirs(TEMP_FOLDER)
 def get_data_dir():
@ -23,14 +34,6 @@ def get_data_dir():
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
 def open_config():
    from anime_downloader.config import Config
    return Config
 data = open_config()
 def get_browser_config():
    '''
    Decides what browser selescrape will use.
@ -63,24 +66,31 @@ def get_browser_executable():
 def get_driver_binary():
    value = data['dl']['selescrape_driver_binary_path']
-    binary_path = value.lower() if value else value
+    if value:
-    return binary_path
+        return value
    return None
 def cache_request(sele_response):
    """
    This function saves the response from a Selenium request in a json.
-    It uses timestamps so that the rest of the code can know if the cache has expired or not.
+    It uses timestamps to can know if the cache has expired or not.
    """
-    file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
+    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
        with open(file, 'r') as f:
            tmp_cache = json.load(f)
    else:
        tmp_cache = {}
    data = sele_response.__dict__
-    tmp_cache[data['url']] = {
+    url = data['url']
    url = (url[:-1] if url and url[-1] == '/' else url)
    tmp_cache[url] = {
        'data': data['text'],
        'expiry': time.time(),
        'method': data['method'],
@ -96,80 +106,111 @@ def check_cache(url):
    """
    This function checks if the cache file exists,
    if it exists then it will read the file
-    And it will verify if the cache is less than or equal to 1 hour ago
+    And it will verify if the cache is less than or equal to 30 mins ago
    If it is, it will return it as it is.
    If it isn't, it will delete the expired cache from the file and return None
    If the file doesn't exist at all it will return None
    """
-    file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
+    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
        with open(file, 'r') as f:
            data = json.load(f)
-        if url not in data:
+
        # Yes, this is ugly,
        # but its the best way that I found to find the cache
        # when the url is not exactly the same (a slash at the end or not)
        clean_url = (url[:-1] if url and url[-1] == '/' else url)
        found = False
        for link in data:
            if link == clean_url:
                url = link
                found = True
        if not found:
            return
        timestamp = data[url]['expiry']
-        if (time.time() - timestamp <= 3600):
+
        if (time.time() - timestamp <= 1800):
            return data[url]
        else:
            data.pop(url, None)
            with open(file, 'w') as f:
                json.dump(data, f, indent=4)
 def driver_select():
    '''
-    it configures what each browser should do
+    This configures what each browser should do
-    and gives the driver variable that is used
+    and returns the corresponding driver.
    to perform any actions below this function.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
-    driver_binary = get_driver_binary()
+    binary = get_driver_binary()
-    binary = None if not driver_binary else driver_binary
+
    if browser == 'firefox':
        fireFox_Options = webdriver.FirefoxOptions()
-        fireFox_Options.headless = True
+        ops = [
-        fireFox_Options.add_argument('--log fatal')
+            "--width=1920", "--height=1080",
-        fireFox_Profile = webdriver.FirefoxProfile()
+            "headless", "--log fatal"
-        fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent'])
+        ]
-        if not binary:
+        for option in ops:
-            driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
+            fireFox_Options.add_argument(option)
-        else:
+
-            try:
+        fireFox_Profile = webdriver.FirefoxProfile()
-                driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
+        fireFox_Profile.set_preference(
-            except:
+            "general.useragent.override", get_random_header()['user-agent']
-                driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options,
+        )
-                                           service_log_path=os.path.devnull)
+
        driver = webdriver.Firefox(
            # sets user-agent
            firefox_profile=fireFox_Profile,
            # sets various firefox settings
            options=fireFox_Options,
            # by default it will be None, if a chromedriver location is in the config then it will use that
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    elif browser == 'chrome':
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        log_path = os.path.join(data_dir, 'chromedriver.log')
        from selenium.webdriver.chrome.options import Options
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = Options()
-        ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}",
+
-               "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"]
+        ops = [
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
        ]
        for option in ops:
            chrome_options.add_argument(option)
        cap = None
        if not binary:
-            if not executable:
+            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-                driver = webdriver.Chrome(options=chrome_options)
+
-            else:
+            cap = DesiredCapabilities.CHROME
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+            cap['binary_location'] = executable
-                cap = DesiredCapabilities.CHROME
+
-                cap['binary_location'] = executable
+        driver = webdriver.Chrome(
-                driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options)
+            # sets user-agent, and various chrome settings
-        else:
+            options=chrome_options,
-            if not executable:
+            # by default it will be None, if a chromedriver location is in the config then it will use that
-                driver = webdriver.Chrome(options=chrome_options)
+            executable_path=binary,
-            else:
+            # by default it will be None, if a binary location is in the config then it will use that
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+            desired_capabilities=cap,
-                cap = DesiredCapabilities.CHROME
+            # an attempt at stopping selenium from printing a pile of garbage to the console.
-                cap['binary_location'] = executable
+            service_log_path=os.path.devnull
-                driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options,
+        )
                                          service_log_path=os.path.devnull)
    return driver
@ -184,19 +225,19 @@ def cloudflare_wait(driver):
    Also, i have made it time out after 50 seconds, useful if the target website is not responsive
    and to stop it from running infinitely.
    '''
-    abort_after = 50
+    abort_after = 50  # seconds
    start = time.time()
    title = driver.title  # title = "Just a moment..."
-    while title == "Just a moment...":
+    while "Just a moment" in title:
-        time.sleep(0.25)
+        time.sleep(0.35)
        delta = time.time() - start
        if delta >= abort_after:
            logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
            See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return 1
        title = driver.title
-        if not title == "Just a moment...":
+        if not "Just a moment" in title:
            break
    time.sleep(2)  # This is necessary to make sure everything has loaded fine.
    return 0
@ -204,10 +245,11 @@ def cloudflare_wait(driver):
 def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    params = kwargs.get('params', {})
    url = url if not params else url + '?' + urlencode(params)
-    check_caches = check_cache(url)
+    cached_data = check_cache(url)
-    if bool(check_caches):
+
-        cached_data = check_caches
+    if cached_data:
        text = cached_data['data']
        user_agent = cached_data['user_agent']
        request_type = cached_data['method']
@ -215,28 +257,30 @@ def request(request_type, url, **kwargs):  # Headers not yet supported , headers
        return SeleResponse(url, request_type, text, cookies, user_agent)
    else:
        driver = driver_select()
        driver.get(url)
        try:
            exit_code = cloudflare_wait(driver)
            user_agent = driver.execute_script("return navigator.userAgent;")
            cookies = driver.get_cookies()
            text = driver.page_source
            driver.close()
-            if exit_code == 0:
+
-                pass
+            if exit_code != 0:
            else:
                return SeleResponse(url, request_type, None, cookies, user_agent)
-            seleResponse = SeleResponse(url, request_type, text, cookies, user_agent)
+            seleResponse = SeleResponse(
                url, request_type,
                text, cookies,
                user_agent
            )
            cache_request(seleResponse)
            return seleResponse
        except:
-            driver.save_screenshot(f"{get_data_dir()}/screenshot.png");
+            driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
            driver.close()
            logger.error(f'There was a problem getting the page: {url}.' +
                         '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')