reworked some logic and also made the code more readable

2021-05-25 22:13:14 +03:00 · 2021-05-25 22:13:14 +03:00 · 1f9a7dd35f
parent 4f21f6600f
commit 1f9a7dd35f
1 changed files with 113 additions and 69 deletions
--- a/anime_downloader/sites/helpers/selescrape.py
+++ b/anime_downloader/sites/helpers/selescrape.py
@ -4,14 +4,25 @@ from urllib.parse import urlencode
 from selenium import webdriver
 from sys import platform
 import tempfile
-import os
 import logging
 import click
 import time
 import json
+import os
+
+
+def open_config():
+    from anime_downloader.config import Config
+    return Config
+

 serverLogger.setLevel(logging.ERROR)
 logger = logging.getLogger(__name__)
+TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
+data = open_config()
+
+if not os.path.isdir(TEMP_FOLDER):
+    os.makedirs(TEMP_FOLDER)


 def get_data_dir():
@ -23,14 +34,6 @@ def get_data_dir():
    return os.path.join(click.get_app_dir(APP_NAME), 'data')


-def open_config():
-    from anime_downloader.config import Config
-    return Config
-
-
-data = open_config()
-
-
 def get_browser_config():
    '''
    Decides what browser selescrape will use.
@ -63,24 +66,31 @@ def get_browser_executable():

 def get_driver_binary():
    value = data['dl']['selescrape_driver_binary_path']
-    binary_path = value.lower() if value else value
-    return binary_path
+    if value:
+        return value
+
+    return None


 def cache_request(sele_response):
    """
    This function saves the response from a Selenium request in a json.
-    It uses timestamps so that the rest of the code can know if the cache has expired or not.
+    It uses timestamps to can know if the cache has expired or not.
    """

-    file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
+    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
+
    if os.path.isfile(file):
        with open(file, 'r') as f:
            tmp_cache = json.load(f)
    else:
        tmp_cache = {}
+
    data = sele_response.__dict__
-    tmp_cache[data['url']] = {
+    url = data['url']
+    url = (url[:-1] if url and url[-1] == '/' else url)
+
+    tmp_cache[url] = {
        'data': data['text'],
        'expiry': time.time(),
        'method': data['method'],
@ -96,80 +106,111 @@ def check_cache(url):
    """
    This function checks if the cache file exists,
    if it exists then it will read the file
-    And it will verify if the cache is less than or equal to 1 hour ago
+    And it will verify if the cache is less than or equal to 30 mins ago
    If it is, it will return it as it is.
    If it isn't, it will delete the expired cache from the file and return None
    If the file doesn't exist at all it will return None
    """
-    file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
+    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
+
        with open(file, 'r') as f:
            data = json.load(f)
-        if url not in data:
+
+        # Yes, this is ugly,
+        # but its the best way that I found to find the cache
+        # when the url is not exactly the same (a slash at the end or not)
+        clean_url = (url[:-1] if url and url[-1] == '/' else url)
+        found = False
+
+        for link in data:
+            if link == clean_url:
+                url = link
+                found = True
+
+        if not found:
            return
+
        timestamp = data[url]['expiry']
-        if (time.time() - timestamp <= 3600):
+
+        if (time.time() - timestamp <= 1800):
            return data[url]
        else:
            data.pop(url, None)
+
            with open(file, 'w') as f:
                json.dump(data, f, indent=4)


 def driver_select():
    '''
-    it configures what each browser should do
-    and gives the driver variable that is used
-    to perform any actions below this function.
+    This configures what each browser should do
+    and returns the corresponding driver.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
-    driver_binary = get_driver_binary()
-    binary = None if not driver_binary else driver_binary
+    binary = get_driver_binary()
+
    if browser == 'firefox':
        fireFox_Options = webdriver.FirefoxOptions()
-        fireFox_Options.headless = True
-        fireFox_Options.add_argument('--log fatal')
-        fireFox_Profile = webdriver.FirefoxProfile()
-        fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent'])
+        ops = [
+            "--width=1920", "--height=1080",
+            "headless", "--log fatal"
+        ]

-        if not binary:
-            driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
-        else:
-            try:
-                driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
-            except:
-                driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options,
-                                           service_log_path=os.path.devnull)
+        for option in ops:
+            fireFox_Options.add_argument(option)
+
+        fireFox_Profile = webdriver.FirefoxProfile()
+        fireFox_Profile.set_preference(
+            "general.useragent.override", get_random_header()['user-agent']
+        )
+
+        driver = webdriver.Firefox(
+            # sets user-agent
+            firefox_profile=fireFox_Profile,
+            # sets various firefox settings
+            options=fireFox_Options,
+            # by default it will be None, if a chromedriver location is in the config then it will use that
+            executable_path=(binary if binary else "geckodriver"),
+            # an attempt at stopping selenium from printing a pile of garbage to the console.
+            service_log_path=os.path.devnull
+        )

    elif browser == 'chrome':
-        profile_path = os.path.join(data_dir, 'Selenium_chromium')
-        log_path = os.path.join(data_dir, 'chromedriver.log')
        from selenium.webdriver.chrome.options import Options
+
+        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = Options()
-        ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}",
-               "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"]
+
+        ops = [
+            "--headless", "--disable-gpu", '--log-level=OFF',
+            f"--user-data-dir={profile_path}", "--no-sandbox",
+            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
+        ]
+
        for option in ops:
            chrome_options.add_argument(option)

+        cap = None
+
        if not binary:
-            if not executable:
-                driver = webdriver.Chrome(options=chrome_options)
-            else:
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-                cap = DesiredCapabilities.CHROME
-                cap['binary_location'] = executable
-                driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options)
-        else:
-            if not executable:
-                driver = webdriver.Chrome(options=chrome_options)
-            else:
-                from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
-                cap = DesiredCapabilities.CHROME
-                cap['binary_location'] = executable
-                driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options,
-                                          service_log_path=os.path.devnull)
+            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
+
+            cap = DesiredCapabilities.CHROME
+            cap['binary_location'] = executable
+
+        driver = webdriver.Chrome(
+            # sets user-agent, and various chrome settings
+            options=chrome_options,
+            # by default it will be None, if a chromedriver location is in the config then it will use that
+            executable_path=binary,
+            # by default it will be None, if a binary location is in the config then it will use that
+            desired_capabilities=cap,
+            # an attempt at stopping selenium from printing a pile of garbage to the console.
+            service_log_path=os.path.devnull
+        )
    return driver


@ -184,19 +225,19 @@ def cloudflare_wait(driver):
    Also, i have made it time out after 50 seconds, useful if the target website is not responsive
    and to stop it from running infinitely.
    '''
-    abort_after = 50
+    abort_after = 50  # seconds
    start = time.time()

    title = driver.title  # title = "Just a moment..."
-    while title == "Just a moment...":
-        time.sleep(0.25)
+    while "Just a moment" in title:
+        time.sleep(0.35)
        delta = time.time() - start
        if delta >= abort_after:
            logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
            See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return 1
        title = driver.title
-        if not title == "Just a moment...":
+        if not "Just a moment" in title:
            break
    time.sleep(2)  # This is necessary to make sure everything has loaded fine.
    return 0
@ -204,10 +245,11 @@ def cloudflare_wait(driver):

 def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    params = kwargs.get('params', {})
+
    url = url if not params else url + '?' + urlencode(params)
-    check_caches = check_cache(url)
-    if bool(check_caches):
-        cached_data = check_caches
+    cached_data = check_cache(url)
+
+    if cached_data:
        text = cached_data['data']
        user_agent = cached_data['user_agent']
        request_type = cached_data['method']
@ -215,28 +257,30 @@ def request(request_type, url, **kwargs):  # Headers not yet supported , headers
        return SeleResponse(url, request_type, text, cookies, user_agent)

    else:
-
        driver = driver_select()
        driver.get(url)

        try:
-
            exit_code = cloudflare_wait(driver)
            user_agent = driver.execute_script("return navigator.userAgent;")
            cookies = driver.get_cookies()
            text = driver.page_source
            driver.close()
-            if exit_code == 0:
-                pass
-            else:
+
+            if exit_code != 0:
                return SeleResponse(url, request_type, None, cookies, user_agent)

-            seleResponse = SeleResponse(url, request_type, text, cookies, user_agent)
+            seleResponse = SeleResponse(
+                url, request_type,
+                text, cookies,
+                user_agent
+            )
+
            cache_request(seleResponse)
            return seleResponse

        except:
-            driver.save_screenshot(f"{get_data_dir()}/screenshot.png");
+            driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
            driver.close()
            logger.error(f'There was a problem getting the page: {url}.' +
                         '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')