diff --git a/anime_downloader/sites/helpers/request.py b/anime_downloader/sites/helpers/request.py index 6bec3f2..c1a00b5 100644 --- a/anime_downloader/sites/helpers/request.py +++ b/anime_downloader/sites/helpers/request.py @@ -46,6 +46,8 @@ def setup(func): cf : bool cf if True performs the request through cfscrape. For cloudflare protected sites. + sel : bool + sel if True performs the request through selescrape (selenium). referer : str a url sent as referer in request headers ''' @@ -57,6 +59,7 @@ def setup(func): from selenium import webdriver from anime_downloader.sites.helpers import selescrape sess = selescrape + sess.cache = cache except ImportError: sess = cf_session logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .") @@ -107,6 +110,8 @@ def get(url: str, cf : bool cf if True performs the request through cfscrape. For cloudflare protected sites. + sel : bool + sel if True performs the request through selescrape (selenium). 
referer : str a url sent as referer in request headers ''' diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 0a5e6ec..ec4891d 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -1,31 +1,14 @@ -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger -from selenium.webdriver.support.ui import WebDriverWait from anime_downloader.const import get_random_header -from selenium.webdriver.common.by import By from urllib.parse import urlencode -from urllib.parse import urlsplit from selenium import webdriver -from bs4 import BeautifulSoup -from logging import exception from sys import platform -import requests -import os +import tempfile import logging import click import time import json -serverLogger.setLevel(logging.ERROR) -logger = logging.getLogger(__name__) - - -def get_data_dir(): - ''' - Gets the folder directory selescrape will store data, - such as cookies or browser extensions and logs. - ''' - APP_NAME = 'anime downloader' - return os.path.join(click.get_app_dir(APP_NAME), 'data') +import os def open_config(): @@ -33,8 +16,24 @@ def open_config(): return Config +cache = False +serverLogger.setLevel(logging.ERROR) +logger = logging.getLogger(__name__) +TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache') data = open_config() +if not os.path.isdir(TEMP_FOLDER): + os.makedirs(TEMP_FOLDER) + + +def get_data_dir(): + ''' + Gets the folder directory selescrape will store data, + such as cookies or browser extensions and logs. 
+ ''' + APP_NAME = 'anime downloader' + return os.path.join(click.get_app_dir(APP_NAME), 'data') + def get_browser_config(): ''' @@ -50,148 +49,248 @@ def get_browser_config(): browser = os_browser[a] else: browser = 'chrome' + value = data['dl']['selescrape_browser'] value = value.lower() if value else value + if value in ['chrome', 'firefox']: browser = value + return browser def get_browser_executable(): value = data['dl']['selescrape_browser_executable_path'] executable_value = value.lower() if value else value - return executable_value + if executable_value: + return executable_value def get_driver_binary(): value = data['dl']['selescrape_driver_binary_path'] - binary_path = value.lower() if value else value - return binary_path + if value: + return value -def add_url_params(url, params): - return url if not params else url + '?' + urlencode(params) +def cache_request(sele_response): + """ + This function saves the response from a Selenium request in a JSON file. + It stores a timestamp so that it can tell whether the cache has expired. + """ + if not cache: + return + + file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') + + if os.path.isfile(file): + with open(file, 'r') as f: + tmp_cache = json.load(f) + else: + tmp_cache = {} + + data = sele_response.__dict__ + url = data['url'] + url = (url[:-1] if url and url[-1] == '/' else url) + + tmp_cache[url] = { + 'data': data['text'], + 'expiry': time.time(), + 'method': data['method'], + 'cookies': data['cookies'], + 'user_agent': data['user_agent'] + } + + with open(file, 'w') as f: + json.dump(tmp_cache, f, indent=4) + + +def check_cache(url): + """ + This function checks if the cache file exists, + if it exists then it will read the file + And it will verify if the cache is less than or equal to 30 mins old + If it is, it will return it as it is. 
+ If it isn't, it will delete the expired cache from the file and return None + If the file doesn't exist at all it will return None + """ + if not cache: + return + file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') + if os.path.isfile(file): + + with open(file, 'r') as f: + data = json.load(f) + + # Yes, this is ugly, + # but its the best way that I found to find the cache + # when the url is not exactly the same (a slash at the end or not) + clean_url = (url[:-1] if url and url[-1] == '/' else url) + found = False + + for link in data: + if link == clean_url: + url = link + found = True + + if not found: + return + + timestamp = data[url]['expiry'] + + if (time.time() - timestamp <= 1800): + return data[url] + else: + data.pop(url, None) + + with open(file, 'w') as f: + json.dump(data, f, indent=4) def driver_select(): ''' - it configures what each browser should do - and gives the driver variable that is used - to perform any actions below this function. + This configures what each browser should do + and returns the corresponding driver. 
''' browser = get_browser_config() data_dir = get_data_dir() executable = get_browser_executable() - driver_binary = get_driver_binary() - binary = None if not driver_binary else driver_binary + binary = get_driver_binary() + if browser == 'firefox': - fireFoxOptions = webdriver.FirefoxOptions() - fireFoxOptions.headless = True - fireFoxOptions.add_argument('--log fatal') - if binary == None: - driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull) - else: - try: - driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull) - except: - driver = webdriver.Firefox(executable_path=binary, options=fireFoxOptions, service_log_path=os.path.devnull) + fireFox_Options = webdriver.FirefoxOptions() + ops = [ + "--width=1920", "--height=1080", + "-headless", "--log fatal" + ] + + for option in ops: + fireFox_Options.add_argument(option) + + fireFox_Profile = webdriver.FirefoxProfile() + fireFox_Profile.set_preference( + "general.useragent.override", get_random_header()['user-agent'] + ) + + driver = webdriver.Firefox( + # sets user-agent + firefox_profile=fireFox_Profile, + # sets various firefox settings + options=fireFox_Options, + # by default it will be None, if a binary location is in the config then it will use that + firefox_binary=None if not executable else executable, + # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that + executable_path=(binary if binary else "geckodriver"), + # an attempt at stopping selenium from printing a pile of garbage to the console. 
+ service_log_path=os.path.devnull + ) + elif browser == 'chrome': from selenium.webdriver.chrome.options import Options - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--disable-gpu") + profile_path = os.path.join(data_dir, 'Selenium_chromium') - log_path = os.path.join(data_dir, 'chromedriver.log') - chrome_options.add_argument('--log-level=OFF') - chrome_options.add_argument(f"--user-data-dir={profile_path}") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument(f'user-agent={get_random_header()}') - if binary == None: - if executable == None: - driver = webdriver.Chrome(options=chrome_options) - else: - from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - cap = DesiredCapabilities.CHROME - cap['binary_location'] = executable - driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options) - else: - if executable == None: - driver = webdriver.Chrome(options=chrome_options) - else: - from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - cap = DesiredCapabilities.CHROME - cap['binary_location'] = executable - driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, service_log_path=os.path.devnull) + chrome_options = Options() + + ops = [ + "--headless", "--disable-gpu", '--log-level=OFF', + f"--user-data-dir={profile_path}", "--no-sandbox", + "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}" # noqa + ] + + for option in ops: + chrome_options.add_argument(option) + + cap = None + + if executable: + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + + cap = DesiredCapabilities.CHROME + cap['binary_location'] = executable + + driver = webdriver.Chrome( + # sets user-agent, and various chrome settings + options=chrome_options, + # by default it will be "chromedriver", 
if a chromedriver location is in the config then it will use that + executable_path=(binary if binary else "chromedriver"), + # by default it will be None, if a binary location is in the config then it will use that + desired_capabilities=cap, + # an attempt at stopping selenium from printing a pile of garbage to the console. + service_log_path=os.path.devnull + ) return driver -def status_select(driver, url, status='hide'): - ''' - For now it doesnt do what its name suggests, - I have planned to add a status reporter of the http response code. - This part of the code is not removed because it is part of its core. - Treat it like it isnt here. - ''' - try: - if status == 'hide': - driver.get(url) - elif status == 'show': - r = requests.head(url) - if r.status_code == 503: - raise RuntimeError("This website's sevice is unavailable or has cloudflare on.") - driver.get(url) - return r.status_code - else: - driver.get(url) - except requests.ConnectionError: - raise RuntimeError("Failed to establish a connection using the requests library.") - - def cloudflare_wait(driver): ''' It waits until cloudflare has gone away before doing any further actions. - The way it works is by getting the title of the page + The way it works is by getting the title of the page and as long as it is "Just a moment..." it will keep waiting. - This part of the code won't make the code execute slower - if the target website has not a Cloudflare redirection. - At most it will sleep 1 second as a precaution. - Also, i have made it time out after 30 seconds, useful if the target website is not responsive + This part of the code won't make the code execute slower + if the target website has no Cloudflare redirection. + At most it will sleep 1 second as a precaution. + Also, i have made it time out after 50 seconds, useful if the target website is not responsive and to stop it from running infinitely. 
''' - abort_after = 30 + abort_after = 50 # seconds start = time.time() title = driver.title # title = "Just a moment..." - while title == "Just a moment...": - time.sleep(0.25) + while "Just a moment" in title: + time.sleep(0.35) delta = time.time() - start if delta >= abort_after: - logger.error(f'Timeout:\nCouldnt bypass cloudflare. \ - See the screenshot for more info:\n{get_data_dir()}/screenshot.png') + logger.error(f'Timeout:\tCouldnt bypass cloudflare. \ + See the screenshot for more info:\t{get_data_dir()}/screenshot.png') + return 1 title = driver.title - if not title == "Just a moment...": + if not "Just a moment" in title: break - time.sleep(1) # This is necessary to make sure everything has loaded fine. + time.sleep(2) # This is necessary to make sure everything has loaded fine. + return 0 def request(request_type, url, **kwargs): # Headers not yet supported , headers={} params = kwargs.get('params', {}) - new_url = add_url_params(url, params) - driver = driver_select() - status = status_select(driver, new_url, 'hide') - try: - cloudflare_wait(driver) - user_agent = driver.execute_script("return navigator.userAgent;") # dirty, but allows for all sorts of things above - cookies = driver.get_cookies() - text = driver.page_source - driver.close() + + url = url if not params else url + '?' + urlencode(params) + cached_data = check_cache(url) + + if cached_data: + text = cached_data['data'] + user_agent = cached_data['user_agent'] + request_type = cached_data['method'] + cookies = cached_data['cookies'] return SeleResponse(url, request_type, text, cookies, user_agent) - except: - driver.save_screenshot(f"{get_data_dir()}/screenshot.png") - driver.close() - logger.error(f'There was a problem getting the page: {new_url}. 
\ - See the screenshot for more info:\n{get_data_dir()}/screenshot.png') + + else: + driver = driver_select() + driver.get(url) + + try: + exit_code = cloudflare_wait(driver) + user_agent = driver.execute_script("return navigator.userAgent;") + cookies = driver.get_cookies() + text = driver.page_source + driver.close() + + if exit_code != 0: + return SeleResponse(url, request_type, None, cookies, user_agent) + + seleResponse = SeleResponse( + url, request_type, + text, cookies, + user_agent + ) + + cache_request(seleResponse) + return seleResponse + + except: + driver.save_screenshot(f"{get_data_dir()}/screenshot.png") + driver.close() + logger.error(f'There was a problem getting the page: {url}.' + + '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') + return class SeleResponse: @@ -224,5 +323,5 @@ class SeleResponse: return self.text def __repr__(self): - return ''.format( + return ''.format( self.url, self.method, self.text, self.cookies, self.user_agent)