From e3e7a82a25a126129023669caebe364a6ccf2796 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Tue, 1 Sep 2020 23:55:51 +0300 Subject: [PATCH 01/21] added a cache system for selescrape --- anime_downloader/sites/helpers/selescrape.py | 86 ++++++++++++++++---- 1 file changed, 72 insertions(+), 14 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 2d90f74..62486c7 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -15,6 +15,7 @@ import logging import click import time import json + serverLogger.setLevel(logging.ERROR) logger = logging.getLogger(__name__) @@ -73,6 +74,49 @@ def add_url_params(url, params): return url if not params else url + '?' + urlencode(params) + +def cache_request(url, request_type, response, cookies, user_agent): + timestamp = { + 'year': time.localtime().tm_year, + 'month': time.localtime().tm_mon, + 'day': time.localtime().tm_mday, + 'hour': time.localtime().tm_hour, + 'minute': time.localtime().tm_min + } + + tmp_cache = {} + tmp_cache[url] = { + 'data': response, + 'time': timestamp, + 'type': request_type, + 'cookies': cookies, + 'user_agent': user_agent + } + + with open(os.path.join(get_data_dir(), 'cached_requests.json'), 'w') as f: + json.dump(tmp_cache, f, indent=4) + +def check_cache(url): + file = os.path.join(get_data_dir(), 'cached_requests.json') + if os.path.isfile(file): + with open(file, 'r') as f: + data = json.loads(f.read()) + try: + cached_request = data[url] + except KeyError: + return None + timestamp = cached_request['time'] + if (timestamp['year'] == time.localtime().tm_year and + timestamp['month'] == time.localtime().tm_mon and + timestamp['day'] == time.localtime().tm_mday and + time.localtime().tm_hour - timestamp['hour'] <= 1): + return cached_request + else: + return None + else: + return None + + def driver_select(): # ''' it configures what 
each browser should do @@ -177,21 +221,35 @@ def cloudflare_wait(driver): def request(request_type, url, **kwargs): #Headers not yet supported , headers={} params = kwargs.get('params', {}) - new_url = add_url_params(url, params) - driver = driver_select() - status = status_select(driver, new_url, 'hide') - try: - cloudflare_wait(driver) - user_agent = driver.execute_script("return navigator.userAgent;") #dirty, but allows for all sorts of things above - cookies = driver.get_cookies() - text = driver.page_source - driver.close() + url = add_url_params(url, params) + if bool(check_cache(url)): + cached_data = check_cache(url) + text = cached_data['data'] + user_agent = cached_data['user_agent'] + request_type = cached_data['type'] + cookies = cached_data['cookies'] return SeleResponse(url, request_type, text, cookies, user_agent) - except: - driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); - driver.close() - logger.error(f'There was a problem getting the page: {new_url}. \ - See the screenshot for more info:\n{get_data_dir()}/screenshot.png') + + else: + + driver = driver_select() + status = status_select(driver, url, 'hide') + + try: + cloudflare_wait(driver) + user_agent = driver.execute_script("return navigator.userAgent;") #dirty, but allows for all sorts of things above + cookies = driver.get_cookies() + text = driver.page_source + driver.close() + cache_request(url, request_type, text, cookies, user_agent) + return SeleResponse(url, request_type, text, cookies, user_agent) + + except: + driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); + driver.close() + logger.error(f'There was a problem getting the page: {url}. 
\ + See the screenshot for more info:\t{get_data_dir()}/screenshot.png') + class SeleResponse: From 7a6aa3b494fca7ff70ec8665d53871067c6b5527 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:05:29 +0300 Subject: [PATCH 02/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 62486c7..1166dc0 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -112,6 +112,9 @@ def check_cache(url): time.localtime().tm_hour - timestamp['hour'] <= 1): return cached_request else: + old_cache = cached_request.pop(url, None) + with open(file, 'w') as f: + json.dump(cached_request, f, indent=4) return None else: return None From 2da7b111913960769cbf35531774daf343fc89c9 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:39:58 +0300 Subject: [PATCH 03/21] changed to Unix Timestamps. --- anime_downloader/sites/helpers/selescrape.py | 23 ++++++++------------ 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 1166dc0..2649615 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -76,18 +76,16 @@ def add_url_params(url, params): def cache_request(url, request_type, response, cookies, user_agent): - timestamp = { - 'year': time.localtime().tm_year, - 'month': time.localtime().tm_mon, - 'day': time.localtime().tm_mday, - 'hour': time.localtime().tm_hour, - 'minute': time.localtime().tm_min - } + """ + This function saves the response from a Selenium request in a json. + It uses timestamps so that the rest of the code + can know if its an old cache or a new one. 
+ """ tmp_cache = {} tmp_cache[url] = { 'data': response, - 'time': timestamp, + 'expiry': time.time(), 'type': request_type, 'cookies': cookies, 'user_agent': user_agent @@ -105,14 +103,11 @@ def check_cache(url): cached_request = data[url] except KeyError: return None - timestamp = cached_request['time'] - if (timestamp['year'] == time.localtime().tm_year and - timestamp['month'] == time.localtime().tm_mon and - timestamp['day'] == time.localtime().tm_mday and - time.localtime().tm_hour - timestamp['hour'] <= 1): + timestamp = cached_request['expiry'] + if (time.time() - timestamp <= 3600): return cached_request else: - old_cache = cached_request.pop(url, None) + print(cached_request.pop(url, None)) with open(file, 'w') as f: json.dump(cached_request, f, indent=4) return None From 5ffb63555b081b65e000af1a8713e26db0c39a21 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 00:50:59 +0300 Subject: [PATCH 04/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 2649615..1ab6860 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -247,6 +247,7 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ driver.close() logger.error(f'There was a problem getting the page: {url}. \ See the screenshot for more info:\t{get_data_dir()}/screenshot.png') + exit() From 49cee92c9849079f7ad8ee762bd8fdf5cba3cff2 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 13:00:33 +0300 Subject: [PATCH 05/21] bug fix Now properly handles the saving of the cached requests without overwriting the old ones. 
--- anime_downloader/sites/helpers/selescrape.py | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 1ab6860..2fbdce2 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -81,8 +81,12 @@ def cache_request(url, request_type, response, cookies, user_agent): It uses timestamps so that the rest of the code can know if its an old cache or a new one. """ - - tmp_cache = {} + file = os.path.join(get_data_dir(), 'cached_requests.json') + if os.path.isfile(file): + with open(file, 'r') as f: + tmp_cache = json.loads(f.read()) + else: + tmp_cache = {} tmp_cache[url] = { 'data': response, 'expiry': time.time(), @@ -91,7 +95,7 @@ def cache_request(url, request_type, response, cookies, user_agent): 'user_agent': user_agent } - with open(os.path.join(get_data_dir(), 'cached_requests.json'), 'w') as f: + with open(file, 'w') as f: json.dump(tmp_cache, f, indent=4) def check_cache(url): @@ -100,22 +104,22 @@ def check_cache(url): with open(file, 'r') as f: data = json.loads(f.read()) try: - cached_request = data[url] + data[url] except KeyError: return None - timestamp = cached_request['expiry'] + timestamp = data[url]['expiry'] if (time.time() - timestamp <= 3600): - return cached_request + return data[url] else: - print(cached_request.pop(url, None)) + data.pop(url, None) with open(file, 'w') as f: - json.dump(cached_request, f, indent=4) + json.dump(data, f, indent=4) return None else: return None -def driver_select(): # +def driver_select(): ''' it configures what each browser should do and gives the driver variable that is used From 169fdde334ad5d31cd55da82079e64c2490d653b Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 15:28:04 +0300 Subject: [PATCH 06/21] moved the cache to the TEMP folder --- 
anime_downloader/sites/helpers/selescrape.py | 46 +++++--------------- 1 file changed, 11 insertions(+), 35 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 2fbdce2..9130c5d 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -10,6 +10,7 @@ from bs4 import BeautifulSoup from logging import exception from sys import platform import requests +import tempfile import os import logging import click @@ -70,18 +71,13 @@ def get_driver_binary(): return binary_path -def add_url_params(url, params): - return url if not params else url + '?' + urlencode(params) - - - def cache_request(url, request_type, response, cookies, user_agent): """ This function saves the response from a Selenium request in a json. It uses timestamps so that the rest of the code can know if its an old cache or a new one. """ - file = os.path.join(get_data_dir(), 'cached_requests.json') + file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: tmp_cache = json.loads(f.read()) @@ -90,7 +86,7 @@ def cache_request(url, request_type, response, cookies, user_agent): tmp_cache[url] = { 'data': response, 'expiry': time.time(), - 'type': request_type, + 'method': request_type, 'cookies': cookies, 'user_agent': user_agent } @@ -99,7 +95,7 @@ def cache_request(url, request_type, response, cookies, user_agent): json.dump(tmp_cache, f, indent=4) def check_cache(url): - file = os.path.join(get_data_dir(), 'cached_requests.json') + file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: data = json.loads(f.read()) @@ -172,28 +168,6 @@ def driver_select(): return driver -def status_select(driver, url, status='hide'): - ''' - For now it doesnt do what its name suggests, - I have planned to add a status reporter of the http response code. 
- This part of the code is not removed because it is part of its core. - Treat it like it isnt here. - ''' - try: - if status == 'hide': - driver.get(url) - elif status == 'show': - r = requests.head(url) - if r.status_code == 503: - raise RuntimeError("This website's sevice is unavailable or has cloudflare on.") - driver.get(url) - return r.status_code - else: - driver.get(url) - except requests.ConnectionError: - raise RuntimeError("Failed to establish a connection using the requests library.") - - def cloudflare_wait(driver): ''' It waits until cloudflare has gone away before doing any further actions. @@ -213,8 +187,9 @@ def cloudflare_wait(driver): time.sleep(0.25) delta = time.time() - start if delta >= abort_after: - logger.error(f'Timeout:\nCouldnt bypass cloudflare. \ - See the screenshot for more info:\n{get_data_dir()}/screenshot.png') + logger.error(f'Timeout:\tCouldnt bypass cloudflare. \ + See the screenshot for more info:\t{get_data_dir()}/screenshot.png') + break title = driver.title if not title == "Just a moment...": break @@ -223,19 +198,20 @@ def cloudflare_wait(driver): def request(request_type, url, **kwargs): #Headers not yet supported , headers={} params = kwargs.get('params', {}) - url = add_url_params(url, params) + url = url if not params else url + '?' 
+ urlencode(params) + if bool(check_cache(url)): cached_data = check_cache(url) text = cached_data['data'] user_agent = cached_data['user_agent'] - request_type = cached_data['type'] + request_type = cached_data['method'] cookies = cached_data['cookies'] return SeleResponse(url, request_type, text, cookies, user_agent) else: driver = driver_select() - status = status_select(driver, url, 'hide') + driver.get(url) try: cloudflare_wait(driver) From 05e1f4d484cd14b0bdc26c4d265a470f6a7342a6 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 16:28:40 +0300 Subject: [PATCH 07/21] Made the caching use ``SeleResponse.__dict__`` --- anime_downloader/sites/helpers/selescrape.py | 48 ++++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 9130c5d..8d8ba41 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -34,7 +34,6 @@ def open_config(): from anime_downloader.config import Config return Config - data = open_config() @@ -71,30 +70,38 @@ def get_driver_binary(): return binary_path -def cache_request(url, request_type, response, cookies, user_agent): +def cache_request(**kwargs): """ This function saves the response from a Selenium request in a json. - It uses timestamps so that the rest of the code - can know if its an old cache or a new one. + It uses timestamps so that the rest of the code can know if the cache has expired or not. 
""" + file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: tmp_cache = json.loads(f.read()) else: tmp_cache = {} - tmp_cache[url] = { - 'data': response, + tmp_cache[kwargs.get('url')] = { + 'data': kwargs.get('text'), 'expiry': time.time(), - 'method': request_type, - 'cookies': cookies, - 'user_agent': user_agent + 'method': kwargs.get('method'), + 'cookies': kwargs.get('cookies'), + 'user_agent': kwargs.get('user_agent') } with open(file, 'w') as f: json.dump(tmp_cache, f, indent=4) def check_cache(url): + """ + This function checks if the cache file exists, + if it exists then it will read the file + And it will verify if the cache is less than or equal to 1 hour ago + If it is it will return it as it is. + If it isn't it will delete the expired cache from the file and return None + If the file doesn't exist at all it will return None + """ file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: @@ -127,16 +134,16 @@ def driver_select(): driver_binary = get_driver_binary() binary = None if not driver_binary else driver_binary if browser == 'firefox': - fireFoxOptions = webdriver.FirefoxOptions() - fireFoxOptions.headless = True - fireFoxOptions.add_argument('--log fatal') + fireFox_Options = webdriver.FirefoxOptions() + fireFox_Options.headless = True + fireFox_Options.add_argument('--log fatal') if binary == None: - driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull) + driver = webdriver.Firefox(options=fireFox_Options, service_log_path=os.path.devnull) else: try: - driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull) + driver = webdriver.Firefox(options=fireFox_Options, service_log_path=os.path.devnull) except: - driver = webdriver.Firefox(executable_path=binary, options=fireFoxOptions, service_log_path=os.path.devnull) + driver = 
webdriver.Firefox(executable_path=binary, options=fireFox_Options, service_log_path=os.path.devnull) elif browser == 'chrome': from selenium.webdriver.chrome.options import Options chrome_options = Options() @@ -174,12 +181,12 @@ def cloudflare_wait(driver): The way it works is by getting the title of the page and as long as it is "Just a moment..." it will keep waiting. This part of the code won't make the code execute slower - if the target website has not a Cloudflare redirection. + if the target website has no Cloudflare redirection. At most it will sleep 1 second as a precaution. - Also, i have made it time out after 30 seconds, useful if the target website is not responsive + Also, i have made it time out after 50 seconds, useful if the target website is not responsive and to stop it from running infinitely. ''' - abort_after = 30 + abort_after = 50 start = time.time() title = driver.title # title = "Just a moment..." @@ -219,8 +226,9 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ cookies = driver.get_cookies() text = driver.page_source driver.close() - cache_request(url, request_type, text, cookies, user_agent) - return SeleResponse(url, request_type, text, cookies, user_agent) + seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) + cache_request(**seleResponse.__dict__) + return seleResponse except: driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); From 1ae4199d72709139621cae0e08cba406cb29cf35 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 16:33:05 +0300 Subject: [PATCH 08/21] added some comma --- anime_downloader/sites/helpers/selescrape.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 8d8ba41..547285d 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py 
@@ -98,8 +98,8 @@ def check_cache(url): This function checks if the cache file exists, if it exists then it will read the file And it will verify if the cache is less than or equal to 1 hour ago - If it is it will return it as it is. - If it isn't it will delete the expired cache from the file and return None + If it is, it will return it as it is. + If it isn't, it will delete the expired cache from the file and return None If the file doesn't exist at all it will return None """ file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') From 4123b1c7038deccafface58019e5d512dcfdeac0 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 17:08:59 +0300 Subject: [PATCH 09/21] fixed bug: Firefox not getting random user agent from anime dl --- anime_downloader/sites/helpers/selescrape.py | 40 ++++++++++---------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 547285d..315ee86 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -70,7 +70,7 @@ def get_driver_binary(): return binary_path -def cache_request(**kwargs): +def cache_request(sele_response): """ This function saves the response from a Selenium request in a json. It uses timestamps so that the rest of the code can know if the cache has expired or not. 
@@ -79,15 +79,16 @@ def cache_request(**kwargs): file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: - tmp_cache = json.loads(f.read()) + tmp_cache = json.load(f) else: tmp_cache = {} - tmp_cache[kwargs.get('url')] = { - 'data': kwargs.get('text'), + data = sele_response.__dict__ + tmp_cache[data['url']] = { + 'data': data['text'], 'expiry': time.time(), - 'method': kwargs.get('method'), - 'cookies': kwargs.get('cookies'), - 'user_agent': kwargs.get('user_agent') + 'method': data['method'], + 'cookies': data['cookies'], + 'user_agent': data['user_agent'] } with open(file, 'w') as f: @@ -105,11 +106,9 @@ def check_cache(url): file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') if os.path.isfile(file): with open(file, 'r') as f: - data = json.loads(f.read()) - try: - data[url] - except KeyError: - return None + data = json.load(f) + if url not in data: + return timestamp = data[url]['expiry'] if (time.time() - timestamp <= 3600): return data[url] @@ -117,9 +116,9 @@ def check_cache(url): data.pop(url, None) with open(file, 'w') as f: json.dump(data, f, indent=4) - return None + return else: - return None + return def driver_select(): @@ -135,15 +134,18 @@ def driver_select(): binary = None if not driver_binary else driver_binary if browser == 'firefox': fireFox_Options = webdriver.FirefoxOptions() + fireFox_Profile = webdriver.FirefoxProfile() + fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) fireFox_Options.headless = True fireFox_Options.add_argument('--log fatal') if binary == None: - driver = webdriver.Firefox(options=fireFox_Options, service_log_path=os.path.devnull) + driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) else: try: - driver = webdriver.Firefox(options=fireFox_Options, service_log_path=os.path.devnull) + driver = webdriver.Firefox(fireFox_Profile, 
options=fireFox_Options, service_log_path=os.path.devnull) except: - driver = webdriver.Firefox(executable_path=binary, options=fireFox_Options, service_log_path=os.path.devnull) + driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options, service_log_path=os.path.devnull) + elif browser == 'chrome': from selenium.webdriver.chrome.options import Options chrome_options = Options() @@ -155,7 +157,7 @@ def driver_select(): chrome_options.add_argument(f"--user-data-dir={profile_path}") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument(f'user-agent={get_random_header()}') + chrome_options.add_argument(f"user-agent={get_random_header()['user-agent']}") if binary == None: if executable == None: driver = webdriver.Chrome(options=chrome_options) @@ -227,7 +229,7 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ text = driver.page_source driver.close() seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) - cache_request(**seleResponse.__dict__) + cache_request(seleResponse) return seleResponse except: From 3999698bb1b8915de75ade92f2b1e07e6fd262da Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 17:27:21 +0300 Subject: [PATCH 10/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 315ee86..396e6ef 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -20,7 +20,6 @@ import json serverLogger.setLevel(logging.ERROR) logger = logging.getLogger(__name__) - def get_data_dir(): ''' Gets the folder directory selescrape will store data, @@ -116,9 +115,6 @@ def check_cache(url): data.pop(url, None) with open(file, 'w') as f: 
json.dump(data, f, indent=4) - return - else: - return def driver_select(): From 91870487a930b087f14d9072367cd02ed9c2fa22 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 18:26:10 +0300 Subject: [PATCH 11/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 27 +++++++++++--------- 1 file changed, 15 insertions(+), 12 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 396e6ef..6571954 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -129,11 +129,13 @@ def driver_select(): driver_binary = get_driver_binary() binary = None if not driver_binary else driver_binary if browser == 'firefox': + fireFox_Options = webdriver.FirefoxOptions() - fireFox_Profile = webdriver.FirefoxProfile() - fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) fireFox_Options.headless = True fireFox_Options.add_argument('--log fatal') + fireFox_Profile = webdriver.FirefoxProfile() + fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) + if binary == None: driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) else: @@ -145,15 +147,14 @@ def driver_select(): elif browser == 'chrome': from selenium.webdriver.chrome.options import Options chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--disable-gpu") + ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}", + "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"] + for option in ops: + chrome_options.add_argument(option) + profile_path = os.path.join(data_dir, 'Selenium_chromium') log_path = os.path.join(data_dir, 'chromedriver.log') - 
chrome_options.add_argument('--log-level=OFF') - chrome_options.add_argument(f"--user-data-dir={profile_path}") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument(f"user-agent={get_random_header()['user-agent']}") + if binary == None: if executable == None: driver = webdriver.Chrome(options=chrome_options) @@ -214,16 +215,18 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ return SeleResponse(url, request_type, text, cookies, user_agent) else: - + driver = driver_select() driver.get(url) try: + cloudflare_wait(driver) user_agent = driver.execute_script("return navigator.userAgent;") #dirty, but allows for all sorts of things above cookies = driver.get_cookies() text = driver.page_source driver.close() + seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) cache_request(seleResponse) return seleResponse @@ -231,8 +234,8 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ except: driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); driver.close() - logger.error(f'There was a problem getting the page: {url}. \ - See the screenshot for more info:\t{get_data_dir()}/screenshot.png') + logger.error(f'There was a problem getting the page: {url}.' 
+ + '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') exit() From 8cb3a8dfb0e2e9f1942bdf5ffa80079fea1dcb5b Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Wed, 2 Sep 2020 18:31:25 +0300 Subject: [PATCH 12/21] removed unneeded imports --- anime_downloader/sites/helpers/selescrape.py | 71 ++++++++++---------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 6571954..3679377 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -1,15 +1,8 @@ -from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger -from selenium.webdriver.support.ui import WebDriverWait from anime_downloader.const import get_random_header -from selenium.webdriver.common.by import By from urllib.parse import urlencode -from urllib.parse import urlsplit from selenium import webdriver -from bs4 import BeautifulSoup -from logging import exception from sys import platform -import requests import tempfile import os import logging @@ -20,9 +13,10 @@ import json serverLogger.setLevel(logging.ERROR) logger = logging.getLogger(__name__) + def get_data_dir(): ''' - Gets the folder directory selescrape will store data, + Gets the folder directory selescrape will store data, such as cookies or browser extensions and logs. ''' APP_NAME = 'anime downloader' @@ -33,6 +27,7 @@ def open_config(): from anime_downloader.config import Config return Config + data = open_config() @@ -40,20 +35,23 @@ def get_browser_config(): ''' Decides what browser selescrape will use. 
''' - os_browser = { #maps os to a browser - 'linux':'firefox', - 'darwin':'chrome', - 'win32':'chrome' + os_browser = { # maps os to a browser + 'linux': 'firefox', + 'darwin': 'chrome', + 'win32': 'chrome' } for a in os_browser: if platform.startswith(a): - browser = os_browser[a] + browser = os_browser[a] else: browser = 'chrome' + value = data['dl']['selescrape_browser'] value = value.lower() if value else value + if value in ['chrome', 'firefox']: browser = value + return browser @@ -88,11 +86,12 @@ def cache_request(sele_response): 'method': data['method'], 'cookies': data['cookies'], 'user_agent': data['user_agent'] - } + } with open(file, 'w') as f: json.dump(tmp_cache, f, indent=4) + def check_cache(url): """ This function checks if the cache file exists, @@ -119,8 +118,8 @@ def check_cache(url): def driver_select(): ''' - it configures what each browser should do - and gives the driver variable that is used + it configures what each browser should do + and gives the driver variable that is used to perform any actions below this function. 
''' browser = get_browser_config() @@ -135,28 +134,29 @@ def driver_select(): fireFox_Options.add_argument('--log fatal') fireFox_Profile = webdriver.FirefoxProfile() fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) - - if binary == None: + + if not binary: driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) else: try: driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) except: - driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options, service_log_path=os.path.devnull) + driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options, + service_log_path=os.path.devnull) elif browser == 'chrome': from selenium.webdriver.chrome.options import Options chrome_options = Options() - ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}", - "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"] + ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}", + "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"] for option in ops: chrome_options.add_argument(option) profile_path = os.path.join(data_dir, 'Selenium_chromium') log_path = os.path.join(data_dir, 'chromedriver.log') - if binary == None: - if executable == None: + if not binary: + if not executable: driver = webdriver.Chrome(options=chrome_options) else: from selenium.webdriver.common.desired_capabilities import DesiredCapabilities @@ -164,25 +164,26 @@ def driver_select(): cap['binary_location'] = executable driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options) else: - if executable == None: + if not executable: driver = webdriver.Chrome(options=chrome_options) else: from selenium.webdriver.common.desired_capabilities import 
DesiredCapabilities cap = DesiredCapabilities.CHROME cap['binary_location'] = executable - driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, service_log_path=os.path.devnull) + driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, + service_log_path=os.path.devnull) return driver def cloudflare_wait(driver): ''' It waits until cloudflare has gone away before doing any further actions. - The way it works is by getting the title of the page + The way it works is by getting the title of the page and as long as it is "Just a moment..." it will keep waiting. - This part of the code won't make the code execute slower + This part of the code won't make the code execute slower if the target website has no Cloudflare redirection. - At most it will sleep 1 second as a precaution. - Also, i have made it time out after 50 seconds, useful if the target website is not responsive + At most it will sleep 1 second as a precaution. + Also, i have made it time out after 50 seconds, useful if the target website is not responsive and to stop it from running infinitely. ''' abort_after = 50 @@ -199,10 +200,10 @@ def cloudflare_wait(driver): title = driver.title if not title == "Just a moment...": break - time.sleep(1) # This is necessary to make sure everything has loaded fine. + time.sleep(1) # This is necessary to make sure everything has loaded fine. -def request(request_type, url, **kwargs): #Headers not yet supported , headers={} +def request(request_type, url, **kwargs): # Headers not yet supported , headers={} params = kwargs.get('params', {}) url = url if not params else url + '?' 
+ urlencode(params) @@ -222,11 +223,11 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ try: cloudflare_wait(driver) - user_agent = driver.execute_script("return navigator.userAgent;") #dirty, but allows for all sorts of things above + user_agent = driver.execute_script("return navigator.userAgent;") cookies = driver.get_cookies() text = driver.page_source driver.close() - + seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) cache_request(seleResponse) return seleResponse @@ -235,11 +236,10 @@ def request(request_type, url, **kwargs): #Headers not yet supported , headers={ driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); driver.close() logger.error(f'There was a problem getting the page: {url}.' + - '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') + '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') exit() - class SeleResponse: """ Class for the selenium response. @@ -257,6 +257,7 @@ class SeleResponse: user_agent: string User agent used on the webpage """ + def __init__(self, url, method, text, cookies, user_agent): self.url = url self.method = method From 924d84499c73dbd459f872a002b8a792aafaf52b Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Sun, 18 Oct 2020 18:51:09 +0300 Subject: [PATCH 13/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 3679377..2b6dc13 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -145,6 +145,8 @@ def driver_select(): service_log_path=os.path.devnull) elif browser == 'chrome': + profile_path = os.path.join(data_dir, 'Selenium_chromium') + log_path = os.path.join(data_dir, 'chromedriver.log') from selenium.webdriver.chrome.options 
import Options chrome_options = Options() ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}", @@ -152,9 +154,6 @@ def driver_select(): for option in ops: chrome_options.add_argument(option) - profile_path = os.path.join(data_dir, 'Selenium_chromium') - log_path = os.path.join(data_dir, 'chromedriver.log') - if not binary: if not executable: driver = webdriver.Chrome(options=chrome_options) From c067bcad78233a0466d89c16161c1f60fd0b7937 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Sat, 24 Oct 2020 22:39:35 +0300 Subject: [PATCH 14/21] fixed a bug --- anime_downloader/sites/helpers/selescrape.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 2b6dc13..88ba4d0 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -195,11 +195,12 @@ def cloudflare_wait(driver): if delta >= abort_after: logger.error(f'Timeout:\tCouldnt bypass cloudflare. \ See the screenshot for more info:\t{get_data_dir()}/screenshot.png') - break + return 1 title = driver.title if not title == "Just a moment...": break - time.sleep(1) # This is necessary to make sure everything has loaded fine. + time.sleep(2) # This is necessary to make sure everything has loaded fine. 
+ return 0 def request(request_type, url, **kwargs): # Headers not yet supported , headers={} @@ -221,11 +222,15 @@ def request(request_type, url, **kwargs): # Headers not yet supported , headers try: - cloudflare_wait(driver) + exit_code = cloudflare_wait(driver) user_agent = driver.execute_script("return navigator.userAgent;") cookies = driver.get_cookies() text = driver.page_source driver.close() + if exit_code == 0: + pass + else: + return SeleResponse(url, request_type, None, cookies, user_agent) seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) cache_request(seleResponse) From 0108b852e6d5a05ed3e65ebce283cb25c87b5085 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixGamer@users.noreply.github.com> Date: Thu, 29 Oct 2020 23:42:44 +0200 Subject: [PATCH 15/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 88ba4d0..2faad32 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -206,9 +206,9 @@ def cloudflare_wait(driver): def request(request_type, url, **kwargs): # Headers not yet supported , headers={} params = kwargs.get('params', {}) url = url if not params else url + '?' 
+ urlencode(params) - - if bool(check_cache(url)): - cached_data = check_cache(url) + check_caches = check_cache(url) + if bool(check_caches): + cached_data = check_caches text = cached_data['data'] user_agent = cached_data['user_agent'] request_type = cached_data['method'] From 1f9a7dd35fbfa78962df301d5563c8a989656f12 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:13:14 +0300 Subject: [PATCH 16/21] reworked some logic and also made the code more readable --- anime_downloader/sites/helpers/selescrape.py | 182 ++++++++++++------- 1 file changed, 113 insertions(+), 69 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 9ad32c7..89f4c80 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -4,14 +4,25 @@ from urllib.parse import urlencode from selenium import webdriver from sys import platform import tempfile -import os import logging import click import time import json +import os + + +def open_config(): + from anime_downloader.config import Config + return Config + serverLogger.setLevel(logging.ERROR) logger = logging.getLogger(__name__) +TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache') +data = open_config() + +if not os.path.isdir(TEMP_FOLDER): + os.makedirs(TEMP_FOLDER) def get_data_dir(): @@ -23,14 +34,6 @@ def get_data_dir(): return os.path.join(click.get_app_dir(APP_NAME), 'data') -def open_config(): - from anime_downloader.config import Config - return Config - - -data = open_config() - - def get_browser_config(): ''' Decides what browser selescrape will use. 
@@ -63,24 +66,31 @@ def get_browser_executable(): def get_driver_binary(): value = data['dl']['selescrape_driver_binary_path'] - binary_path = value.lower() if value else value - return binary_path + if value: + return value + + return None def cache_request(sele_response): """ This function saves the response from a Selenium request in a json. - It uses timestamps so that the rest of the code can know if the cache has expired or not. + It uses timestamps to can know if the cache has expired or not. """ - file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') + file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') + if os.path.isfile(file): with open(file, 'r') as f: tmp_cache = json.load(f) else: tmp_cache = {} + data = sele_response.__dict__ - tmp_cache[data['url']] = { + url = data['url'] + url = (url[:-1] if url and url[-1] == '/' else url) + + tmp_cache[url] = { 'data': data['text'], 'expiry': time.time(), 'method': data['method'], @@ -96,80 +106,111 @@ def check_cache(url): """ This function checks if the cache file exists, if it exists then it will read the file - And it will verify if the cache is less than or equal to 1 hour ago + And it will verify if the cache is less than or equal to 30 mins ago If it is, it will return it as it is. 
If it isn't, it will delete the expired cache from the file and return None If the file doesn't exist at all it will return None """ - file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') + file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') if os.path.isfile(file): + with open(file, 'r') as f: data = json.load(f) - if url not in data: + + # Yes, this is ugly, + # but its the best way that I found to find the cache + # when the url is not exactly the same (a slash at the end or not) + clean_url = (url[:-1] if url and url[-1] == '/' else url) + found = False + + for link in data: + if link == clean_url: + url = link + found = True + + if not found: return + timestamp = data[url]['expiry'] - if (time.time() - timestamp <= 3600): + + if (time.time() - timestamp <= 1800): return data[url] else: data.pop(url, None) + with open(file, 'w') as f: json.dump(data, f, indent=4) def driver_select(): ''' - it configures what each browser should do - and gives the driver variable that is used - to perform any actions below this function. + This configures what each browser should do + and returns the corresponding driver. 
''' browser = get_browser_config() data_dir = get_data_dir() executable = get_browser_executable() - driver_binary = get_driver_binary() - binary = None if not driver_binary else driver_binary + binary = get_driver_binary() + if browser == 'firefox': fireFox_Options = webdriver.FirefoxOptions() - fireFox_Options.headless = True - fireFox_Options.add_argument('--log fatal') - fireFox_Profile = webdriver.FirefoxProfile() - fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) + ops = [ + "--width=1920", "--height=1080", + "headless", "--log fatal" + ] - if not binary: - driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) - else: - try: - driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) - except: - driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options, - service_log_path=os.path.devnull) + for option in ops: + fireFox_Options.add_argument(option) + + fireFox_Profile = webdriver.FirefoxProfile() + fireFox_Profile.set_preference( + "general.useragent.override", get_random_header()['user-agent'] + ) + + driver = webdriver.Firefox( + # sets user-agent + firefox_profile=fireFox_Profile, + # sets various firefox settings + options=fireFox_Options, + # by default it will be None, if a chromedriver location is in the config then it will use that + executable_path=(binary if binary else "geckodriver"), + # an attempt at stopping selenium from printing a pile of garbage to the console. 
+ service_log_path=os.path.devnull + ) elif browser == 'chrome': - profile_path = os.path.join(data_dir, 'Selenium_chromium') - log_path = os.path.join(data_dir, 'chromedriver.log') from selenium.webdriver.chrome.options import Options + + profile_path = os.path.join(data_dir, 'Selenium_chromium') chrome_options = Options() - ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}", - "--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"] + + ops = [ + "--headless", "--disable-gpu", '--log-level=OFF', + f"--user-data-dir={profile_path}", "--no-sandbox", + "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}" # noqa + ] + for option in ops: chrome_options.add_argument(option) + cap = None + if not binary: - if not executable: - driver = webdriver.Chrome(options=chrome_options) - else: - from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - cap = DesiredCapabilities.CHROME - cap['binary_location'] = executable - driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options) - else: - if not executable: - driver = webdriver.Chrome(options=chrome_options) - else: - from selenium.webdriver.common.desired_capabilities import DesiredCapabilities - cap = DesiredCapabilities.CHROME - cap['binary_location'] = executable - driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, - service_log_path=os.path.devnull) + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities + + cap = DesiredCapabilities.CHROME + cap['binary_location'] = executable + + driver = webdriver.Chrome( + # sets user-agent, and various chrome settings + options=chrome_options, + # by default it will be None, if a chromedriver location is in the config then it will use that + executable_path=binary, + # by default it will be None, if a binary location is in the config then it will use that + 
desired_capabilities=cap, + # an attempt at stopping selenium from printing a pile of garbage to the console. + service_log_path=os.path.devnull + ) return driver @@ -184,19 +225,19 @@ def cloudflare_wait(driver): Also, i have made it time out after 50 seconds, useful if the target website is not responsive and to stop it from running infinitely. ''' - abort_after = 50 + abort_after = 50 # seconds start = time.time() title = driver.title # title = "Just a moment..." - while title == "Just a moment...": - time.sleep(0.25) + while "Just a moment" in title: + time.sleep(0.35) delta = time.time() - start if delta >= abort_after: logger.error(f'Timeout:\tCouldnt bypass cloudflare. \ See the screenshot for more info:\t{get_data_dir()}/screenshot.png') return 1 title = driver.title - if not title == "Just a moment...": + if not "Just a moment" in title: break time.sleep(2) # This is necessary to make sure everything has loaded fine. return 0 @@ -204,10 +245,11 @@ def cloudflare_wait(driver): def request(request_type, url, **kwargs): # Headers not yet supported , headers={} params = kwargs.get('params', {}) + url = url if not params else url + '?' 
+ urlencode(params) - check_caches = check_cache(url) - if bool(check_caches): - cached_data = check_caches + cached_data = check_cache(url) + + if cached_data: text = cached_data['data'] user_agent = cached_data['user_agent'] request_type = cached_data['method'] @@ -215,28 +257,30 @@ def request(request_type, url, **kwargs): # Headers not yet supported , headers return SeleResponse(url, request_type, text, cookies, user_agent) else: - driver = driver_select() driver.get(url) try: - exit_code = cloudflare_wait(driver) user_agent = driver.execute_script("return navigator.userAgent;") cookies = driver.get_cookies() text = driver.page_source driver.close() - if exit_code == 0: - pass - else: + + if exit_code != 0: return SeleResponse(url, request_type, None, cookies, user_agent) - seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) + seleResponse = SeleResponse( + url, request_type, + text, cookies, + user_agent + ) + cache_request(seleResponse) return seleResponse except: - driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); + driver.save_screenshot(f"{get_data_dir()}/screenshot.png") driver.close() logger.error(f'There was a problem getting the page: {url}.' 
+ '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') From 6d8b52af5f9f3c3f6612ffee7be3e27c63ee00ab Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:18:19 +0300 Subject: [PATCH 17/21] fix firefox headless --- anime_downloader/sites/helpers/selescrape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 89f4c80..3677963 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -156,7 +156,7 @@ def driver_select(): fireFox_Options = webdriver.FirefoxOptions() ops = [ "--width=1920", "--height=1080", - "headless", "--log fatal" + "-headless", "--log fatal" ] for option in ops: From 7bcc0707486e28d92758476f0999656a651efde1 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:37:32 +0300 Subject: [PATCH 18/21] Update selescrape.py --- anime_downloader/sites/helpers/selescrape.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index 3677963..acb4556 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -61,7 +61,8 @@ def get_browser_config(): def get_browser_executable(): value = data['dl']['selescrape_browser_executable_path'] executable_value = value.lower() if value else value - return executable_value + if executable_value: + return executable_value def get_driver_binary(): @@ -69,8 +70,6 @@ def get_driver_binary(): if value: return value - return None - def cache_request(sele_response): """ @@ -172,7 +171,9 @@ def driver_select(): firefox_profile=fireFox_Profile, # sets various firefox settings options=fireFox_Options, - # by default it will be None, if a chromedriver location is 
in the config then it will use that + # by default it will be None, if a binary location is in the config then it will use that + firefox_binary=None if not executable else executable, + # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that executable_path=(binary if binary else "geckodriver"), # an attempt at stopping selenium from printing a pile of garbage to the console. service_log_path=os.path.devnull @@ -195,7 +196,7 @@ def driver_select(): cap = None - if not binary: + if executable: from selenium.webdriver.common.desired_capabilities import DesiredCapabilities cap = DesiredCapabilities.CHROME @@ -204,8 +205,8 @@ def driver_select(): driver = webdriver.Chrome( # sets user-agent, and various chrome settings options=chrome_options, - # by default it will be None, if a chromedriver location is in the config then it will use that - executable_path=binary, + # by default it will be "chromedriver", if a chromedriver location is in the config then it will use that + executable_path=(binary if binary else "chromedriver"), # by default it will be None, if a binary location is in the config then it will use that desired_capabilities=cap, # an attempt at stopping selenium from printing a pile of garbage to the console. 
From 220f097333ea58dc7655ace933be787f0bc5f694 Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:46:12 +0300 Subject: [PATCH 19/21] added cache flag --- anime_downloader/sites/helpers/selescrape.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/anime_downloader/sites/helpers/selescrape.py b/anime_downloader/sites/helpers/selescrape.py index acb4556..ec4891d 100644 --- a/anime_downloader/sites/helpers/selescrape.py +++ b/anime_downloader/sites/helpers/selescrape.py @@ -16,6 +16,7 @@ def open_config(): return Config +cache = False serverLogger.setLevel(logging.ERROR) logger = logging.getLogger(__name__) TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache') @@ -76,6 +77,8 @@ def cache_request(sele_response): This function saves the response from a Selenium request in a json. It uses timestamps to can know if the cache has expired or not. """ + if not cache: + return file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') @@ -105,11 +108,13 @@ def check_cache(url): """ This function checks if the cache file exists, if it exists then it will read the file - And it will verify if the cache is less than or equal to 30 mins ago + And it will verify if the cache is less than or equal to 30 mins old If it is, it will return it as it is. 
If it isn't, it will delete the expired cache from the file and return None If the file doesn't exist at all it will return None """ + if not cache: + return file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json') if os.path.isfile(file): From 46d7db8fac119163b7338ba0638622cf66026d1e Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:46:32 +0300 Subject: [PATCH 20/21] Update request.py --- anime_downloader/sites/helpers/request.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anime_downloader/sites/helpers/request.py b/anime_downloader/sites/helpers/request.py index 924ea6b..c193f1c 100644 --- a/anime_downloader/sites/helpers/request.py +++ b/anime_downloader/sites/helpers/request.py @@ -57,6 +57,7 @@ def setup(func): from selenium import webdriver from anime_downloader.sites.helpers import selescrape sess = selescrape + sess.cache = cache except ImportError: sess = cf_session logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .") From 4e184b3f6c8cc82396e74db7adcf66bf5f70ae5d Mon Sep 17 00:00:00 2001 From: Arjix <53124886+ArjixWasTaken@users.noreply.github.com> Date: Tue, 25 May 2021 22:47:40 +0300 Subject: [PATCH 21/21] added sel to docstring --- anime_downloader/sites/helpers/request.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anime_downloader/sites/helpers/request.py b/anime_downloader/sites/helpers/request.py index c193f1c..2881573 100644 --- a/anime_downloader/sites/helpers/request.py +++ b/anime_downloader/sites/helpers/request.py @@ -46,6 +46,8 @@ def setup(func): cf : bool cf if True performs the request through cfscrape. For cloudflare protected sites. + sel : bool + sel if True performs the request through selescrape (selenium). 
referer : str a url sent as referer in request headers ''' @@ -108,6 +110,8 @@ def get(url: str, cf : bool cf if True performs the request through cfscrape. For cloudflare protected sites. + sel : bool + sel if True performs the request through selescrape (selenium). referer : str a url sent as referer in request headers '''