reworked some logic and also made the code more readable

master
Arjix 2021-05-25 22:13:14 +03:00 committed by GitHub
parent 4f21f6600f
commit 1f9a7dd35f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 113 additions and 69 deletions

View File

@ -4,14 +4,25 @@ from urllib.parse import urlencode
from selenium import webdriver from selenium import webdriver
from sys import platform from sys import platform
import tempfile import tempfile
import os
import logging import logging
import click import click
import time import time
import json import json
import os
def open_config():
from anime_downloader.config import Config
return Config
serverLogger.setLevel(logging.ERROR) serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
data = open_config()
if not os.path.isdir(TEMP_FOLDER):
os.makedirs(TEMP_FOLDER)
def get_data_dir(): def get_data_dir():
@ -23,14 +34,6 @@ def get_data_dir():
return os.path.join(click.get_app_dir(APP_NAME), 'data') return os.path.join(click.get_app_dir(APP_NAME), 'data')
def open_config():
from anime_downloader.config import Config
return Config
data = open_config()
def get_browser_config(): def get_browser_config():
''' '''
Decides what browser selescrape will use. Decides what browser selescrape will use.
@ -63,24 +66,31 @@ def get_browser_executable():
def get_driver_binary(): def get_driver_binary():
value = data['dl']['selescrape_driver_binary_path'] value = data['dl']['selescrape_driver_binary_path']
binary_path = value.lower() if value else value if value:
return binary_path return value
return None
def cache_request(sele_response): def cache_request(sele_response):
""" """
This function saves the response from a Selenium request in a json. This function saves the response from a Selenium request in a json.
It uses timestamps so that the rest of the code can know if the cache has expired or not. It uses timestamps to know if the cache has expired or not.
""" """
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
if os.path.isfile(file): if os.path.isfile(file):
with open(file, 'r') as f: with open(file, 'r') as f:
tmp_cache = json.load(f) tmp_cache = json.load(f)
else: else:
tmp_cache = {} tmp_cache = {}
data = sele_response.__dict__ data = sele_response.__dict__
tmp_cache[data['url']] = { url = data['url']
url = (url[:-1] if url and url[-1] == '/' else url)
tmp_cache[url] = {
'data': data['text'], 'data': data['text'],
'expiry': time.time(), 'expiry': time.time(),
'method': data['method'], 'method': data['method'],
@ -96,80 +106,111 @@ def check_cache(url):
""" """
This function checks if the cache file exists, This function checks if the cache file exists,
if it exists then it will read the file if it exists then it will read the file
And it will verify if the cache is less than or equal to 1 hour ago And it will verify if the cache is less than or equal to 30 mins ago
If it is, it will return it as it is. If it is, it will return it as it is.
If it isn't, it will delete the expired cache from the file and return None If it isn't, it will delete the expired cache from the file and return None
If the file doesn't exist at all it will return None If the file doesn't exist at all it will return None
""" """
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json') file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
if os.path.isfile(file): if os.path.isfile(file):
with open(file, 'r') as f: with open(file, 'r') as f:
data = json.load(f) data = json.load(f)
if url not in data:
# Yes, this is ugly,
# but it's the best way that I found to find the cache
# when the url is not exactly the same (a slash at the end or not)
clean_url = (url[:-1] if url and url[-1] == '/' else url)
found = False
for link in data:
if link == clean_url:
url = link
found = True
if not found:
return return
timestamp = data[url]['expiry'] timestamp = data[url]['expiry']
if (time.time() - timestamp <= 3600):
if (time.time() - timestamp <= 1800):
return data[url] return data[url]
else: else:
data.pop(url, None) data.pop(url, None)
with open(file, 'w') as f: with open(file, 'w') as f:
json.dump(data, f, indent=4) json.dump(data, f, indent=4)
def driver_select(): def driver_select():
''' '''
it configures what each browser should do This configures what each browser should do
and gives the driver variable that is used and returns the corresponding driver.
to perform any actions below this function.
''' '''
browser = get_browser_config() browser = get_browser_config()
data_dir = get_data_dir() data_dir = get_data_dir()
executable = get_browser_executable() executable = get_browser_executable()
driver_binary = get_driver_binary() binary = get_driver_binary()
binary = None if not driver_binary else driver_binary
if browser == 'firefox': if browser == 'firefox':
fireFox_Options = webdriver.FirefoxOptions() fireFox_Options = webdriver.FirefoxOptions()
fireFox_Options.headless = True ops = [
fireFox_Options.add_argument('--log fatal') "--width=1920", "--height=1080",
fireFox_Profile = webdriver.FirefoxProfile() "headless", "--log fatal"
fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent']) ]
if not binary: for option in ops:
driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) fireFox_Options.add_argument(option)
else:
try: fireFox_Profile = webdriver.FirefoxProfile()
driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull) fireFox_Profile.set_preference(
except: "general.useragent.override", get_random_header()['user-agent']
driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options, )
service_log_path=os.path.devnull)
driver = webdriver.Firefox(
# sets user-agent
firefox_profile=fireFox_Profile,
# sets various firefox settings
options=fireFox_Options,
# by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
executable_path=(binary if binary else "geckodriver"),
# an attempt at stopping selenium from printing a pile of garbage to the console.
service_log_path=os.path.devnull
)
elif browser == 'chrome': elif browser == 'chrome':
profile_path = os.path.join(data_dir, 'Selenium_chromium')
log_path = os.path.join(data_dir, 'chromedriver.log')
from selenium.webdriver.chrome.options import Options from selenium.webdriver.chrome.options import Options
profile_path = os.path.join(data_dir, 'Selenium_chromium')
chrome_options = Options() chrome_options = Options()
ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}",
"--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"] ops = [
"--headless", "--disable-gpu", '--log-level=OFF',
f"--user-data-dir={profile_path}", "--no-sandbox",
"--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}" # noqa
]
for option in ops: for option in ops:
chrome_options.add_argument(option) chrome_options.add_argument(option)
cap = None
if not binary: if not binary:
if not executable: from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
driver = webdriver.Chrome(options=chrome_options)
else: cap = DesiredCapabilities.CHROME
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities cap['binary_location'] = executable
cap = DesiredCapabilities.CHROME
cap['binary_location'] = executable driver = webdriver.Chrome(
driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options) # sets user-agent, and various chrome settings
else: options=chrome_options,
if not executable: # by default it will be None, if a chromedriver location is in the config then it will use that
driver = webdriver.Chrome(options=chrome_options) executable_path=binary,
else: # by default it will be None, if a binary location is in the config then it will use that
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities desired_capabilities=cap,
cap = DesiredCapabilities.CHROME # an attempt at stopping selenium from printing a pile of garbage to the console.
cap['binary_location'] = executable service_log_path=os.path.devnull
driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, )
service_log_path=os.path.devnull)
return driver return driver
@ -184,19 +225,19 @@ def cloudflare_wait(driver):
Also, I have made it time out after 50 seconds, useful if the target website is not responsive Also, I have made it time out after 50 seconds, useful if the target website is not responsive
and to stop it from running infinitely. and to stop it from running infinitely.
''' '''
abort_after = 50 abort_after = 50 # seconds
start = time.time() start = time.time()
title = driver.title # title = "Just a moment..." title = driver.title # title = "Just a moment..."
while title == "Just a moment...": while "Just a moment" in title:
time.sleep(0.25) time.sleep(0.35)
delta = time.time() - start delta = time.time() - start
if delta >= abort_after: if delta >= abort_after:
logger.error(f'Timeout:\tCouldnt bypass cloudflare. \ logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
See the screenshot for more info:\t{get_data_dir()}/screenshot.png') See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
return 1 return 1
title = driver.title title = driver.title
if not title == "Just a moment...": if not "Just a moment" in title:
break break
time.sleep(2) # This is necessary to make sure everything has loaded fine. time.sleep(2) # This is necessary to make sure everything has loaded fine.
return 0 return 0
@ -204,10 +245,11 @@ def cloudflare_wait(driver):
def request(request_type, url, **kwargs): # Headers not yet supported , headers={} def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
params = kwargs.get('params', {}) params = kwargs.get('params', {})
url = url if not params else url + '?' + urlencode(params) url = url if not params else url + '?' + urlencode(params)
check_caches = check_cache(url) cached_data = check_cache(url)
if bool(check_caches):
cached_data = check_caches if cached_data:
text = cached_data['data'] text = cached_data['data']
user_agent = cached_data['user_agent'] user_agent = cached_data['user_agent']
request_type = cached_data['method'] request_type = cached_data['method']
@ -215,28 +257,30 @@ def request(request_type, url, **kwargs): # Headers not yet supported , headers
return SeleResponse(url, request_type, text, cookies, user_agent) return SeleResponse(url, request_type, text, cookies, user_agent)
else: else:
driver = driver_select() driver = driver_select()
driver.get(url) driver.get(url)
try: try:
exit_code = cloudflare_wait(driver) exit_code = cloudflare_wait(driver)
user_agent = driver.execute_script("return navigator.userAgent;") user_agent = driver.execute_script("return navigator.userAgent;")
cookies = driver.get_cookies() cookies = driver.get_cookies()
text = driver.page_source text = driver.page_source
driver.close() driver.close()
if exit_code == 0:
pass if exit_code != 0:
else:
return SeleResponse(url, request_type, None, cookies, user_agent) return SeleResponse(url, request_type, None, cookies, user_agent)
seleResponse = SeleResponse(url, request_type, text, cookies, user_agent) seleResponse = SeleResponse(
url, request_type,
text, cookies,
user_agent
)
cache_request(seleResponse) cache_request(seleResponse)
return seleResponse return seleResponse
except: except:
driver.save_screenshot(f"{get_data_dir()}/screenshot.png"); driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
driver.close() driver.close()
logger.error(f'There was a problem getting the page: {url}.' + logger.error(f'There was a problem getting the page: {url}.' +
'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png') '\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')