reworked some logic and also made the code more readable

master
Arjix 2021-05-25 22:13:14 +03:00 committed by GitHub
parent 4f21f6600f
commit 1f9a7dd35f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 113 additions and 69 deletions

View File

@ -4,14 +4,25 @@ from urllib.parse import urlencode
from selenium import webdriver
from sys import platform
import tempfile
import os
import logging
import click
import time
import json
import os
def open_config():
    """
    Return the ``Config`` object exported by ``anime_downloader.config``.

    The import is done inside the function body, so the config module is
    only loaded at the moment this function is called.
    """
    from anime_downloader.config import Config as loaded_config
    return loaded_config
# serverLogger is imported above this section; only ERROR and worse get through.
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Dedicated folder (inside the system temp dir) for the Selenium cache file.
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
data = open_config()

# exist_ok=True replaces the isdir()-then-makedirs() pair: it is a single call
# and cannot fail if another process creates the folder in between.
os.makedirs(TEMP_FOLDER, exist_ok=True)
def get_data_dir():
@ -23,14 +34,6 @@ def get_data_dir():
return os.path.join(click.get_app_dir(APP_NAME), 'data')
def open_config():
    """
    Return the ``Config`` object exported by ``anime_downloader.config``.

    The import is done inside the function body, so the config module is
    only loaded at the moment this function is called.
    """
    from anime_downloader.config import Config as loaded_config
    return loaded_config
data = open_config()
def get_browser_config():
'''
Decides what browser selescrape will use.
@ -63,24 +66,31 @@ def get_browser_executable():
def get_driver_binary():
    """
    Return the configured Selenium driver binary path, or None.

    The value is read from data['dl']['selescrape_driver_binary_path'].
    It is returned exactly as configured: lower-casing a path would point
    at a different file on case-sensitive filesystems.

    Returns:
        The configured path if it is set (truthy), otherwise None.
    """
    value = data['dl']['selescrape_driver_binary_path']
    return value if value else None
def cache_request(sele_response):
"""
This function saves the response from a Selenium request in a json.
It uses timestamps so that the rest of the code can know if the cache has expired or not.
It uses timestamps so that it can tell whether the cache has expired or not.
"""
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
if os.path.isfile(file):
with open(file, 'r') as f:
tmp_cache = json.load(f)
else:
tmp_cache = {}
data = sele_response.__dict__
tmp_cache[data['url']] = {
url = data['url']
url = (url[:-1] if url and url[-1] == '/' else url)
tmp_cache[url] = {
'data': data['text'],
'expiry': time.time(),
'method': data['method'],
@ -96,80 +106,111 @@ def check_cache(url):
"""
This function checks if the cache file exists,
if it exists then it will read the file
And it will verify if the cache is less than or equal to 1 hour ago
And it will verify that the cached entry is at most 30 minutes old
If it is, it will return it as it is.
If it isn't, it will delete the expired cache from the file and return None
If the file doesn't exist at all it will return None
"""
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
if os.path.isfile(file):
with open(file, 'r') as f:
data = json.load(f)
if url not in data:
# Yes, this is ugly,
# but it's the best way that I found to find the cache entry
# when the url is not exactly the same (with or without a trailing slash)
clean_url = (url[:-1] if url and url[-1] == '/' else url)
found = False
for link in data:
if link == clean_url:
url = link
found = True
if not found:
return
timestamp = data[url]['expiry']
if (time.time() - timestamp <= 3600):
if (time.time() - timestamp <= 1800):
return data[url]
else:
data.pop(url, None)
with open(file, 'w') as f:
json.dump(data, f, indent=4)
def driver_select():
    '''
    Build and return the headless Selenium driver for the configured browser.

    The browser name, data directory, browser executable and driver binary
    come from get_browser_config(), get_data_dir(), get_browser_executable()
    and get_driver_binary() respectively.

    Returns:
        A webdriver.Firefox or webdriver.Chrome instance.

    Raises:
        ValueError: if the configured browser is neither 'firefox'
            nor 'chrome'.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
    binary = get_driver_binary()

    if browser == 'firefox':
        fireFox_Options = webdriver.FirefoxOptions()
        ops = [
            "--width=1920", "--height=1080",
            # The flag needs its leading dashes; a bare "headless" is not
            # a command-line option.
            "--headless", "--log fatal"
        ]

        for option in ops:
            fireFox_Options.add_argument(option)

        fireFox_Profile = webdriver.FirefoxProfile()
        fireFox_Profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )

        driver = webdriver.Firefox(
            # sets user-agent
            firefox_profile=fireFox_Profile,
            # sets various firefox settings
            options=fireFox_Options,
            # uses the geckodriver location from the config if there is one,
            # otherwise relies on "geckodriver" being found on PATH
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )

    elif browser == 'chrome':
        from selenium.webdriver.chrome.options import Options

        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = Options()

        ops = [
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
        ]

        for option in ops:
            chrome_options.add_argument(option)

        cap = None

        if executable:
            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

            # Copy first: DesiredCapabilities.CHROME is a shared class-level
            # dict and must not be mutated in place.
            cap = DesiredCapabilities.CHROME.copy()
            # NOTE(review): Selenium also exposes Options.binary_location for
            # this purpose - confirm which one the driver actually honours.
            cap['binary_location'] = executable

        driver = webdriver.Chrome(
            # sets user-agent, and various chrome settings
            options=chrome_options,
            # uses the chromedriver location from the config if there is one,
            # otherwise relies on "chromedriver" being found on PATH
            # (mirrors the firefox branch; None is not a usable path)
            executable_path=(binary if binary else "chromedriver"),
            # None by default; carries the browser binary location when one is configured
            desired_capabilities=cap,
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )

    else:
        # Without this branch `driver` would be unbound at the return below.
        raise ValueError(f'Unsupported selescrape browser: {browser!r}')

    return driver
@ -184,19 +225,19 @@ def cloudflare_wait(driver):
Also, I have made it time out after 50 seconds, which is useful if the target website is not responsive
and stops it from running indefinitely.
'''
abort_after = 50
abort_after = 50 # seconds
start = time.time()
title = driver.title # title = "Just a moment..."
while title == "Just a moment...":
time.sleep(0.25)
while "Just a moment" in title:
time.sleep(0.35)
delta = time.time() - start
if delta >= abort_after:
logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
return 1
title = driver.title
if not title == "Just a moment...":
if not "Just a moment" in title:
break
time.sleep(2) # This is necessary to make sure everything has loaded fine.
return 0
@ -204,10 +245,11 @@ def cloudflare_wait(driver):
def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
params = kwargs.get('params', {})
url = url if not params else url + '?' + urlencode(params)
check_caches = check_cache(url)
if bool(check_caches):
cached_data = check_caches
cached_data = check_cache(url)
if cached_data:
text = cached_data['data']
user_agent = cached_data['user_agent']
request_type = cached_data['method']
@ -215,28 +257,30 @@ def request(request_type, url, **kwargs): # Headers not yet supported , headers
return SeleResponse(url, request_type, text, cookies, user_agent)
else:
driver = driver_select()
driver.get(url)
try:
exit_code = cloudflare_wait(driver)
user_agent = driver.execute_script("return navigator.userAgent;")
cookies = driver.get_cookies()
text = driver.page_source
driver.close()
if exit_code == 0:
pass
else:
if exit_code != 0:
return SeleResponse(url, request_type, None, cookies, user_agent)
seleResponse = SeleResponse(url, request_type, text, cookies, user_agent)
seleResponse = SeleResponse(
url, request_type,
text, cookies,
user_agent
)
cache_request(seleResponse)
return seleResponse
except:
driver.save_screenshot(f"{get_data_dir()}/screenshot.png");
driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
driver.close()
logger.error(f'There was a problem getting the page: {url}.' +
'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')