reworked some logic and also made the code more readable
parent
4f21f6600f
commit
1f9a7dd35f
|
@ -4,14 +4,25 @@ from urllib.parse import urlencode
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from sys import platform
|
from sys import platform
|
||||||
import tempfile
|
import tempfile
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
import click
|
import click
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
|
||||||
|
def open_config():
|
||||||
|
from anime_downloader.config import Config
|
||||||
|
return Config
|
||||||
|
|
||||||
|
|
||||||
serverLogger.setLevel(logging.ERROR)
|
serverLogger.setLevel(logging.ERROR)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
|
||||||
|
data = open_config()
|
||||||
|
|
||||||
|
if not os.path.isdir(TEMP_FOLDER):
|
||||||
|
os.makedirs(TEMP_FOLDER)
|
||||||
|
|
||||||
|
|
||||||
def get_data_dir():
|
def get_data_dir():
|
||||||
|
@ -23,14 +34,6 @@ def get_data_dir():
|
||||||
return os.path.join(click.get_app_dir(APP_NAME), 'data')
|
return os.path.join(click.get_app_dir(APP_NAME), 'data')
|
||||||
|
|
||||||
|
|
||||||
def open_config():
|
|
||||||
from anime_downloader.config import Config
|
|
||||||
return Config
|
|
||||||
|
|
||||||
|
|
||||||
data = open_config()
|
|
||||||
|
|
||||||
|
|
||||||
def get_browser_config():
|
def get_browser_config():
|
||||||
'''
|
'''
|
||||||
Decides what browser selescrape will use.
|
Decides what browser selescrape will use.
|
||||||
|
@ -63,24 +66,31 @@ def get_browser_executable():
|
||||||
|
|
||||||
def get_driver_binary():
|
def get_driver_binary():
|
||||||
value = data['dl']['selescrape_driver_binary_path']
|
value = data['dl']['selescrape_driver_binary_path']
|
||||||
binary_path = value.lower() if value else value
|
if value:
|
||||||
return binary_path
|
return value
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def cache_request(sele_response):
|
def cache_request(sele_response):
|
||||||
"""
|
"""
|
||||||
This function saves the response from a Selenium request in a json.
|
This function saves the response from a Selenium request in a json.
|
||||||
It uses timestamps so that the rest of the code can know if the cache has expired or not.
|
It uses timestamps so that the rest of the code can know if the cache has expired or not.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
|
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
|
||||||
|
|
||||||
if os.path.isfile(file):
|
if os.path.isfile(file):
|
||||||
with open(file, 'r') as f:
|
with open(file, 'r') as f:
|
||||||
tmp_cache = json.load(f)
|
tmp_cache = json.load(f)
|
||||||
else:
|
else:
|
||||||
tmp_cache = {}
|
tmp_cache = {}
|
||||||
|
|
||||||
data = sele_response.__dict__
|
data = sele_response.__dict__
|
||||||
tmp_cache[data['url']] = {
|
url = data['url']
|
||||||
|
url = (url[:-1] if url and url[-1] == '/' else url)
|
||||||
|
|
||||||
|
tmp_cache[url] = {
|
||||||
'data': data['text'],
|
'data': data['text'],
|
||||||
'expiry': time.time(),
|
'expiry': time.time(),
|
||||||
'method': data['method'],
|
'method': data['method'],
|
||||||
|
@ -96,80 +106,111 @@ def check_cache(url):
|
||||||
"""
|
"""
|
||||||
This function checks if the cache file exists,
|
This function checks if the cache file exists,
|
||||||
if it exists then it will read the file
|
if it exists then it will read the file
|
||||||
And it will verify if the cache is less than or equal to 1 hour ago
|
And it will verify that the cached entry is at most 30 minutes old
|
||||||
If it is, it will return it as it is.
|
If it is, it will return it as it is.
|
||||||
If it isn't, it will delete the expired cache from the file and return None
|
If it isn't, it will delete the expired cache from the file and return None
|
||||||
If the file doesn't exist at all it will return None
|
If the file doesn't exist at all it will return None
|
||||||
"""
|
"""
|
||||||
file = os.path.join(tempfile.gettempdir(), 'selenium_cached_requests.json')
|
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
|
||||||
if os.path.isfile(file):
|
if os.path.isfile(file):
|
||||||
|
|
||||||
with open(file, 'r') as f:
|
with open(file, 'r') as f:
|
||||||
data = json.load(f)
|
data = json.load(f)
|
||||||
if url not in data:
|
|
||||||
|
# Yes, this is ugly,
|
||||||
|
# but it's the best way that I found to find the cache
|
||||||
|
# when the url is not exactly the same (a slash at the end or not)
|
||||||
|
clean_url = (url[:-1] if url and url[-1] == '/' else url)
|
||||||
|
found = False
|
||||||
|
|
||||||
|
for link in data:
|
||||||
|
if link == clean_url:
|
||||||
|
url = link
|
||||||
|
found = True
|
||||||
|
|
||||||
|
if not found:
|
||||||
return
|
return
|
||||||
|
|
||||||
timestamp = data[url]['expiry']
|
timestamp = data[url]['expiry']
|
||||||
if (time.time() - timestamp <= 3600):
|
|
||||||
|
if (time.time() - timestamp <= 1800):
|
||||||
return data[url]
|
return data[url]
|
||||||
else:
|
else:
|
||||||
data.pop(url, None)
|
data.pop(url, None)
|
||||||
|
|
||||||
with open(file, 'w') as f:
|
with open(file, 'w') as f:
|
||||||
json.dump(data, f, indent=4)
|
json.dump(data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
def driver_select():
|
def driver_select():
|
||||||
'''
|
'''
|
||||||
it configures what each browser should do
|
This configures what each browser should do
|
||||||
and gives the driver variable that is used
|
and returns the corresponding driver.
|
||||||
to perform any actions below this function.
|
|
||||||
'''
|
'''
|
||||||
browser = get_browser_config()
|
browser = get_browser_config()
|
||||||
data_dir = get_data_dir()
|
data_dir = get_data_dir()
|
||||||
executable = get_browser_executable()
|
executable = get_browser_executable()
|
||||||
driver_binary = get_driver_binary()
|
binary = get_driver_binary()
|
||||||
binary = None if not driver_binary else driver_binary
|
|
||||||
if browser == 'firefox':
|
if browser == 'firefox':
|
||||||
fireFox_Options = webdriver.FirefoxOptions()
|
fireFox_Options = webdriver.FirefoxOptions()
|
||||||
fireFox_Options.headless = True
|
ops = [
|
||||||
fireFox_Options.add_argument('--log fatal')
|
"--width=1920", "--height=1080",
|
||||||
fireFox_Profile = webdriver.FirefoxProfile()
|
"headless", "--log fatal"
|
||||||
fireFox_Profile.set_preference("general.useragent.override", get_random_header()['user-agent'])
|
]
|
||||||
|
|
||||||
if not binary:
|
for option in ops:
|
||||||
driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
|
fireFox_Options.add_argument(option)
|
||||||
else:
|
|
||||||
try:
|
fireFox_Profile = webdriver.FirefoxProfile()
|
||||||
driver = webdriver.Firefox(fireFox_Profile, options=fireFox_Options, service_log_path=os.path.devnull)
|
fireFox_Profile.set_preference(
|
||||||
except:
|
"general.useragent.override", get_random_header()['user-agent']
|
||||||
driver = webdriver.Firefox(fireFox_Profile, executable_path=binary, options=fireFox_Options,
|
)
|
||||||
service_log_path=os.path.devnull)
|
|
||||||
|
driver = webdriver.Firefox(
|
||||||
|
# sets user-agent
|
||||||
|
firefox_profile=fireFox_Profile,
|
||||||
|
# sets various firefox settings
|
||||||
|
options=fireFox_Options,
|
||||||
|
# falls back to "geckodriver" when no driver binary location is set in the config, otherwise it will use the configured path
|
||||||
|
executable_path=(binary if binary else "geckodriver"),
|
||||||
|
# an attempt at stopping selenium from printing a pile of garbage to the console.
|
||||||
|
service_log_path=os.path.devnull
|
||||||
|
)
|
||||||
|
|
||||||
elif browser == 'chrome':
|
elif browser == 'chrome':
|
||||||
profile_path = os.path.join(data_dir, 'Selenium_chromium')
|
|
||||||
log_path = os.path.join(data_dir, 'chromedriver.log')
|
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
|
|
||||||
|
profile_path = os.path.join(data_dir, 'Selenium_chromium')
|
||||||
chrome_options = Options()
|
chrome_options = Options()
|
||||||
ops = ["--headless", "--disable-gpu", '--log-level=OFF', f"--user-data-dir={profile_path}",
|
|
||||||
"--no-sandbox", "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"]
|
ops = [
|
||||||
|
"--headless", "--disable-gpu", '--log-level=OFF',
|
||||||
|
f"--user-data-dir={profile_path}", "--no-sandbox",
|
||||||
|
"--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}" # noqa
|
||||||
|
]
|
||||||
|
|
||||||
for option in ops:
|
for option in ops:
|
||||||
chrome_options.add_argument(option)
|
chrome_options.add_argument(option)
|
||||||
|
|
||||||
|
cap = None
|
||||||
|
|
||||||
if not binary:
|
if not binary:
|
||||||
if not executable:
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
|
||||||
else:
|
cap = DesiredCapabilities.CHROME
|
||||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
cap['binary_location'] = executable
|
||||||
cap = DesiredCapabilities.CHROME
|
|
||||||
cap['binary_location'] = executable
|
driver = webdriver.Chrome(
|
||||||
driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options)
|
# sets user-agent, and various chrome settings
|
||||||
else:
|
options=chrome_options,
|
||||||
if not executable:
|
# by default it will be None, if a chromedriver location is in the config then it will use that
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
executable_path=binary,
|
||||||
else:
|
# by default it will be None, if a binary location is in the config then it will use that
|
||||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
desired_capabilities=cap,
|
||||||
cap = DesiredCapabilities.CHROME
|
# an attempt at stopping selenium from printing a pile of garbage to the console.
|
||||||
cap['binary_location'] = executable
|
service_log_path=os.path.devnull
|
||||||
driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options,
|
)
|
||||||
service_log_path=os.path.devnull)
|
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
|
||||||
|
@ -184,19 +225,19 @@ def cloudflare_wait(driver):
|
||||||
Also, I have made it time out after 50 seconds, useful if the target website is not responsive
|
Also, I have made it time out after 50 seconds, useful if the target website is not responsive
|
||||||
and to stop it from running infinitely.
|
and to stop it from running infinitely.
|
||||||
'''
|
'''
|
||||||
abort_after = 50
|
abort_after = 50 # seconds
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
title = driver.title # title = "Just a moment..."
|
title = driver.title # title = "Just a moment..."
|
||||||
while title == "Just a moment...":
|
while "Just a moment" in title:
|
||||||
time.sleep(0.25)
|
time.sleep(0.35)
|
||||||
delta = time.time() - start
|
delta = time.time() - start
|
||||||
if delta >= abort_after:
|
if delta >= abort_after:
|
||||||
logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
|
logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
|
||||||
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
||||||
return 1
|
return 1
|
||||||
title = driver.title
|
title = driver.title
|
||||||
if not title == "Just a moment...":
|
if not "Just a moment" in title:
|
||||||
break
|
break
|
||||||
time.sleep(2) # This is necessary to make sure everything has loaded fine.
|
time.sleep(2) # This is necessary to make sure everything has loaded fine.
|
||||||
return 0
|
return 0
|
||||||
|
@ -204,10 +245,11 @@ def cloudflare_wait(driver):
|
||||||
|
|
||||||
def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
|
def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
|
||||||
params = kwargs.get('params', {})
|
params = kwargs.get('params', {})
|
||||||
|
|
||||||
url = url if not params else url + '?' + urlencode(params)
|
url = url if not params else url + '?' + urlencode(params)
|
||||||
check_caches = check_cache(url)
|
cached_data = check_cache(url)
|
||||||
if bool(check_caches):
|
|
||||||
cached_data = check_caches
|
if cached_data:
|
||||||
text = cached_data['data']
|
text = cached_data['data']
|
||||||
user_agent = cached_data['user_agent']
|
user_agent = cached_data['user_agent']
|
||||||
request_type = cached_data['method']
|
request_type = cached_data['method']
|
||||||
|
@ -215,28 +257,30 @@ def request(request_type, url, **kwargs): # Headers not yet supported , headers
|
||||||
return SeleResponse(url, request_type, text, cookies, user_agent)
|
return SeleResponse(url, request_type, text, cookies, user_agent)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
|
||||||
driver = driver_select()
|
driver = driver_select()
|
||||||
driver.get(url)
|
driver.get(url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
exit_code = cloudflare_wait(driver)
|
exit_code = cloudflare_wait(driver)
|
||||||
user_agent = driver.execute_script("return navigator.userAgent;")
|
user_agent = driver.execute_script("return navigator.userAgent;")
|
||||||
cookies = driver.get_cookies()
|
cookies = driver.get_cookies()
|
||||||
text = driver.page_source
|
text = driver.page_source
|
||||||
driver.close()
|
driver.close()
|
||||||
if exit_code == 0:
|
|
||||||
pass
|
if exit_code != 0:
|
||||||
else:
|
|
||||||
return SeleResponse(url, request_type, None, cookies, user_agent)
|
return SeleResponse(url, request_type, None, cookies, user_agent)
|
||||||
|
|
||||||
seleResponse = SeleResponse(url, request_type, text, cookies, user_agent)
|
seleResponse = SeleResponse(
|
||||||
|
url, request_type,
|
||||||
|
text, cookies,
|
||||||
|
user_agent
|
||||||
|
)
|
||||||
|
|
||||||
cache_request(seleResponse)
|
cache_request(seleResponse)
|
||||||
return seleResponse
|
return seleResponse
|
||||||
|
|
||||||
except:
|
except:
|
||||||
driver.save_screenshot(f"{get_data_dir()}/screenshot.png");
|
driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
|
||||||
driver.close()
|
driver.close()
|
||||||
logger.error(f'There was a problem getting the page: {url}.' +
|
logger.error(f'There was a problem getting the page: {url}.' +
|
||||||
'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
||||||
|
|
Loading…
Reference in New Issue