Merge pull request #503 from ArjixWasTaken/patch-16
Added a cache system for SeleScrape and improved SeleScrape in general. Branch: master
commit
0fc9613400
|
@ -46,6 +46,8 @@ def setup(func):
|
|||
cf : bool
|
||||
cf if True performs the request through cfscrape.
|
||||
For cloudflare protected sites.
|
||||
sel : bool
|
||||
sel if True performs the request through selescrape (selenium).
|
||||
referer : str
|
||||
a url sent as referer in request headers
|
||||
'''
|
||||
|
@ -57,6 +59,7 @@ def setup(func):
|
|||
from selenium import webdriver
|
||||
from anime_downloader.sites.helpers import selescrape
|
||||
sess = selescrape
|
||||
sess.cache = cache
|
||||
except ImportError:
|
||||
sess = cf_session
|
||||
logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .")
|
||||
|
@ -107,6 +110,8 @@ def get(url: str,
|
|||
cf : bool
|
||||
cf if True performs the request through cfscrape.
|
||||
For cloudflare protected sites.
|
||||
sel : bool
|
||||
sel if True performs the request through selescrape (selenium).
|
||||
referer : str
|
||||
a url sent as referer in request headers
|
||||
'''
|
||||
|
|
|
@ -1,31 +1,14 @@
|
|||
from selenium.webdriver.support import expected_conditions as EC
|
||||
from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
|
||||
from selenium.webdriver.support.ui import WebDriverWait
|
||||
from anime_downloader.const import get_random_header
|
||||
from selenium.webdriver.common.by import By
|
||||
from urllib.parse import urlencode
|
||||
from urllib.parse import urlsplit
|
||||
from selenium import webdriver
|
||||
from bs4 import BeautifulSoup
|
||||
from logging import exception
|
||||
from sys import platform
|
||||
import requests
|
||||
import os
|
||||
import tempfile
|
||||
import logging
|
||||
import click
|
||||
import time
|
||||
import json
|
||||
serverLogger.setLevel(logging.ERROR)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def get_data_dir():
    '''
    Return the folder in which selescrape stores its data,
    such as cookies, browser extensions and logs.
    '''
    app_dir = click.get_app_dir('anime downloader')
    return os.path.join(app_dir, 'data')
|
||||
import os
|
||||
|
||||
|
||||
def open_config():
|
||||
|
@ -33,8 +16,24 @@ def open_config():
|
|||
return Config
|
||||
|
||||
|
||||
# Caching of selenium responses is off by default; callers switch it on by
# assigning to this module attribute.
cache = False

# Silence selenium's remote-connection logger below ERROR level.
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Folder (inside the system temp directory) that holds the request cache.
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
data = open_config()

# exist_ok=True replaces the former isdir()/makedirs() pair, which could fail
# if another process created the folder between the check and the creation.
os.makedirs(TEMP_FOLDER, exist_ok=True)
|
||||
|
||||
|
||||
def get_data_dir():
    '''
    Return the directory selescrape uses to store data,
    such as cookies or browser extensions and logs.
    '''
    base_dir = click.get_app_dir('anime downloader')
    return os.path.join(base_dir, 'data')
|
||||
|
||||
|
||||
def get_browser_config():
|
||||
'''
|
||||
|
@ -50,148 +49,248 @@ def get_browser_config():
|
|||
browser = os_browser[a]
|
||||
else:
|
||||
browser = 'chrome'
|
||||
|
||||
value = data['dl']['selescrape_browser']
|
||||
value = value.lower() if value else value
|
||||
|
||||
if value in ['chrome', 'firefox']:
|
||||
browser = value
|
||||
|
||||
return browser
|
||||
|
||||
|
||||
def get_browser_executable():
    '''
    Return the browser executable path from the config.

    Reads ``data['dl']['selescrape_browser_executable_path']`` and returns
    it unchanged, or None when the setting is empty or unset.
    '''
    value = data['dl']['selescrape_browser_executable_path']
    # The path is deliberately NOT lower-cased: file names are case-sensitive
    # on most non-Windows file systems, so changing the case could make the
    # path point at a file that does not exist.
    return value if value else None
|
||||
|
||||
|
||||
def get_driver_binary():
    '''
    Return the webdriver binary path from the config.

    Reads ``data['dl']['selescrape_driver_binary_path']`` and returns it
    unchanged, or None when the setting is empty or unset.
    '''
    value = data['dl']['selescrape_driver_binary_path']
    # No lower-casing here: the value is a file system path and must keep
    # its case to stay valid on case-sensitive file systems.
    return value if value else None
|
||||
|
||||
|
||||
def add_url_params(url, params):
    '''
    Append ``params`` to ``url`` as a URL-encoded query string.

    Returns ``url`` unchanged when ``params`` is empty. When ``url`` already
    contains a query string, the new parameters are joined with ``&``
    instead of a second ``?``.
    '''
    if not params:
        return url
    separator = '&' if '?' in url else '?'
    return url + separator + urlencode(params)
|
||||
def cache_request(sele_response):
    """
    Save the response of a Selenium request in the JSON cache file.

    Entries are keyed by the response URL with one trailing slash removed.
    Each entry stores the page text, method, cookies, user agent and, under
    the 'expiry' key, the time the entry was written, so that its age can be
    checked later.

    Does nothing when the module-level ``cache`` flag is off.
    """
    if not cache:
        return

    cache_file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')

    # A missing or unreadable cache file must not break the request that
    # produced this response, so fall back to an empty cache.
    try:
        with open(cache_file, 'r') as f:
            cached = json.load(f)
    except (FileNotFoundError, ValueError):
        cached = {}
    if not isinstance(cached, dict):
        cached = {}

    url = sele_response.url
    if url and url[-1] == '/':
        url = url[:-1]

    cached[url] = {
        'data': sele_response.text,
        # Despite the key name this is the creation time, not a deadline.
        'expiry': time.time(),
        'method': sele_response.method,
        'cookies': sele_response.cookies,
        'user_agent': sele_response.user_agent
    }

    with open(cache_file, 'w') as f:
        json.dump(cached, f, indent=4)
|
||||
|
||||
|
||||
def check_cache(url):
    """
    Look up ``url`` in the JSON cache file.

    The lookup ignores one trailing slash on ``url``. Returns the cached
    entry (a dict) when it is at most 30 minutes old. An older entry is
    deleted from the cache file and None is returned. None is also returned
    when caching is disabled, when the cache file is missing or unreadable,
    or when the URL is not cached.
    """
    if not cache:
        return None

    cache_file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')

    try:
        with open(cache_file, 'r') as f:
            entries = json.load(f)
    except (FileNotFoundError, ValueError):
        # No cache yet, or a corrupt file: treat both as a cache miss.
        return None
    if not isinstance(entries, dict):
        return None

    key = url[:-1] if url and url[-1] == '/' else url
    entry = entries.get(key)
    if entry is None:
        return None

    max_age = 1800  # seconds (30 minutes)
    if time.time() - entry['expiry'] <= max_age:
        return entry

    # The entry has expired: drop it and persist the trimmed cache.
    del entries[key]
    with open(cache_file, 'w') as f:
        json.dump(entries, f, indent=4)
    return None
|
||||
|
||||
|
||||
def driver_select():
    '''
    Configure the browser chosen in the config and return its driver.

    The browser name, the optional browser executable and the optional
    webdriver binary come from get_browser_config(),
    get_browser_executable() and get_driver_binary(). Both supported
    browsers are started headless, with a 1920x1080 window and a random
    user agent.

    Raises ValueError when the configured browser is neither 'firefox'
    nor 'chrome'.
    '''
    browser = get_browser_config()
    executable = get_browser_executable()
    binary = get_driver_binary()

    if browser == 'firefox':
        firefox_options = webdriver.FirefoxOptions()
        for option in ("--width=1920", "--height=1080",
                       "-headless", "--log fatal"):
            firefox_options.add_argument(option)

        firefox_profile = webdriver.FirefoxProfile()
        firefox_profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )

        return webdriver.Firefox(
            # sets user-agent
            firefox_profile=firefox_profile,
            # sets various firefox settings
            options=firefox_options,
            # None by default; the config may point at a firefox binary
            firefox_binary=executable or None,
            # "geckodriver" by default; the config may override its location
            executable_path=binary or "geckodriver",
            # an attempt at stopping selenium from printing to the console
            service_log_path=os.path.devnull
        )

    if browser == 'chrome':
        from selenium.webdriver.chrome.options import Options

        profile_path = os.path.join(get_data_dir(), 'Selenium_chromium')
        chrome_options = Options()
        for option in (
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080",
            f"user-agent={get_random_header()['user-agent']}"
        ):
            chrome_options.add_argument(option)

        if executable:
            # Set the browser binary through the options object rather than
            # writing a 'binary_location' key into DesiredCapabilities.CHROME:
            # that dict is shared by every user of selenium in the process,
            # and the key is not a standard top-level capability.
            chrome_options.binary_location = executable

        return webdriver.Chrome(
            # sets user-agent, browser binary and various chrome settings
            options=chrome_options,
            # "chromedriver" by default; the config may override its location
            executable_path=binary or "chromedriver",
            # an attempt at stopping selenium from printing to the console
            service_log_path=os.path.devnull
        )

    # Previously this fell through to an UnboundLocalError on `driver`.
    raise ValueError(f'Unsupported selescrape browser: {browser!r}')
|
||||
|
||||
|
||||
def status_select(driver, url, status='hide'):
    '''
    Load ``url`` in ``driver``, optionally checking the HTTP status first.

    With ``status='show'`` a HEAD request is sent through the requests
    library before the page is loaded; a 503 answer raises RuntimeError,
    otherwise the status code is returned after the page has been loaded.
    With any other ``status`` value the page is only loaded and None is
    returned.

    Raises RuntimeError when the HEAD request cannot connect.
    '''
    if status != 'show':
        # 'hide' and every unknown value behave the same: just load the page.
        driver.get(url)
        return None

    try:
        r = requests.head(url)
    except requests.ConnectionError as err:
        # Chain the cause so the underlying network error stays visible.
        raise RuntimeError("Failed to establish a connection using the requests library.") from err

    if r.status_code == 503:
        raise RuntimeError("This website's sevice is unavailable or has cloudflare on.")
    driver.get(url)
    return r.status_code
|
||||
|
||||
|
||||
def cloudflare_wait(driver, abort_after=50, poll_interval=0.35, settle_time=2):
    '''
    Wait until the Cloudflare check page has gone away.

    The page title is polled every ``poll_interval`` seconds, and for as
    long as it contains "Just a moment" the function keeps waiting. After
    ``abort_after`` seconds it gives up, so an unresponsive website cannot
    make it run forever; in that case a screenshot is saved (best effort)
    and an error is logged.

    When the page is not (or no longer) a Cloudflare check page, the
    function sleeps ``settle_time`` seconds to let the page finish loading.

    Returns 0 when the page is ready and 1 on timeout.
    '''
    # monotonic() is immune to system clock adjustments while waiting.
    start = time.monotonic()

    while "Just a moment" in (driver.title or ''):
        if time.monotonic() - start >= abort_after:
            screenshot = f"{get_data_dir()}/screenshot.png"
            # The log message points at this file, so make sure it is
            # actually written; failing to do so must not hide the timeout.
            try:
                driver.save_screenshot(screenshot)
            except Exception:
                logger.debug('Could not save a screenshot to %s', screenshot)
            logger.error(
                'Timeout:\tCouldnt bypass cloudflare. '
                'See the screenshot for more info:\t%s', screenshot
            )
            return 1
        time.sleep(poll_interval)

    time.sleep(settle_time)  # This is necessary to make sure everything has loaded fine.
    return 0
|
||||
|
||||
|
||||
def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    '''
    Fetch ``url`` through a selenium-driven browser and return a SeleResponse.

    ``kwargs['params']``, when given, is URL-encoded and appended to ``url``.
    A cached response is returned if check_cache() has one. Otherwise a
    browser is started, the page is loaded, the Cloudflare check is waited
    out, and the result is stored through cache_request().

    Returns a SeleResponse; its text is None when the Cloudflare wait timed
    out. Returns None when loading the page raised an error (the error is
    logged and a screenshot is attempted).
    '''
    params = kwargs.get('params', {})
    url = url if not params else url + '?' + urlencode(params)

    cached_data = check_cache(url)
    if cached_data:
        return SeleResponse(
            url, cached_data['method'],
            cached_data['data'], cached_data['cookies'],
            cached_data['user_agent']
        )

    driver = driver_select()
    try:
        driver.get(url)
        exit_code = cloudflare_wait(driver)
        user_agent = driver.execute_script("return navigator.userAgent;")
        cookies = driver.get_cookies()
        text = driver.page_source
    except Exception:
        screenshot = f"{get_data_dir()}/screenshot.png"
        # Best effort: the browser may be in too bad a state to screenshot.
        try:
            driver.save_screenshot(screenshot)
        except Exception:
            logger.debug('Could not save a screenshot to %s', screenshot)
        logger.exception(
            'There was a problem getting the page: %s.'
            '\nSee the screenshot for more info:\t%s', url, screenshot
        )
        return None
    finally:
        # quit() ends the whole browser session, so no browser or driver
        # process is left behind whether the page load succeeded or not.
        driver.quit()

    if exit_code != 0:
        return SeleResponse(url, request_type, None, cookies, user_agent)

    sele_response = SeleResponse(
        url, request_type,
        text, cookies,
        user_agent
    )

    # A cache write failure must not throw away a successfully fetched page.
    try:
        cache_request(sele_response)
    except (OSError, TypeError, ValueError):
        logger.warning('Could not cache the response for %s', url)

    return sele_response
|
||||
|
||||
|
||||
class SeleResponse:
|
||||
|
@ -224,5 +323,5 @@ class SeleResponse:
|
|||
return self.text
|
||||
|
||||
def __repr__(self):
|
||||
return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES {} USERAGENT {}>'.format(
|
||||
return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
|
||||
self.url, self.method, self.text, self.cookies, self.user_agent)
|
||||
|
|
Loading…
Reference in New Issue