Merge pull request #503 from ArjixWasTaken/patch-16

Added a cache system for SeleScrape and improved SeleScrape generally
master
Arjix 2021-08-20 16:17:26 +03:00 committed by GitHub
commit 0fc9613400
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 217 additions and 113 deletions

View File

@ -46,6 +46,8 @@ def setup(func):
cf : bool
cf if True performs the request through cfscrape.
For cloudflare protected sites.
sel : bool
sel if True performs the request through selescrape (selenium).
referer : str
a url sent as referer in request headers
'''
@ -57,6 +59,7 @@ def setup(func):
from selenium import webdriver
from anime_downloader.sites.helpers import selescrape
sess = selescrape
sess.cache = cache
except ImportError:
sess = cf_session
logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .")
@ -107,6 +110,8 @@ def get(url: str,
cf : bool
cf if True performs the request through cfscrape.
For cloudflare protected sites.
sel : bool
sel if True performs the request through selescrape (selenium).
referer : str
a url sent as referer in request headers
'''

View File

@ -1,31 +1,14 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
from selenium.webdriver.support.ui import WebDriverWait
from anime_downloader.const import get_random_header
from selenium.webdriver.common.by import By
from urllib.parse import urlencode
from urllib.parse import urlsplit
from selenium import webdriver
from bs4 import BeautifulSoup
from logging import exception
from sys import platform
import requests
import os
import tempfile
import logging
import click
import time
import json
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
def get_data_dir():
    '''
    Return the folder where selescrape stores its data,
    such as cookies, browser extensions and logs.
    '''
    APP_NAME = 'anime downloader'
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
import os
def open_config():
@ -33,8 +16,24 @@ def open_config():
return Config
# Module-level state, configured at import time.
cache = False  # toggled by the caller (see sites.helpers.request) to enable on-disk caching
serverLogger.setLevel(logging.ERROR)
logger = logging.getLogger(__name__)

# Folder holding the cached Selenium responses, shared across runs.
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
# NOTE(review): reads the user config at import time — side effect on import.
data = open_config()

if not os.path.isdir(TEMP_FOLDER):
    os.makedirs(TEMP_FOLDER)
def get_data_dir():
    '''
    Return the folder where selescrape stores its data,
    such as cookies, browser extensions and logs.
    '''
    APP_NAME = 'anime downloader'
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
def get_browser_config():
'''
@ -50,148 +49,248 @@ def get_browser_config():
browser = os_browser[a]
else:
browser = 'chrome'
value = data['dl']['selescrape_browser']
value = value.lower() if value else value
if value in ['chrome', 'firefox']:
browser = value
return browser
def get_browser_executable():
    '''
    Return the browser executable path from the user config,
    or None when the option is unset.
    '''
    value = data['dl']['selescrape_browser_executable_path']
    # NOTE(review): lower-casing a filesystem path is suspect on
    # case-sensitive systems — confirm this is intended.
    executable_value = value.lower() if value else value
    if executable_value:
        return executable_value
def get_driver_binary():
    '''
    Return the webdriver binary path from the user config,
    or None when the option is unset.
    '''
    value = data['dl']['selescrape_driver_binary_path']
    if value:
        return value
def add_url_params(url, params):
    '''
    Return `url` with `params` appended as a URL-encoded query string.

    The url is returned unchanged when `params` is empty or None.
    '''
    return url if not params else url + '?' + urlencode(params)
def cache_request(sele_response):
    """
    Save the response from a Selenium request in a JSON cache file.

    A timestamp is stored with each entry so check_cache() can tell
    whether the cached response has expired. Does nothing when the
    module-level `cache` flag is False.
    """
    if not cache:
        return
    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if os.path.isfile(file):
        with open(file, 'r') as f:
            tmp_cache = json.load(f)
    else:
        tmp_cache = {}
    # renamed from `data` to avoid shadowing the module-level config dict
    response = sele_response.__dict__
    url = response['url']
    # Normalise the URL (strip one trailing slash) so check_cache()
    # hits regardless of whether the caller appended a "/".
    url = url[:-1] if url and url[-1] == '/' else url
    tmp_cache[url] = {
        'data': response['text'],
        'expiry': time.time(),
        'method': response['method'],
        'cookies': response['cookies'],
        'user_agent': response['user_agent']
    }
    with open(file, 'w') as f:
        json.dump(tmp_cache, f, indent=4)
def check_cache(url):
    """
    Look up `url` in the JSON cache file.

    Returns the cached entry if it is at most 30 minutes old.
    An expired entry is deleted from the file and None is returned.
    Also returns None when caching is disabled, the cache file does
    not exist, or the url has no entry.
    """
    if not cache:
        return
    file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if not os.path.isfile(file):
        return
    with open(file, 'r') as f:
        # renamed from `data` to avoid shadowing the module-level config dict
        cached = json.load(f)
    # Normalise the URL the same way cache_request() does, so a
    # trailing slash does not cause a cache miss.
    clean_url = url[:-1] if url and url[-1] == '/' else url
    if clean_url not in cached:
        return
    timestamp = cached[clean_url]['expiry']
    if time.time() - timestamp <= 1800:  # 30 minutes
        return cached[clean_url]
    # Entry expired: drop it and rewrite the cache file.
    cached.pop(clean_url, None)
    with open(file, 'w') as f:
        json.dump(cached, f, indent=4)
def driver_select():
    '''
    Configure what each supported browser (firefox/chrome) should do
    and return the corresponding Selenium webdriver.
    '''
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
    binary = get_driver_binary()
    if browser == 'firefox':
        fireFox_Options = webdriver.FirefoxOptions()
        ops = [
            "--width=1920", "--height=1080",
            "-headless", "--log fatal"
        ]
        for option in ops:
            fireFox_Options.add_argument(option)
        fireFox_Profile = webdriver.FirefoxProfile()
        fireFox_Profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )
        driver = webdriver.Firefox(
            # sets user-agent
            firefox_profile=fireFox_Profile,
            # sets various firefox settings
            options=fireFox_Options,
            # by default it will be None, if a binary location is in the config then it will use that
            firefox_binary=None if not executable else executable,
            # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    elif browser == 'chrome':
        from selenium.webdriver.chrome.options import Options
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = Options()
        ops = [
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
        ]
        for option in ops:
            chrome_options.add_argument(option)
        cap = None
        if executable:
            from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
            cap = DesiredCapabilities.CHROME
            cap['binary_location'] = executable
        driver = webdriver.Chrome(
            # sets user-agent, and various chrome settings
            options=chrome_options,
            # by default it will be "chromedriver", if a chromedriver location is in the config then it will use that
            executable_path=(binary if binary else "chromedriver"),
            # by default it will be None, if a binary location is in the config then it will use that
            desired_capabilities=cap,
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull
        )
    return driver
def status_select(driver, url, status='hide'):
    '''
    Navigate `driver` to `url`.

    For now it doesn't do what its name suggests: a status reporter of
    the HTTP response code is planned. With status='show' a HEAD request
    is issued first and its status code returned; any other value simply
    loads the page. This function is kept because it is part of the core.

    Raises RuntimeError on a 503 response or on a connection failure.
    '''
    try:
        if status == 'hide':
            driver.get(url)
        elif status == 'show':
            r = requests.head(url)
            if r.status_code == 503:
                # fixed typo: "sevice" -> "service"
                raise RuntimeError("This website's service is unavailable or has cloudflare on.")
            driver.get(url)
            return r.status_code
        else:
            driver.get(url)
    except requests.ConnectionError:
        raise RuntimeError("Failed to establish a connection using the requests library.")
def cloudflare_wait(driver):
    '''
    Wait until the Cloudflare challenge has gone away before doing any
    further actions.

    It works by polling the page title: as long as it contains
    "Just a moment" it keeps waiting. If the target website has no
    Cloudflare redirection this only costs the final settle sleep.
    Times out after 50 seconds, useful if the target website is not
    responsive, and to stop it from running infinitely.

    Returns 0 on success and 1 on timeout.
    '''
    abort_after = 50  # seconds
    start = time.time()
    title = driver.title  # title = "Just a moment..."
    while "Just a moment" in title:
        time.sleep(0.35)
        if time.time() - start >= abort_after:
            logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
            return 1
        title = driver.title
    time.sleep(2)  # This is necessary to make sure everything has loaded fine.
    return 0
def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    '''
    Perform a request through selenium, consulting the on-disk cache first.

    Returns a SeleResponse; its `text` is None when the Cloudflare wait
    timed out, and the return is None when the page load itself failed
    (a screenshot is saved for debugging in that case).
    '''
    params = kwargs.get('params', {})
    # consistency: reuse the add_url_params helper instead of inlining urlencode
    url = add_url_params(url, params)

    cached_data = check_cache(url)
    if cached_data:
        # Serve the whole response (body, cookies, UA, method) from cache.
        text = cached_data['data']
        user_agent = cached_data['user_agent']
        request_type = cached_data['method']
        cookies = cached_data['cookies']
        return SeleResponse(url, request_type, text, cookies, user_agent)

    driver = driver_select()
    driver.get(url)
    try:
        exit_code = cloudflare_wait(driver)
        # dirty, but allows for all sorts of things above
        user_agent = driver.execute_script("return navigator.userAgent;")
        cookies = driver.get_cookies()
        text = driver.page_source
        driver.close()
        if exit_code != 0:
            # Cloudflare wait timed out: return a response with no body.
            return SeleResponse(url, request_type, None, cookies, user_agent)
        seleResponse = SeleResponse(
            url, request_type,
            text, cookies,
            user_agent
        )
        cache_request(seleResponse)
        return seleResponse
    except Exception:  # narrowed from a bare except so Ctrl-C still works
        driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
        driver.close()
        # fixed: second line was not an f-string, so {get_data_dir()} was printed literally
        logger.error(f'There was a problem getting the page: {url}.'
                     f'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
        return
class SeleResponse:
@ -224,5 +323,5 @@ class SeleResponse:
return self.text
def __repr__(self):
    # Debug representation exposing all response fields.
    return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
        self.url, self.method, self.text, self.cookies, self.user_agent)