Merge pull request #503 from ArjixWasTaken/patch-16
added a cache system for SeleScrape and Improve SeleScrape generally
master
commit
0fc9613400
|
@ -46,6 +46,8 @@ def setup(func):
|
||||||
cf : bool
|
cf : bool
|
||||||
cf if True performs the request through cfscrape.
|
cf if True performs the request through cfscrape.
|
||||||
For cloudflare protected sites.
|
For cloudflare protected sites.
|
||||||
|
sel : bool
|
||||||
|
sel if True performs the request through selescrape (selenium).
|
||||||
referer : str
|
referer : str
|
||||||
a url sent as referer in request headers
|
a url sent as referer in request headers
|
||||||
'''
|
'''
|
||||||
|
@ -57,6 +59,7 @@ def setup(func):
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from anime_downloader.sites.helpers import selescrape
|
from anime_downloader.sites.helpers import selescrape
|
||||||
sess = selescrape
|
sess = selescrape
|
||||||
|
sess.cache = cache
|
||||||
except ImportError:
|
except ImportError:
|
||||||
sess = cf_session
|
sess = cf_session
|
||||||
logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .")
|
logger.warning("This provider may not work correctly because it requires selenium to work.\nIf you want to install it then run: 'pip install selenium' .")
|
||||||
|
@ -107,6 +110,8 @@ def get(url: str,
|
||||||
cf : bool
|
cf : bool
|
||||||
cf if True performs the request through cfscrape.
|
cf if True performs the request through cfscrape.
|
||||||
For cloudflare protected sites.
|
For cloudflare protected sites.
|
||||||
|
sel : bool
|
||||||
|
sel if True performs the request through selescrape (selenium).
|
||||||
referer : str
|
referer : str
|
||||||
a url sent as referer in request headers
|
a url sent as referer in request headers
|
||||||
'''
|
'''
|
||||||
|
|
|
@ -1,31 +1,14 @@
|
||||||
from selenium.webdriver.support import expected_conditions as EC
|
|
||||||
from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
|
from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
|
||||||
from anime_downloader.const import get_random_header
|
from anime_downloader.const import get_random_header
|
||||||
from selenium.webdriver.common.by import By
|
|
||||||
from urllib.parse import urlencode
|
from urllib.parse import urlencode
|
||||||
from urllib.parse import urlsplit
|
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
from logging import exception
|
|
||||||
from sys import platform
|
from sys import platform
|
||||||
import requests
|
import tempfile
|
||||||
import os
|
|
||||||
import logging
|
import logging
|
||||||
import click
|
import click
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
serverLogger.setLevel(logging.ERROR)
|
import os
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def get_data_dir():
|
|
||||||
'''
|
|
||||||
Gets the folder directory selescrape will store data,
|
|
||||||
such as cookies or browser extensions and logs.
|
|
||||||
'''
|
|
||||||
APP_NAME = 'anime downloader'
|
|
||||||
return os.path.join(click.get_app_dir(APP_NAME), 'data')
|
|
||||||
|
|
||||||
|
|
||||||
def open_config():
|
def open_config():
|
||||||
|
@ -33,8 +16,24 @@ def open_config():
|
||||||
return Config
|
return Config
|
||||||
|
|
||||||
|
|
||||||
|
cache = False
|
||||||
|
serverLogger.setLevel(logging.ERROR)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
|
||||||
data = open_config()
|
data = open_config()
|
||||||
|
|
||||||
|
if not os.path.isdir(TEMP_FOLDER):
|
||||||
|
os.makedirs(TEMP_FOLDER)
|
||||||
|
|
||||||
|
|
||||||
|
def get_data_dir():
|
||||||
|
'''
|
||||||
|
Gets the folder directory selescrape will store data,
|
||||||
|
such as cookies or browser extensions and logs.
|
||||||
|
'''
|
||||||
|
APP_NAME = 'anime downloader'
|
||||||
|
return os.path.join(click.get_app_dir(APP_NAME), 'data')
|
||||||
|
|
||||||
|
|
||||||
def get_browser_config():
|
def get_browser_config():
|
||||||
'''
|
'''
|
||||||
|
@ -50,148 +49,248 @@ def get_browser_config():
|
||||||
browser = os_browser[a]
|
browser = os_browser[a]
|
||||||
else:
|
else:
|
||||||
browser = 'chrome'
|
browser = 'chrome'
|
||||||
|
|
||||||
value = data['dl']['selescrape_browser']
|
value = data['dl']['selescrape_browser']
|
||||||
value = value.lower() if value else value
|
value = value.lower() if value else value
|
||||||
|
|
||||||
if value in ['chrome', 'firefox']:
|
if value in ['chrome', 'firefox']:
|
||||||
browser = value
|
browser = value
|
||||||
|
|
||||||
return browser
|
return browser
|
||||||
|
|
||||||
|
|
||||||
def get_browser_executable():
|
def get_browser_executable():
|
||||||
value = data['dl']['selescrape_browser_executable_path']
|
value = data['dl']['selescrape_browser_executable_path']
|
||||||
executable_value = value.lower() if value else value
|
executable_value = value.lower() if value else value
|
||||||
return executable_value
|
if executable_value:
|
||||||
|
return executable_value
|
||||||
|
|
||||||
|
|
||||||
def get_driver_binary():
|
def get_driver_binary():
|
||||||
value = data['dl']['selescrape_driver_binary_path']
|
value = data['dl']['selescrape_driver_binary_path']
|
||||||
binary_path = value.lower() if value else value
|
if value:
|
||||||
return binary_path
|
return value
|
||||||
|
|
||||||
|
|
||||||
def add_url_params(url, params):
|
def cache_request(sele_response):
|
||||||
return url if not params else url + '?' + urlencode(params)
|
"""
|
||||||
|
This function saves the response from a Selenium request in a json.
|
||||||
|
It uses timestamps to determine whether the cache has expired or not.
|
||||||
|
"""
|
||||||
|
if not cache:
|
||||||
|
return
|
||||||
|
|
||||||
|
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
|
||||||
|
|
||||||
|
if os.path.isfile(file):
|
||||||
|
with open(file, 'r') as f:
|
||||||
|
tmp_cache = json.load(f)
|
||||||
|
else:
|
||||||
|
tmp_cache = {}
|
||||||
|
|
||||||
|
data = sele_response.__dict__
|
||||||
|
url = data['url']
|
||||||
|
url = (url[:-1] if url and url[-1] == '/' else url)
|
||||||
|
|
||||||
|
tmp_cache[url] = {
|
||||||
|
'data': data['text'],
|
||||||
|
'expiry': time.time(),
|
||||||
|
'method': data['method'],
|
||||||
|
'cookies': data['cookies'],
|
||||||
|
'user_agent': data['user_agent']
|
||||||
|
}
|
||||||
|
|
||||||
|
with open(file, 'w') as f:
|
||||||
|
json.dump(tmp_cache, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
|
def check_cache(url):
|
||||||
|
"""
|
||||||
|
This function checks if the cache file exists,
|
||||||
|
if it exists then it will read the file
|
||||||
|
And it will verify if the cache is less than or equal to 30 mins old
|
||||||
|
If it is, it will return it as it is.
|
||||||
|
If it isn't, it will delete the expired cache from the file and return None
|
||||||
|
If the file doesn't exist at all it will return None
|
||||||
|
"""
|
||||||
|
if not cache:
|
||||||
|
return
|
||||||
|
file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
|
||||||
|
if os.path.isfile(file):
|
||||||
|
|
||||||
|
with open(file, 'r') as f:
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# Yes, this is ugly,
|
||||||
|
# but it's the best way that I found to find the cache
|
||||||
|
# when the url is not exactly the same (a slash at the end or not)
|
||||||
|
clean_url = (url[:-1] if url and url[-1] == '/' else url)
|
||||||
|
found = False
|
||||||
|
|
||||||
|
for link in data:
|
||||||
|
if link == clean_url:
|
||||||
|
url = link
|
||||||
|
found = True
|
||||||
|
|
||||||
|
if not found:
|
||||||
|
return
|
||||||
|
|
||||||
|
timestamp = data[url]['expiry']
|
||||||
|
|
||||||
|
if (time.time() - timestamp <= 1800):
|
||||||
|
return data[url]
|
||||||
|
else:
|
||||||
|
data.pop(url, None)
|
||||||
|
|
||||||
|
with open(file, 'w') as f:
|
||||||
|
json.dump(data, f, indent=4)
|
||||||
|
|
||||||
|
|
||||||
def driver_select():
|
def driver_select():
|
||||||
'''
|
'''
|
||||||
it configures what each browser should do
|
This configures what each browser should do
|
||||||
and gives the driver variable that is used
|
and returns the corresponding driver.
|
||||||
to perform any actions below this function.
|
|
||||||
'''
|
'''
|
||||||
browser = get_browser_config()
|
browser = get_browser_config()
|
||||||
data_dir = get_data_dir()
|
data_dir = get_data_dir()
|
||||||
executable = get_browser_executable()
|
executable = get_browser_executable()
|
||||||
driver_binary = get_driver_binary()
|
binary = get_driver_binary()
|
||||||
binary = None if not driver_binary else driver_binary
|
|
||||||
if browser == 'firefox':
|
if browser == 'firefox':
|
||||||
fireFoxOptions = webdriver.FirefoxOptions()
|
fireFox_Options = webdriver.FirefoxOptions()
|
||||||
fireFoxOptions.headless = True
|
ops = [
|
||||||
fireFoxOptions.add_argument('--log fatal')
|
"--width=1920", "--height=1080",
|
||||||
if binary == None:
|
"-headless", "--log fatal"
|
||||||
driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull)
|
]
|
||||||
else:
|
|
||||||
try:
|
for option in ops:
|
||||||
driver = webdriver.Firefox(options=fireFoxOptions, service_log_path=os.path.devnull)
|
fireFox_Options.add_argument(option)
|
||||||
except:
|
|
||||||
driver = webdriver.Firefox(executable_path=binary, options=fireFoxOptions, service_log_path=os.path.devnull)
|
fireFox_Profile = webdriver.FirefoxProfile()
|
||||||
|
fireFox_Profile.set_preference(
|
||||||
|
"general.useragent.override", get_random_header()['user-agent']
|
||||||
|
)
|
||||||
|
|
||||||
|
driver = webdriver.Firefox(
|
||||||
|
# sets user-agent
|
||||||
|
firefox_profile=fireFox_Profile,
|
||||||
|
# sets various firefox settings
|
||||||
|
options=fireFox_Options,
|
||||||
|
# by default it will be None, if a binary location is in the config then it will use that
|
||||||
|
firefox_binary=None if not executable else executable,
|
||||||
|
# by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
|
||||||
|
executable_path=(binary if binary else "geckodriver"),
|
||||||
|
# an attempt at stopping selenium from printing a pile of garbage to the console.
|
||||||
|
service_log_path=os.path.devnull
|
||||||
|
)
|
||||||
|
|
||||||
elif browser == 'chrome':
|
elif browser == 'chrome':
|
||||||
from selenium.webdriver.chrome.options import Options
|
from selenium.webdriver.chrome.options import Options
|
||||||
chrome_options = Options()
|
|
||||||
chrome_options.add_argument("--headless")
|
|
||||||
chrome_options.add_argument("--disable-gpu")
|
|
||||||
profile_path = os.path.join(data_dir, 'Selenium_chromium')
|
profile_path = os.path.join(data_dir, 'Selenium_chromium')
|
||||||
log_path = os.path.join(data_dir, 'chromedriver.log')
|
chrome_options = Options()
|
||||||
chrome_options.add_argument('--log-level=OFF')
|
|
||||||
chrome_options.add_argument(f"--user-data-dir={profile_path}")
|
ops = [
|
||||||
chrome_options.add_argument("--no-sandbox")
|
"--headless", "--disable-gpu", '--log-level=OFF',
|
||||||
chrome_options.add_argument("--window-size=1920,1080")
|
f"--user-data-dir={profile_path}", "--no-sandbox",
|
||||||
chrome_options.add_argument(f'user-agent={get_random_header()}')
|
"--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}" # noqa
|
||||||
if binary == None:
|
]
|
||||||
if executable == None:
|
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
for option in ops:
|
||||||
else:
|
chrome_options.add_argument(option)
|
||||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
||||||
cap = DesiredCapabilities.CHROME
|
cap = None
|
||||||
cap['binary_location'] = executable
|
|
||||||
driver = webdriver.Chrome(desired_capabilities=cap, options=chrome_options)
|
if executable:
|
||||||
else:
|
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
||||||
if executable == None:
|
|
||||||
driver = webdriver.Chrome(options=chrome_options)
|
cap = DesiredCapabilities.CHROME
|
||||||
else:
|
cap['binary_location'] = executable
|
||||||
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
|
||||||
cap = DesiredCapabilities.CHROME
|
driver = webdriver.Chrome(
|
||||||
cap['binary_location'] = executable
|
# sets user-agent, and various chrome settings
|
||||||
driver = webdriver.Chrome(executable_path=binary, desired_capabilities=cap, options=chrome_options, service_log_path=os.path.devnull)
|
options=chrome_options,
|
||||||
|
# by default it will be "chromedriver", if a chromedriver location is in the config then it will use that
|
||||||
|
executable_path=(binary if binary else "chromedriver"),
|
||||||
|
# by default it will be None, if a binary location is in the config then it will use that
|
||||||
|
desired_capabilities=cap,
|
||||||
|
# an attempt at stopping selenium from printing a pile of garbage to the console.
|
||||||
|
service_log_path=os.path.devnull
|
||||||
|
)
|
||||||
return driver
|
return driver
|
||||||
|
|
||||||
|
|
||||||
def status_select(driver, url, status='hide'):
|
|
||||||
'''
|
|
||||||
For now it doesnt do what its name suggests,
|
|
||||||
I have planned to add a status reporter of the http response code.
|
|
||||||
This part of the code is not removed because it is part of its core.
|
|
||||||
Treat it like it isnt here.
|
|
||||||
'''
|
|
||||||
try:
|
|
||||||
if status == 'hide':
|
|
||||||
driver.get(url)
|
|
||||||
elif status == 'show':
|
|
||||||
r = requests.head(url)
|
|
||||||
if r.status_code == 503:
|
|
||||||
raise RuntimeError("This website's sevice is unavailable or has cloudflare on.")
|
|
||||||
driver.get(url)
|
|
||||||
return r.status_code
|
|
||||||
else:
|
|
||||||
driver.get(url)
|
|
||||||
except requests.ConnectionError:
|
|
||||||
raise RuntimeError("Failed to establish a connection using the requests library.")
|
|
||||||
|
|
||||||
|
|
||||||
def cloudflare_wait(driver):
|
def cloudflare_wait(driver):
|
||||||
'''
|
'''
|
||||||
It waits until cloudflare has gone away before doing any further actions.
|
It waits until cloudflare has gone away before doing any further actions.
|
||||||
The way it works is by getting the title of the page
|
The way it works is by getting the title of the page
|
||||||
and as long as it is "Just a moment..." it will keep waiting.
|
and as long as it is "Just a moment..." it will keep waiting.
|
||||||
This part of the code won't make the code execute slower
|
This part of the code won't make the code execute slower
|
||||||
if the target website has not a Cloudflare redirection.
|
if the target website has no Cloudflare redirection.
|
||||||
At most it will sleep 1 second as a precaution.
|
At most it will sleep 1 second as a precaution.
|
||||||
Also, i have made it time out after 30 seconds, useful if the target website is not responsive
|
Also, I have made it time out after 50 seconds, useful if the target website is not responsive
|
||||||
and to stop it from running infinitely.
|
and to stop it from running infinitely.
|
||||||
'''
|
'''
|
||||||
abort_after = 30
|
abort_after = 50 # seconds
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
title = driver.title # title = "Just a moment..."
|
title = driver.title # title = "Just a moment..."
|
||||||
while title == "Just a moment...":
|
while "Just a moment" in title:
|
||||||
time.sleep(0.25)
|
time.sleep(0.35)
|
||||||
delta = time.time() - start
|
delta = time.time() - start
|
||||||
if delta >= abort_after:
|
if delta >= abort_after:
|
||||||
logger.error(f'Timeout:\nCouldnt bypass cloudflare. \
|
logger.error(f'Timeout:\tCouldnt bypass cloudflare. \
|
||||||
See the screenshot for more info:\n{get_data_dir()}/screenshot.png')
|
See the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
||||||
|
return 1
|
||||||
title = driver.title
|
title = driver.title
|
||||||
if not title == "Just a moment...":
|
if not "Just a moment" in title:
|
||||||
break
|
break
|
||||||
time.sleep(1) # This is necessary to make sure everything has loaded fine.
|
time.sleep(2) # This is necessary to make sure everything has loaded fine.
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
|
def request(request_type, url, **kwargs): # Headers not yet supported , headers={}
|
||||||
params = kwargs.get('params', {})
|
params = kwargs.get('params', {})
|
||||||
new_url = add_url_params(url, params)
|
|
||||||
driver = driver_select()
|
url = url if not params else url + '?' + urlencode(params)
|
||||||
status = status_select(driver, new_url, 'hide')
|
cached_data = check_cache(url)
|
||||||
try:
|
|
||||||
cloudflare_wait(driver)
|
if cached_data:
|
||||||
user_agent = driver.execute_script("return navigator.userAgent;") # dirty, but allows for all sorts of things above
|
text = cached_data['data']
|
||||||
cookies = driver.get_cookies()
|
user_agent = cached_data['user_agent']
|
||||||
text = driver.page_source
|
request_type = cached_data['method']
|
||||||
driver.close()
|
cookies = cached_data['cookies']
|
||||||
return SeleResponse(url, request_type, text, cookies, user_agent)
|
return SeleResponse(url, request_type, text, cookies, user_agent)
|
||||||
except:
|
|
||||||
driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
|
else:
|
||||||
driver.close()
|
driver = driver_select()
|
||||||
logger.error(f'There was a problem getting the page: {new_url}. \
|
driver.get(url)
|
||||||
See the screenshot for more info:\n{get_data_dir()}/screenshot.png')
|
|
||||||
|
try:
|
||||||
|
exit_code = cloudflare_wait(driver)
|
||||||
|
user_agent = driver.execute_script("return navigator.userAgent;")
|
||||||
|
cookies = driver.get_cookies()
|
||||||
|
text = driver.page_source
|
||||||
|
driver.close()
|
||||||
|
|
||||||
|
if exit_code != 0:
|
||||||
|
return SeleResponse(url, request_type, None, cookies, user_agent)
|
||||||
|
|
||||||
|
seleResponse = SeleResponse(
|
||||||
|
url, request_type,
|
||||||
|
text, cookies,
|
||||||
|
user_agent
|
||||||
|
)
|
||||||
|
|
||||||
|
cache_request(seleResponse)
|
||||||
|
return seleResponse
|
||||||
|
|
||||||
|
except:
|
||||||
|
driver.save_screenshot(f"{get_data_dir()}/screenshot.png")
|
||||||
|
driver.close()
|
||||||
|
logger.error(f'There was a problem getting the page: {url}.' +
|
||||||
|
'\nSee the screenshot for more info:\t{get_data_dir()}/screenshot.png')
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
class SeleResponse:
|
class SeleResponse:
|
||||||
|
@ -224,5 +323,5 @@ class SeleResponse:
|
||||||
return self.text
|
return self.text
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES {} USERAGENT {}>'.format(
|
return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
|
||||||
self.url, self.method, self.text, self.cookies, self.user_agent)
|
self.url, self.method, self.text, self.cookies, self.user_agent)
|
||||||
|
|
Loading…
Reference in New Issue