
328 lines
9.8 KiB
Raw Normal View History

from selenium.webdriver.remote.remote_connection import LOGGER as serverLogger
from anime_downloader.const import get_random_header
from urllib.parse import urlencode
from selenium import webdriver
from sys import platform
2020-09-02 05:28:04 -07:00
import tempfile
import logging
import click
import time
import json
import os
def open_config():
    """
    Return the ``Config`` object exported by ``anime_downloader.config``.

    The import happens inside the function, so it only runs when this
    function is called.
    NOTE(review): presumably deferred to avoid a circular import - confirm.
    """
    from anime_downloader import config as _config_module
    return _config_module.Config
# Switch consulted by cache_request() and check_cache(); caching is off
# while this is falsy.
cache = False
# NOTE(review): `serverLogger` is imported at the top of the file but is not
# used anywhere in the visible code - confirm whether a call that lowers
# selenium's log level is missing here.
logger = logging.getLogger(__name__)
# Folder holding the json file with cached selenium responses.
TEMP_FOLDER = os.path.join(tempfile.gettempdir(), 'AnimeDL-SeleniumCache')
data = open_config()

# Create the cache folder up front. exist_ok avoids the check-then-create
# race of testing os.path.isdir() first.
os.makedirs(TEMP_FOLDER, exist_ok=True)
def get_data_dir():
    """
    Gets the folder directory selescrape will store data,
    such as cookies or browser extensions and logs.

    Returns:
        str: the ``data`` sub-folder of click's application directory for
        ``'anime downloader'``. The folder is not created by this function.
    """
    APP_NAME = 'anime downloader'
    return os.path.join(click.get_app_dir(APP_NAME), 'data')
def get_browser_config():
    """
    Decides what browser selescrape will use.

    A platform default is picked first; the ``selescrape_browser`` config
    entry overrides it when it names a supported browser.

    Returns:
        str: ``'chrome'`` or ``'firefox'``.
    """
    os_browser = {  # maps os to a browser
        'linux': 'firefox',
        'darwin': 'chrome',
        'win32': 'chrome',
    }
    # Take the first matching platform prefix and stop there. Assigning the
    # fallback inside the loop would let a later, non-matching entry
    # overwrite an earlier match (e.g. linux ending up with chrome).
    browser = next(
        (name for prefix, name in os_browser.items()
         if platform.startswith(prefix)),
        'chrome',  # fallback for platforms that are not listed
    )

    value = data['dl']['selescrape_browser']
    value = value.lower() if value else value

    if value in ('chrome', 'firefox'):
        browser = value

    return browser
def get_browser_executable():
    """
    Return the browser executable path from the config.

    Returns:
        The ``selescrape_browser_executable_path`` config value exactly as
        configured, or None when it is empty/unset.
    """
    value = data['dl']['selescrape_browser_executable_path']
    # The path is returned untouched: lowercasing it would point at a
    # different (usually non-existent) file on case-sensitive filesystems.
    return value if value else None
def get_driver_binary():
    """
    Return the ``selescrape_driver_binary_path`` config value,
    or None when it is empty/unset.
    """
    configured_path = data['dl']['selescrape_driver_binary_path']
    if not configured_path:
        return None
    return configured_path
def cache_request(sele_response):
    """
    This function saves the response from a Selenium request in a json.
    It uses timestamps so that it can know if the cache has expired or not.

    Nothing is written while the module-level ``cache`` flag is falsy.

    Args:
        sele_response: object whose attributes include ``url``, ``text``,
            ``method``, ``cookies`` and ``user_agent``.
    """
    if not cache:
        return

    cache_file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')

    tmp_cache = {}
    if os.path.isfile(cache_file):
        try:
            with open(cache_file, 'r') as f:
                tmp_cache = json.load(f)
        except ValueError:
            # json.JSONDecodeError is a ValueError: a truncated or corrupt
            # cache file is discarded instead of aborting the request.
            tmp_cache = {}

    # Named `fields` so the module-level `data` config is not shadowed.
    fields = vars(sele_response)
    url = fields['url']
    # Entries are keyed by the url without a single trailing slash.
    url = (url[:-1] if url and url[-1] == '/' else url)

    tmp_cache[url] = {
        'data': fields['text'],
        # Despite the key name this is the time the entry was stored;
        # check_cache() compares it with the current time.
        'expiry': time.time(),
        'method': fields['method'],
        'cookies': fields['cookies'],
        'user_agent': fields['user_agent'],
    }

    with open(cache_file, 'w') as f:
        json.dump(tmp_cache, f, indent=4)
def check_cache(url):
    """
    This function checks if the cache file exists,
    if it exists then it will read the file
    And it will verify if the cache is less than or equal to 30 mins old
    If it is, it will return it as it is.
    If it isn't, it will delete the expired cache from the file and return None
    If the file doesn't exist at all it will return None

    None is also returned while the module-level ``cache`` flag is falsy,
    when the url has no entry, or when the cache file cannot be parsed.

    Args:
        url: address to look up; a single trailing slash is ignored.
    """
    if not cache:
        return None

    cache_file = os.path.join(TEMP_FOLDER, 'selenium_cached_requests.json')
    if not os.path.isfile(cache_file):
        return None

    try:
        with open(cache_file, 'r') as f:
            cached = json.load(f)
    except ValueError:
        # Unreadable cache (json.JSONDecodeError is a ValueError): treat it
        # as a miss so the page is simply fetched again.
        return None

    # Entries are stored without a trailing slash, so normalise the url
    # the same way before looking it up.
    clean_url = (url[:-1] if url and url[-1] == '/' else url)
    entry = cached.get(clean_url)
    if entry is None:
        return None

    # 'expiry' holds the time the entry was stored; 1800 s = 30 minutes.
    if time.time() - entry['expiry'] <= 1800:
        return entry

    # Expired: drop the entry and persist the pruned cache.
    cached.pop(clean_url, None)
    with open(cache_file, 'w') as f:
        json.dump(cached, f, indent=4)
    return None
def driver_select():
    """
    This configures what each browser should do
    and returns the corresponding driver.

    Returns:
        A headless ``webdriver.Firefox`` or ``webdriver.Chrome`` instance,
        depending on ``get_browser_config()``.

    Raises:
        ValueError: if the selected browser is neither firefox nor chrome.
    """
    browser = get_browser_config()
    data_dir = get_data_dir()
    executable = get_browser_executable()
    binary = get_driver_binary()

    if browser == 'firefox':
        firefox_options = webdriver.FirefoxOptions()
        ops = [
            "--width=1920", "--height=1080",
            "-headless", "--log fatal"
        ]
        for option in ops:
            firefox_options.add_argument(option)

        firefox_profile = webdriver.FirefoxProfile()
        firefox_profile.set_preference(
            "general.useragent.override", get_random_header()['user-agent']
        )

        return webdriver.Firefox(
            # sets user-agent
            firefox_profile=firefox_profile,
            # sets various firefox settings
            options=firefox_options,
            # by default it will be None, if a binary location is in the config then it will use that
            firefox_binary=executable if executable else None,
            # by default it will be "geckodriver", if a geckodriver location is in the config then it will use that
            executable_path=(binary if binary else "geckodriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull,
        )

    if browser == 'chrome':
        profile_path = os.path.join(data_dir, 'Selenium_chromium')
        chrome_options = webdriver.ChromeOptions()
        ops = [
            "--headless", "--disable-gpu", '--log-level=OFF',
            f"--user-data-dir={profile_path}", "--no-sandbox",
            "--window-size=1920,1080", f"user-agent={get_random_header()['user-agent']}"  # noqa
        ]
        for option in ops:
            chrome_options.add_argument(option)

        if executable:
            # The browser binary is configured on the options object.
            # Writing a 'binary_location' key into DesiredCapabilities.CHROME
            # would mutate a dict shared by every user of that class
            # attribute.
            chrome_options.binary_location = executable

        return webdriver.Chrome(
            # sets user-agent, binary location and various chrome settings
            options=chrome_options,
            # by default it will be "chromedriver", if a chromedriver location is in the config then it will use that
            executable_path=(binary if binary else "chromedriver"),
            # an attempt at stopping selenium from printing a pile of garbage to the console.
            service_log_path=os.path.devnull,
        )

    # Fail loudly instead of hitting an unbound `driver` variable.
    raise ValueError(f'Unsupported selescrape browser: {browser!r}')
def cloudflare_wait(driver, abort_after=50, poll_interval=0.35, settle_time=2):
    """
    It waits until cloudflare has gone away before doing any further actions.

    The way it works is by getting the title of the page
    and as long as it contains "Just a moment" it will keep waiting.

    If the target website has no Cloudflare redirection the only delay is
    the final ``settle_time`` sleep. The wait is abandoned after
    ``abort_after`` seconds, useful if the target website is not responsive
    and to stop it from running infinitely.

    Args:
        driver: object exposing the current page title as ``driver.title``.
        abort_after: seconds to wait for the challenge page to go away.
        poll_interval: seconds to sleep between two title checks.
        settle_time: seconds to sleep once the challenge page is gone.

    Returns:
        int: 0 when the page got past cloudflare, 1 on timeout.
    """
    # monotonic() is unaffected by system clock adjustments.
    deadline = time.monotonic() + abort_after

    while "Just a moment" in driver.title:
        if time.monotonic() >= deadline:
            logger.error(
                'Timeout:\tCouldnt bypass cloudflare. '
                f'See the screenshot for more info:\t{get_data_dir()}/screenshot.png'
            )
            return 1
        # Pause between polls instead of hammering the driver in a busy loop.
        time.sleep(poll_interval)

    time.sleep(settle_time)  # This is necessary to make sure everything has loaded fine.
    return 0
def _try_screenshot(driver, path):
    """Best-effort screenshot of the current page; failures are only logged."""
    try:
        os.makedirs(os.path.dirname(path), exist_ok=True)
        driver.save_screenshot(path)
    except Exception:
        # The screenshot is a debugging aid only and must never mask the
        # original problem.
        logger.debug('Could not save screenshot to %s', path, exc_info=True)


def request(request_type, url, **kwargs):  # Headers not yet supported , headers={}
    """
    Load ``url`` in a headless browser and return the page as a SeleResponse.

    Args:
        request_type: request method label stored on the response.
        url: address to load.
        **kwargs: only ``params`` (a mapping that is url-encoded and
            appended to ``url``) is read.

    Returns:
        SeleResponse: built from the cache when check_cache() has an entry,
        otherwise from the live page. Its text is None when cloudflare_wait()
        timed out.
        None: when loading the page raised an error.
    """
    params = kwargs.get('params', {})
    if params:
        # Extend an existing query string instead of adding a second '?'.
        separator = '&' if '?' in url else '?'
        url = url + separator + urlencode(params)

    cached_data = check_cache(url)
    if cached_data:
        return SeleResponse(
            url, cached_data['method'], cached_data['data'],
            cached_data['cookies'], cached_data['user_agent']
        )

    screenshot_path = f'{get_data_dir()}/screenshot.png'
    driver = driver_select()
    try:
        driver.get(url)
        exit_code = cloudflare_wait(driver)
        user_agent = driver.execute_script("return navigator.userAgent;")
        cookies = driver.get_cookies()
        text = driver.page_source

        if exit_code != 0:
            # cloudflare_wait() points the user at this screenshot.
            _try_screenshot(driver, screenshot_path)
            return SeleResponse(url, request_type, None, cookies, user_agent)

        seleResponse = SeleResponse(
            url, request_type,
            text, cookies,
            user_agent
        )
        cache_request(seleResponse)
        return seleResponse
    except Exception:
        _try_screenshot(driver, screenshot_path)
        # Both parts are f-strings so the screenshot path is interpolated.
        logger.error(f'There was a problem getting the page: {url}.' +
                     f'\nSee the screenshot for more info:\t{screenshot_path}')
        logger.debug('Selenium request failed', exc_info=True)
        return None
    finally:
        # Always shut the browser down so no driver process is leaked.
        driver.quit()
class SeleResponse:
    """
    Class for the selenium response.

    Attributes
    ----------
    url: string
        URL of the webpage.
    method: GET or POST
        Request type.
    text/content: string or None
        Webpage contents; both names hold the same value.
    cookies:
        Stored cookies from the website, exactly as passed in.
    user_agent: string
        User agent used on the webpage
    """

    def __init__(self, url, method, text, cookies, user_agent):
        self.url = url
        self.method = method
        self.text = text
        self.content = text
        self.cookies = cookies
        self.user_agent = user_agent

    def __str__(self):
        # __str__ must return a str; text is None for responses created
        # without page contents.
        return self.text if self.text is not None else ''

    def __repr__(self):
        return '<SeleResponse URL: {} METHOD: {} TEXT: {} COOKIES: {} USERAGENT: {}>'.format(
            self.url, self.method, self.text, self.cookies, self.user_agent)