gallery-dl/gallery_dl/cloudflare.py

202 lines
6.3 KiB
Python
Raw Normal View History

2015-11-07 02:30:08 +01:00
# -*- coding: utf-8 -*-
2020-02-09 18:51:29 +01:00
# Copyright 2015-2020 Mike Fährmann
2015-11-07 02:30:08 +01:00
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.
2015-11-07 13:06:23 +01:00
"""Methods to access sites behind Cloudflare protection"""
2015-11-07 02:30:08 +01:00
import time
import operator
import collections
2015-11-07 13:06:23 +01:00
import urllib.parse
from xml.etree import ElementTree
2020-04-24 22:47:27 +02:00
from . import text
2019-03-14 16:14:29 +01:00
from .cache import memcache
2015-11-07 02:30:08 +01:00
2017-01-30 19:40:15 +01:00
def is_challenge(response):
    """Tell whether 'response' is a Cloudflare JS challenge page

    True only for a 503 response whose 'Server' header starts with
    "cloudflare" and whose body contains the 'jschl-answer' marker.
    """
    if response.status_code != 503:
        return False
    server = response.headers.get("Server", "")
    if not server.startswith("cloudflare"):
        return False
    return b"jschl-answer" in response.content
2017-01-30 19:40:15 +01:00
def is_captcha(response):
    """Tell whether 'response' is a Cloudflare CAPTCHA page

    True only for a 403 response whose body contains the
    'name="captcha-bypass"' marker.
    """
    if response.status_code != 403:
        return False
    return b'name="captcha-bypass"' in response.content
def solve_challenge(session, response, kwargs):
    """Solve Cloudflare challenge and get cfclearance cookie

    Arguments:
        session:  object whose 'request(method, url, **kwargs)' method is
                  used for the trace requests and the challenge POST
        response: the challenge response; its 'url', 'text' and 'cookies'
                  attributes are read
        kwargs:   unused; kept so existing callers keep working

    Returns a 3-tuple '(response, domain, cookies)':
      - on success: the response to the challenge POST, the domain of the
        first cookie that was set, and a dict of cookie names to values
        (including '__cfduid' taken from the original 'response')
      - on failure: 'domain' and 'cookies' are both None; the first element
        is the original 'response' if the challenge could not be prepared,
        or the POST response if no cookies were received
    """
    import logging
    log = logging.getLogger("cloudflare")

    parsed = urllib.parse.urlsplit(response.url)
    root = parsed.scheme + "://" + parsed.netloc
    page = response.text

    cf_kwargs = {}
    headers = cf_kwargs["headers"] = collections.OrderedDict()
    params = cf_kwargs["data"] = collections.OrderedDict()
    headers["Referer"] = response.url

    # Collect the <input> elements of the challenge form.
    # A missing form (nothing to concatenate -> TypeError) or markup that
    # is not well-formed XML is reported through the same failure tuple
    # as every other error in this function instead of raising.
    try:
        form = text.extract(page, 'id="challenge-form"', '</form>')[0]
        inputs = ElementTree.fromstring(
            "<f>" + form + "</f>").findall("input")
    except (TypeError, ElementTree.ParseError) as exc:
        log.debug("Unable to parse challenge form: %s", exc)
        return response, None, None

    # build the POST data in the same order as the form fields
    for element in inputs:
        name = element.attrib.get("name")
        if not name:
            continue
        if name == "jschl_answer":
            try:
                value = solve_js_challenge(page, parsed.netloc)
            except Exception:
                return response, None, None
        else:
            value = element.attrib.get("value")
        params[name] = value

    # Best-effort requests for the two 'trace' images;
    # a failure here must not abort solving the challenge itself.
    try:
        trace_params = {"ray": text.extract(page, '?ray=', '"')[0]}

        url = root + "/cdn-cgi/images/trace/jschal/nojs/transparent.gif"
        session.request("GET", url, params=trace_params)

        url = root + "/cdn-cgi/images/trace/jschal/js/nocookie/transparent.gif"
        session.request("GET", url, params=trace_params)
    except Exception as exc:
        log.debug("Trace requests failed: %s", exc)

    # NOTE(review): presumably the answer is rejected when it gets
    # submitted too early -- confirm before changing this delay
    time.sleep(4)

    url = root + text.unescape(text.extract(page, 'action="', '"')[0])
    cf_response = session.request("POST", url, **cf_kwargs)

    # when the POST got redirected, inspect the first response
    # of the redirect chain rather than the final one
    if cf_response.history:
        initial_response = cf_response.history[0]
    else:
        initial_response = cf_response

    cookies = {
        cookie.name: cookie.value
        for cookie in initial_response.cookies
    }

    if not cookies:
        log.debug("Headers:\n%s", initial_response.headers)
        log.debug("Content:\n%s", initial_response.text)
        return cf_response, None, None

    domain = next(iter(initial_response.cookies)).domain
    cookies["__cfduid"] = response.cookies.get("__cfduid", "")
    return cf_response, domain, cookies
2015-11-07 02:30:08 +01:00
2017-01-30 19:40:15 +01:00
def solve_js_challenge(page, netloc):
    """Compute the 'jschl_answer' value for the JS challenge in 'page'

    'netloc' is the hostname the challenge was served for.
    Returns the value built by the final 'a.value' statement (a string
    when that statement uses '.toFixed(', a number otherwise),
    or None if no such statement is found.
    """

    # find the challenge variable and its initial expression
    # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
    fields, end = text.extract_all(page, (
        ('var', ',f, ', '='),
        ('key', '"', '"'),
        ('expr', ':', '}'),
    ))
    varname = "{}.{}".format(fields["var"], fields["key"])
    op_index = len(varname)

    k = text.extract(page, "k = '", "'")[0]

    # the initial expression provides the starting value
    answer = evaluate_expression(fields["expr"], page, netloc)

    # iterate over all remaining statements
    # and fold their values into 'answer'
    script = text.extract(
        page, "'challenge-form');", "f.submit();", end)[0]
    statements = script.split(";")

    for stmt in statements[1:]:

        if stmt.startswith(varname):
            # the character right after the variable name
            # selects the arithmetic function (+/-/*)
            apply = OPERATORS[stmt[op_index]]
            # everything after '<operator>=' is the operand expression
            operand = evaluate_expression(
                stmt[op_index+2:], page, netloc, k)
            answer = apply(answer, operand)
            continue

        if stmt.startswith("a.value"):
            if "t.length)" in stmt:
                # add length of hostname
                answer += len(netloc)
            if ".toFixed(" in stmt:
                # trim answer to 10 decimal places
                answer = "{:.10f}".format(answer)
            return answer

        if stmt.startswith("k+="):
            k += str(evaluate_expression(stmt[3:], page, netloc))

    # no 'a.value' statement was encountered
    return None
2017-01-30 19:40:15 +01:00
def evaluate_expression(expr, page, netloc, k=""):
    """Return the numeric value of one Javascript challenge expression

    'page' and 'k' are only needed for expressions that refer to the
    content of the HTML element with ID 'k'; 'netloc' is needed for
    expressions that index into the hostname.
    """

    if expr.startswith("function(p)"):
        # stands for 'eval(eval("document.getElementById(k).innerHTML"))':
        # look up the HTML element with ID k and evaluate its content
        element = text.extract(page, 'id="'+k+'"', '<')[0]
        inner = element.partition(">")[2]
        return evaluate_expression(inner, page, netloc)

    if "/" in expr:
        # a fraction: evaluate numerator and denominator on their own
        # and return the quotient
        numerator, _, denominator = expr.partition("/")
        return (evaluate_expression(numerator, page, netloc) /
                evaluate_expression(denominator, page, netloc))

    if "function(p)" in expr:
        # '<expression>function(p){...}<argument>':
        # the argument is used as index into 'netloc' and the code point
        # of that character gets added to the leading expression's value
        prefix, _, code = expr.partition("function(p)")
        base = evaluate_expression(prefix, page, netloc)
        argument = code[code.index("}")+1:]
        position = evaluate_expression(argument, page, netloc)
        return base + ord(netloc[int(position)])

    # every ')+('-separated group yields one number;
    # these numbers are concatenated as text
    # and the resulting digit string is converted to an integer
    digits = []
    for group in expr.strip("+()").split(")+("):
        total = 0
        for term in group.split("+"):
            left, minus, right = term.partition("-")
            if minus:
                total += VALUES[left] - VALUES[right]
            else:
                total += VALUES[term]
        digits.append(str(total))
    return int("".join(digits))
2015-11-07 02:30:08 +01:00
2017-01-30 19:40:15 +01:00
# Arithmetic functions for the operator characters (+/-/*)
# that can follow the challenge variable in a statement
OPERATORS = {"+": operator.add, "-": operator.sub, "*": operator.mul}
2020-05-01 23:35:43 +02:00
# Numeric value of each token that is left over after an expression
# has been split at its '+' and '-' characters
VALUES = {
    # tokens counting as 0
    "": 0,
    # tokens counting as 1 (and '[]' as 0), in their original order
    "!": 1,
    "[]": 0,
    "!![]": 1,
    # same token with leftover parentheses still attached
    "(!![]": 1,
    "(!![])": 1,
}
2019-03-14 16:14:29 +01:00
@memcache(keyarg=0)
def cookies(category):
    """Return None, i.e. no known Cloudflare cookies for 'category'"""
    # NOTE(review): presumably 'memcache' stores one value per 'category'
    # (keyarg=0) which other code can replace -- confirm in '.cache'
    return None