2015-11-07 02:30:08 +01:00
|
|
|
# -*- coding: utf-8 -*-
|
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
# Copyright 2015-2019 Mike Fährmann
|
2015-11-07 02:30:08 +01:00
|
|
|
#
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
# it under the terms of the GNU General Public License version 2 as
|
|
|
|
# published by the Free Software Foundation.
|
|
|
|
|
2015-11-07 13:06:23 +01:00
|
|
|
"""Methods to access sites behind Cloudflare protection"""
|
|
|
|
|
2018-04-19 21:32:10 +02:00
|
|
|
import re
|
2015-11-07 02:30:08 +01:00
|
|
|
import time
|
|
|
|
import operator
|
2019-04-09 10:52:27 +02:00
|
|
|
import collections
|
2015-11-07 13:06:23 +01:00
|
|
|
import urllib.parse
|
2019-04-01 15:14:59 +02:00
|
|
|
from . import text, exception
|
2019-03-14 16:14:29 +01:00
|
|
|
from .cache import memcache
|
2015-11-07 02:30:08 +01:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
def is_challenge(response):
    """Return True if 'response' is a Cloudflare JS challenge page

    All three conditions must hold: HTTP status 503, a 'Server' header
    starting with "cloudflare", and the marker 'jschl-answer' in the body.
    """
    if response.status_code != 503:
        return False
    server = response.headers.get("Server", "")
    if not server.startswith("cloudflare"):
        return False
    # only look at the body once the cheap header checks have passed
    return b"jschl-answer" in response.content
|
2016-11-20 18:05:49 +01:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2019-04-01 15:14:59 +02:00
|
|
|
def is_captcha(response):
    """Return True if 'response' is a Cloudflare CAPTCHA page

    That is: HTTP status 403 and a 'name="captcha-bypass"' element
    somewhere in the response body.
    """
    if response.status_code != 403:
        return False
    return b'name="captcha-bypass"' in response.content
|
|
|
|
|
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
def solve_challenge(session, response, kwargs):
    """Solve Cloudflare challenge and get cfclearance cookie

    Reads the challenge form out of 'response', computes the
    'jschl_answer' value and submits the form with a POST request
    through 'session' (redirects are not followed).
    'kwargs' is accepted but not used by this function.

    Returns a '(cf_response, domain, cookies)' tuple: the response to the
    POST request, the domain of the first cookie it set, and a dict of
    all cookies it set plus the '__cfduid' cookie of 'response'.

    Raises exception.StopExtraction if the POST response sets no cookies.
    """
    url_parts = urllib.parse.urlsplit(response.url)
    origin = url_parts.scheme + "://" + url_parts.netloc
    html = response.text

    # form target is given relative to the site root
    target = origin + text.extract(html, 'action="', '"')[0]

    # form fields, inserted in a fixed order into an ordered mapping
    form = collections.OrderedDict()
    form["r"] = text.extract(html, 'name="r" value="', '"')[0]
    form["jschl_vc"] = text.extract(html, 'name="jschl_vc" value="', '"')[0]
    form["pass"] = text.extract(html, 'name="pass" value="', '"')[0]
    form["jschl_answer"] = solve_js_challenge(html, url_parts.netloc)

    request_headers = collections.OrderedDict()
    request_headers["Referer"] = response.url

    # NOTE(review): presumably mirrors the delay the challenge page
    # imposes before submitting the form -- confirm
    time.sleep(4)

    cf_response = session.request(
        "POST", target,
        headers=request_headers,
        data=form,
        allow_redirects=False,
    )

    cookies = {}
    for jar_cookie in cf_response.cookies:
        cookies[jar_cookie.name] = jar_cookie.value

    if not cookies:
        # no cookies were set: report the failure and abort
        import logging
        logger = logging.getLogger("cloudflare")
        if is_captcha(cf_response):
            rtype = "CAPTCHA"
        else:
            rtype = "Unexpected"
        logger.error("%s response", rtype)
        logger.debug("Headers:\n%s", cf_response.headers)
        logger.debug("Content:\n%s", cf_response.text)
        raise exception.StopExtraction()

    first_cookie = next(iter(cf_response.cookies))
    cookies["__cfduid"] = response.cookies.get("__cfduid", "")
    return cf_response, first_cookie.domain, cookies
|
2015-11-07 02:30:08 +01:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
def solve_js_challenge(page, netloc):
    """Evaluate JS challenge in 'page' to get 'jschl_answer' value

    'page' is the HTML text of the challenge page and 'netloc' the
    hostname it was served from.  The result is the accumulated number,
    or a string with 10 decimal places if the final 'a.value' statement
    uses '.toFixed('.  Nothing is returned explicitly if no 'a.value'
    statement is found.
    """
    # build the name of the challenge variable
    # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
    fields, offset = text.extract_all(page, (
        ("var", ",f, ", "="),
        ("key", '"', '"'),
        ("expr", ":", "}"),
    ))
    prefix = "%s.%s" % (fields["var"], fields["key"])
    prefix_len = len(prefix)

    # start with the value the variable is initialized with
    solution = evaluate_expression(fields["expr"], page, netloc)

    # walk through the statements following the initialization
    # and fold each of them into 'solution'
    statements = text.extract(
        page, "'challenge-form');", "f.submit();", offset)[0]
    for statement in statements.split(";")[1:]:

        if statement.startswith(prefix):
            # the character right after the variable name is the
            # arithmetic operator (+/-/*); the operand starts after the
            # following '='
            combine = OPERATORS[statement[prefix_len]]
            operand = evaluate_expression(
                statement[prefix_len+2:], page, netloc)
            solution = combine(solution, operand)
            continue

        if not statement.startswith("a.value"):
            continue

        # final statement: assigns the answer to the form field
        if "t.length)" in statement:
            # add length of hostname
            solution += len(netloc)
        if ".toFixed(" in statement:
            # trim solution to 10 decimal places
            solution = "{:.10f}".format(solution)
        return solution
|
2015-11-07 02:30:08 +01:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2019-04-01 15:14:59 +02:00
|
|
|
def evaluate_expression(expr, page, netloc, *,
                        split_re=re.compile(r"[(+]+([^)]*)\)")):
    """Evaluate a single Javascript expression for the challenge

    'expr' is the expression text, 'page' the challenge page it came
    from and 'netloc' the hostname.  'split_re' is the precompiled
    pattern used to pick out parenthesized subexpressions.
    Returns an int, or a float if 'expr' contains a division.
    """
    if expr.startswith("function(p)"):
        # the expression refers to an HTML element with ID k:
        # 'eval(eval("document.getElementById(k).innerHTML"))'
        # look up that element and evaluate its content instead
        element_id = text.extract(page, "k = '", "'")[0]
        element = text.extract(page, 'id="'+element_id+'"', '<')[0]
        inner = element.partition(">")[2]
        return evaluate_expression(inner, page, netloc)

    if "/" in expr:
        # fraction: evaluate both sides of the first '/' on their own
        # (numerator first) and divide the results
        numerator, _, denominator = expr.partition("/")
        return (evaluate_expression(numerator, page, netloc) /
                evaluate_expression(denominator, page, netloc))

    if "function(p)" in expr:
        # '<initial expression> function(p){...}<argument>'
        head, _, tail = expr.partition("function(p)")
        base = evaluate_expression(head, page, netloc)
        # whatever follows the function body is its argument;
        # its value is used as index into 'netloc'
        argument = tail[tail.index("}")+1:]
        position = int(evaluate_expression(argument, page, netloc))
        return base + ord(netloc[position])

    # plain expression: every parenthesized group yields one number;
    # these numbers are concatenated as text and parsed as a whole
    groups = split_re.findall(expr) or (expr,)
    pieces = [
        str(sum(VALUES[part] for part in group.split("[]")))
        for group in groups
    ]
    return int("".join(pieces))
|
2015-11-07 02:30:08 +01:00
|
|
|
|
2017-01-30 19:40:15 +01:00
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
# Maps the operator character of a challenge statement
# to the function performing that arithmetic operation.
OPERATORS = {
    "+": operator.add,
    "-": operator.sub,
    "*": operator.mul,
}
|
|
|
|
|
2019-03-10 15:31:33 +01:00
|
|
|
# Numeric value of each fragment that remains
# after splitting a subexpression on "[]".
VALUES = {
    # fragments that add nothing to the sum
    "": 0,
    "+": 0,
    # fragments that add one to the sum
    "!+": 1,
    "!!": 1,
    "+!!": 1,
}
|
2019-03-14 16:14:29 +01:00
|
|
|
|
|
|
|
|
|
|
|
@memcache(keyarg=0)
def cookies(category):
    """Cached cookie lookup for 'category'; the function itself yields None

    NOTE(review): presumably the cache entry for a category gets filled in
    from outside through the memcache API -- confirm against '.cache'.
    """
    return None
|