593 lines
18 KiB
JavaScript
593 lines
18 KiB
JavaScript
'use strict';
|
|
|
|
const requestModule = require('request-promise');
|
|
const sandbox = require('./lib/sandbox');
|
|
const decodeEmails = require('./lib/email-decode.js');
|
|
const { getDefaultHeaders, caseless } = require('./lib/headers');
|
|
const brotli = require('./lib/brotli');
|
|
const { getChromeRuntimeMock } = require('./lib/chromeRuntime');
|
|
const { deprecate } = require('util');
|
|
const puppeteer = require('puppeteer');
|
|
|
|
const {
|
|
RequestError,
|
|
CaptchaError,
|
|
CloudflareError,
|
|
ParserError
|
|
} = require('./errors');
|
|
|
|
// Module-wide debug flag. It is exposed through the `debug` accessor that
// defaults() defines on every instance and is read by onCaptcha().
let debugging = false;
|
|
|
|
// Placeholder stored as the default Host header value; performRequest()
// replaces it with the real host of the request.
const HOST = Symbol('host');
|
|
|
|
// The public export is the root instance, built by invoking defaults() with
// request-promise as `this`.
module.exports = defaults.call(requestModule);
|
|
|
|
/**
 * Builds a cloudscraper instance whose requests are routed through
 * validateRequest() and performRequest().
 *
 * Called with `this === requestModule` it creates the root instance from the
 * built-in defaults below. Called on an existing instance it uses that
 * instance's `defaultParams` as the base instead.
 *
 * @param {object} [params] - Overrides, shallow-merged over the base defaults.
 * @returns {*} The value returned by `requestModule.defaults`, extended with
 *   `defaultParams`, a `debug` accessor and (root instance only) `defaults`.
 */
function defaults (params) {
  // isCloudScraper === !isRequestModule
  const isRequestModule = this === requestModule;

  const baseParams = (!isRequestModule && this.defaultParams) || {
    requester: requestModule,
    // Cookies should be enabled
    jar: requestModule.jar(),
    headers: getDefaultHeaders({ Host: HOST }),
    // Reduce Cloudflare's timeout to cloudflareMaxTimeout if it is excessive
    cloudflareMaxTimeout: 30000,
    // followAllRedirects - follow non-GET HTTP 3xx responses as redirects
    followAllRedirects: true,
    // Support only this max challenges in row. If CF returns more, throw an error
    challengesToSolve: 3,
    // Remove Cloudflare's email protection
    decodeEmails: false,
    // Support gzip encoded responses
    gzip: true,
    agentOptions: {
      // Removes a few problematic TLSv1.0 ciphers to avoid CAPTCHA
      sigalgs: 'ECDSA+SHA256'
      // ciphers: crypto.constants.defaultCipherList + ':!ECDHE+SHA:!AES128-SHA'
    }
  };

  // Object.assign requires at least nodejs v4, request only test/supports v6+
  const defaultParams = Object.assign({}, baseParams, params);

  const cloudscraper = requestModule.defaults
    .call(this, defaultParams, function (options) {
      validateRequest(options);
      return performRequest(options, true);
    });

  // There's no safety net here, any changes apply to all future requests
  // that are made with this instance and derived instances.
  cloudscraper.defaultParams = defaultParams;

  // Ensure this instance gets a copy of our custom defaults function
  // and afterwards, it will be copied over automatically.
  if (isRequestModule) {
    cloudscraper.defaults = defaults;
  }

  // Expose the debug option
  Object.defineProperty(cloudscraper, 'debug', {
    configurable: true,
    enumerable: true,
    set (value) {
      // Honor the assigned value. Previously every assignment, including
      // `debug = false`, switched debugging on.
      requestModule.debug = debugging = Boolean(value);
    },
    get () {
      return debugging;
    }
  });

  return cloudscraper;
}
|
|
|
|
/**
 * Prepares and type-checks the options of a single request.
 *
 * Remembers the caller's encoding in `options.realEncoding` (only if it is
 * not already present), forces `options.encoding` to null, and verifies the
 * `challengesToSolve`, `cloudflareMaxTimeout` and `requester` options.
 *
 * @param {object} options - Request options; mutated in place.
 * @throws {TypeError} When one of the checked options has an unexpected type.
 */
function validateRequest (options) {
  // Keep the first recorded realEncoding; later calls must not overwrite it.
  const alreadyRecorded = 'realEncoding' in options;

  if (!alreadyRecorded) {
    // `options.encoding || 'utf8'` would be wrong here: null is a valid
    // encoding, so only the absence of the key selects the default.
    options.realEncoding = 'encoding' in options ? options.encoding : 'utf8';
  }

  options.encoding = null;

  const { challengesToSolve, cloudflareMaxTimeout, requester } = options;

  if (isNaN(challengesToSolve)) {
    const received = 'got ' + typeof challengesToSolve + ' instead.';
    throw new TypeError('Expected `challengesToSolve` option to be a number, ' + received);
  }

  if (isNaN(cloudflareMaxTimeout)) {
    const received = 'got ' + typeof cloudflareMaxTimeout + ' instead.';
    throw new TypeError('Expected `cloudflareMaxTimeout` option to be a number, ' + received);
  }

  if (typeof requester === 'function') {
    return;
  }

  throw new TypeError('Expected `requester` option to be a function, got ' +
    typeof requester + ' instead.');
}
|
|
|
|
/**
 * Sends a request through `options.requester` and rewires its `error` and
 * `complete` events so that every outcome flows into onRequestResponse().
 *
 * The wrapper created in defaults() calls this with fresh options; the
 * challenge handlers call it again directly, reusing the same options object.
 *
 * @param {object} options - Request options, including `requester`.
 * @param {boolean} isFirstRequest - When true, the request's callback is
 *   stored on `options.callback` for use by all later stages.
 * @returns {object} The request object returned by the requester.
 * @throws {TypeError} When the request exposes no callback function.
 */
function performRequest (options, isFirstRequest) {
  // Expected to be the default export of either request or request-promise.
  const { requester: send } = options;

  // The request is always a ReadableStream/EventEmitter; when the requester
  // is request-promise it is thenable as well.
  const request = send(options);

  // The Host header is written here so that its case and order are preserved.
  const currentHost = request.getHeader('host');
  if (currentHost === HOST) {
    request.setHeader('host', request.uri.host);
  }

  // Without request-promise, a callback must have been supplied by the user.
  const callbackType = typeof request.callback;
  if (callbackType !== 'function') {
    throw new TypeError('Expected a callback function, got ' +
      callbackType + ' instead.');
  }

  // Only the first request's callback matters (user supplied or
  // request-promise's own); callbacks of follow-up requests are ignored.
  if (isFirstRequest) {
    options.callback = request.callback;
  }

  const forwardError = function (error) {
    onRequestResponse(options, error);
  };

  const forwardComplete = function (response, body) {
    onRequestResponse(options, null, response, body);
  };

  request.removeAllListeners('error').once('error', forwardError);
  request.removeAllListeners('complete').once('complete', forwardComplete);

  // Indicate that this is a cloudscraper request
  request.cloudscraper = true;
  return request;
}
|
|
|
|
/**
 * First stage of response handling: reports transport errors, classifies the
 * response, decodes Brotli bodies and routes to the next stage.
 *
 * Argument convention used throughout this file: options first where
 * possible, options always before response, body always after response.
 *
 * @param {object} options - Request options; `options.callback` receives the result.
 * @param {*} error - Error emitted by the request, if any.
 * @param {object} [response] - Response object; annotated with
 *   `responseStartTime`, `isCloudflare` and `isHTML`.
 * @param {*} [body] - Response body; a Buffer is expected because encoding is null.
 */
function onRequestResponse (options, error, response, body) {
  const callback = options.callback;

  // Encoding is null so the body should be a buffer object.
  const hasBody = Boolean(body) && Boolean(body.toString);
  if (error || !hasBody) {
    // Pure request error (bad connection, wrong url, etc)
    return callback(new RequestError(error, options, response));
  }

  const lowerHeaders = caseless(response.headers);
  const serverPattern = /^(cloudflare|sucuri)/i;
  const htmlPattern = /text\/html/i;

  response.responseStartTime = Date.now();
  response.isCloudflare = serverPattern.test('' + lowerHeaders.server);
  response.isHTML = htmlPattern.test('' + lowerHeaders['content-type']);

  // Anything other than a buffer is a custom response body; pass it through.
  if (!Buffer.isBuffer(body)) {
    return callback(null, response, body);
  }

  const isBrotli = /\bbr\b/i.test('' + lowerHeaders['content-encoding']);

  if (isBrotli) {
    if (!brotli.isAvailable) {
      const cause = 'Received a Brotli compressed response. Please install brotli';
      return callback(new RequestError(cause, options, response));
    }

    try {
      body = brotli.decompress(body);
      response.body = body;
    } catch (decompressError) {
      return callback(new RequestError(decompressError, options, response));
    }

    // Request doesn't handle brotli and would've failed to parse JSON.
    if (options.json) {
      try {
        response.body = body = JSON.parse(body, response.request._jsonReviver);
        // A body that parses as JSON is not a challenge page.
        return callback(null, response, body);
      } catch (parseError) {
        // Request's debug will log the failure, no need to duplicate.
      }
    }
  }

  const needsCloudflareHandling = response.isCloudflare && response.isHTML;
  const nextStage = needsCloudflareHandling ? onCloudflareResponse : onRequestComplete;
  nextStage(options, response, body);
}
|
|
|
|
/**
 * Handles an HTML response served by Cloudflare/Sucuri: reports errors and
 * captchas, dispatches to the matching challenge solver, or completes the
 * request when the page is ordinary content.
 *
 * @param {object} options - Request options; `options.callback` receives errors.
 * @param {object} response - Response object; `statusCode` is inspected.
 * @param {Buffer} body - Raw response body.
 * @returns {*} The return value of the stage that handled the response, or
 *   undefined when the request simply completes.
 */
function onCloudflareResponse (options, response, body) {
  const callback = options.callback;

  // A non-200 Cloudflare response with an empty body (4xx-5xx) carries
  // nothing to parse, so it is reported by status code.
  if (response.statusCode !== 200 && body.length < 1) {
    return callback(new CloudflareError(response.statusCode, options, response));
  }

  const html = body.toString('utf8');

  try {
    validateResponse(options, response, html);
  } catch (validationError) {
    const userCanSolve = validationError instanceof CaptchaError &&
      typeof options.onCaptcha === 'function';

    // Without a user handler (or for any other error) the error is final.
    if (!userCanSolve) {
      return callback(validationError);
    }

    // Give users a chance to solve the reCAPTCHA via services such as anti-captcha.com
    return onCaptcha(options, response, html);
  }

  if (html.includes('a = document.getElementById(\'jschl-answer\');')) {
    return onChallenge(options, response, html);
  }

  const redirectMarkers = ['You are being redirected', 'sucuri_cloudproxy_js'];
  if (redirectMarkers.some((marker) => html.includes(marker))) {
    return onRedirectChallenge(options, response, html);
  }

  // 503 status is always a challenge
  if (response.statusCode === 503) {
    return onChallenge(options, response, html);
  }

  // Nothing suspicious was found; finish with the untouched buffer.
  onRequestComplete(options, response, body);
}
|
|
|
|
/**
 * Identifies which generation of captcha page a response body contains.
 *
 * @param {string} body - Response body as text.
 * @returns {string|boolean} 'ver2' for the newer page (after Dec 2019),
 *   'ver1' for the older page, or false when no captcha marker is found.
 */
function detectRecaptchaVersion (body) {
  // The newer marker is checked first; per the original note, the newer page
  // can also contain the fields the older check looks for.
  const hasVer2Token = /__cf_chl_captcha_tk__=(.*)/i.test(body);
  if (hasVer2Token) {
    return 'ver2';
  }

  // Older page (before Dec 2019)
  const hasVer1Marker = body.indexOf('why_captcha') !== -1 ||
    /cdn-cgi\/l\/chk_captcha/i.test(body);

  return hasVer1Marker ? 'ver1' : false;
}
|
|
|
|
/**
 * Inspects a Cloudflare HTML page for a captcha or an embedded error code.
 *
 * @param {object} options - Request options, forwarded to the error objects.
 * @param {object} response - Response object; `isCaptcha` is set to true when
 *   a captcha page is detected.
 * @param {string} body - Response body as text.
 * @returns {boolean} Always false when the page passes both checks.
 * @throws {CaptchaError} When detectRecaptchaVersion() recognizes a captcha page.
 * @throws {CloudflareError} When a `cf-error-code` element is present.
 */
function validateResponse (options, response, body) {
  // Captcha pages of either generation are reported the same way.
  if (detectRecaptchaVersion(body)) {
    // Convenience boolean
    response.isCaptcha = true;
    throw new CaptchaError('captcha', options, response);
  }

  // Looks for markup such as '<span class="cf-error-code">1006</span>'
  const errorCodeMatch = body.match(/<\w+\s+class="cf-error-code">(.*)<\/\w+>/i);

  if (!errorCodeMatch) {
    return false;
  }

  const [, rawCode] = errorCodeMatch;
  throw new CloudflareError(parseInt(rawCode), options, response);
}
|
|
|
|
/**
 * Solves a Cloudflare JavaScript challenge by loading the page in a headless
 * browser (puppeteer) and feeding the page's response for the original URL
 * back into the regular pipeline through onRequestResponse().
 *
 * Outcomes are always reported through `options.callback` or
 * onRequestResponse(), at most once per call; the returned promise carries
 * no result.
 *
 * @param {object} options - Request options. `challengesToSolve` is
 *   decremented for every attempt so the loop guard can trigger.
 * @param {object} response - The challenge response; `response.request.uri`
 *   and `response.request.headers` are read.
 * @param {string} body - Challenge page text (unused; kept for the shared
 *   handler signature).
 * @returns {Promise<*>} Resolves once navigation has started or failed.
 */
async function onChallenge (options, response, body) {
  const callback = options.callback;
  const uri = response.request.uri;

  if (options.challengesToSolve === 0) {
    const loopError = new CloudflareError('Cloudflare challenge loop', options, response);
    loopError.errorType = 4;

    return callback(loopError);
  }

  // Count this attempt. Without it the guard above could never fire and a
  // page that keeps serving challenges would spawn browsers indefinitely.
  options.challengesToSolve -= 1;

  let browser;
  // Set once an outcome has been reported, so it is reported only once.
  let settled = false;

  // Best-effort shutdown; a failing close must not mask the real outcome.
  const closeBrowser = async () => {
    if (!browser) {
      return;
    }

    try {
      await browser.close();
    } catch (closeError) {
      if (debugging) {
        console.warn('Failed to close the browser:', closeError);
      }
    }
  };

  // Reports a failure to the original caller instead of leaving a rejected
  // promise that nobody observes.
  const fail = async (cause) => {
    if (settled) {
      return;
    }

    settled = true;
    await closeBrowser();
    callback(new RequestError(cause, options, response));
  };

  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // getChromeRuntimeMock is shipped to the page as source text and
    // re-created there, because the function itself cannot cross into the
    // browser context.
    await page.evaluateOnNewDocument(
      args => {
        if (args && args.fns) {
          for (const fn of Object.keys(args.fns)) {
            eval(`var ${fn} = ${args.fns[fn]}`) // eslint-disable-line
          }
        }

        window.chrome = getChromeRuntimeMock(window);
      },
      {
        fns: {
          getChromeRuntimeMock: `${getChromeRuntimeMock.toString()}`
        }
      }
    );

    // Reuse the User-Agent of the original request, whatever its header case.
    const requestHeaders = response.request.headers;
    const uaKey = Object.keys(requestHeaders).find(key => key.toLowerCase() === 'user-agent');
    await page.setUserAgent(requestHeaders[uaKey] || 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.0 Safari/537.36');

    let isFirstRequest = true;

    await page.setRequestInterception(true);

    page.on('requestfinished', async (request) => {
      if (settled || request.url() !== uri.href) {
        return;
      }

      let solved;

      try {
        const res = request.response();
        const content = await res.buffer();
        const isInterstitial = content.includes('<title>Just a moment...</title>');

        // The first interstitial is skipped; any later response for the
        // same URL is delivered as-is.
        if (!isInterstitial || !isFirstRequest) {
          const headers = Object.assign({}, res.headers());
          // NOTE(review): assumes puppeteer hands back an already decoded
          // body; keeping content-encoding would make onRequestResponse()
          // try to decompress it a second time. Confirm against puppeteer.
          delete headers['content-encoding'];

          solved = {
            headers,
            body: content,
            statusCode: res.status(),
            // Later stages read response.request (uri, headers).
            request: response.request
          };
        }
      } catch (error) {
        return fail(error);
      }

      // NOTE(review): the 'youtube' exclusion is kept from the original
      // code; its purpose is not visible here.
      if (request.isNavigationRequest() && !request.url().includes('youtube')) {
        isFirstRequest = false;
      }

      if (solved && !settled) {
        settled = true;
        await closeBrowser();
        onRequestResponse(options, null, solved, solved.body);
      }
    });

    page.on('request', async (request) => {
      try {
        await request.continue();
      } catch (error) {
        // Continuing can fail once the browser has been closed; that is not
        // an outcome of the challenge itself, so it is only logged.
        if (debugging) {
          console.warn('Failed to continue an intercepted request:', error);
        }
      }
    });

    // NOTE(review): there is no overall timeout. If the challenge is never
    // passed, no response is delivered and the browser stays open.
    await page.goto(uri.href);
  } catch (error) {
    await fail(error);
  }
}
|
|
|
|
/**
 * Parses the reCAPTCHA challenge page and hands control over to the
 * user-supplied `options.onCaptcha` handler.
 *
 * The parsed data is exposed as `response.captcha` (`siteKey`, `uri`, `form`,
 * `version`, `submit`, and for ver2 pages `formMethod`/`formActionUri`). The
 * handler is expected to fill `captcha.form['g-recaptcha-response']` and then
 * either call `captcha.submit([error])` or return a promise. The promise's
 * settled value is passed to `submit` as its error argument.
 *
 * Every parsing failure is reported through `options.callback` as a
 * ParserError; nothing is thrown for malformed pages.
 *
 * @param {object} options - Request options; `callback` and `onCaptcha` are used.
 * @param {object} response - Captcha response; `captcha` (and `rayId` for
 *   ver2 pages) are attached to it.
 * @param {string} body - Captcha page text.
 * @returns {*} The callback's return value on a parsing failure, otherwise undefined.
 */
function onCaptcha (options, response, body) {
  const recaptchaVer = detectRecaptchaVersion(body);
  const isRecaptchaVer2 = recaptchaVer === 'ver2';
  const callback = options.callback;
  // UDF that has the responsibility of returning control back to cloudscraper
  const handler = options.onCaptcha;
  // The form data to send back to Cloudflare
  const payload = { /* r|s, g-recaptcha-response */ };

  let cause;
  let match;

  match = body.match(/<form(?: [^<>]*)? id=["']?challenge-form['"]?(?: [^<>]*)?>([\S\s]*?)<\/form>/);
  if (!match) {
    cause = 'Challenge form extraction failed';
    return callback(new ParserError(cause, options, response));
  }

  const form = match[1];

  let siteKey;
  let rayId; // only for ver 2

  if (isRecaptchaVer2) {
    match = body.match(/\sdata-ray=["']?([^\s"'<>&]+)/);
    if (!match) {
      cause = 'Unable to find cloudflare ray id';
      return callback(new ParserError(cause, options, response));
    }
    rayId = match[1];
  }

  match = body.match(/\sdata-sitekey=["']?([^\s"'<>&]+)/);
  if (match) {
    siteKey = match[1];
  } else {
    // No data-sitekey attribute: fall back to keys embedded in recaptcha URLs.
    const keys = [];
    const re = /\/recaptcha\/api2?\/(?:fallback|anchor|bframe)\?(?:[^\s<>]+&(?:amp;)?)?[Kk]=["']?([^\s"'<>&]+)/g;

    while ((match = re.exec(body)) !== null) {
      // Prioritize the explicit fallback siteKey over other matches
      if (match[0].indexOf('fallback') !== -1) {
        keys.unshift(match[1]);
        // When debugging, keep scanning so that every candidate gets logged.
        if (!debugging) break;
      } else {
        keys.push(match[1]);
      }
    }

    siteKey = keys[0];

    if (!siteKey) {
      cause = 'Unable to find the reCAPTCHA site key';
      return callback(new ParserError(cause, options, response));
    }

    if (debugging) {
      console.warn('Failed to find data-sitekey, using a fallback:', keys);
    }
  }

  // Everything that is needed to solve the reCAPTCHA
  response.captcha = {
    siteKey,
    uri: response.request.uri,
    form: payload,
    version: recaptchaVer
  };

  if (isRecaptchaVer2) {
    response.rayId = rayId;

    match = body.match(/id="challenge-form" action="(.+?)" method="(.+?)"/);
    if (!match) {
      cause = 'Challenge form action and method extraction failed';
      return callback(new ParserError(cause, options, response));
    }
    response.captcha.formMethod = match[2];

    // The action is reduced to the part starting at its first '/'. An action
    // without any '/' used to crash with a TypeError here; it is now
    // reported like every other parsing failure.
    const actionPath = match[1].match(/\/(.*)/);
    if (!actionPath) {
      cause = 'Challenge form action and method extraction failed';
      return callback(new ParserError(cause, options, response));
    }
    response.captcha.formActionUri = actionPath[0];
    payload.id = rayId;
  }

  Object.defineProperty(response.captcha, 'url', {
    configurable: true,
    enumerable: false,
    get: deprecate(function () {
      return response.request.uri.href;
    }, 'captcha.url is deprecated. Please use captcha.uri instead.')
  });

  // Adding formData
  match = form.match(/<input(?: [^<>]*)? name=[^<>]+>/g);
  if (!match) {
    cause = 'Challenge form is missing inputs';
    return callback(new ParserError(cause, options, response));
  }

  const inputs = match;
  // Only adding inputs that have both a name and value defined
  for (let name, value, i = 0; i < inputs.length; i++) {
    name = inputs[i].match(/name=["']?([^\s"'<>]*)/);
    if (name) {
      value = inputs[i].match(/value=["']?([^\s"'<>]*)/);
      if (value) {
        payload[name[1]] = value[1];
      }
    }
  }

  // Sanity check
  if (!payload.s && !payload.r) {
    cause = 'Challenge form is missing secret input';
    return callback(new ParserError(cause, options, response));
  }

  if (debugging) {
    console.warn('Captcha:', response.captcha);
  }

  // The callback used to green light form submission
  const submit = function (error) {
    if (error) {
      // Pass an user defined error back to the original request call
      return callback(new CaptchaError(error, options, response));
    }

    onSubmitCaptcha(options, response);
  };

  // This seems like an okay-ish API (fewer arguments to the handler)
  response.captcha.submit = submit;

  // We're handing control over to the user now.
  const thenable = handler(options, response, body);
  // Handle the case where the user returns a promise
  if (thenable && typeof thenable.then === 'function') {
    // eslint-disable-next-line promise/catch-or-return
    thenable.then(submit, function (error) {
      if (!error) {
        // The user broke their promise with a falsy error
        submit(new Error('Falsy error'));
      } else {
        submit(error);
      }
    });
  }
}
|
|
|
|
/**
 * Submits the solved captcha form back to Cloudflare by re-issuing the
 * request with the form data gathered in `response.captcha`.
 *
 * For ver2 pages the form is sent as the request body to the parsed form
 * action, with the `__cf_chl_captcha_tk__` token repeated in the query
 * string. For ver1 pages the form is sent as the query string to
 * `/cdn-cgi/l/chk_captcha`.
 *
 * @param {object} options - Request options; `qs`, `form`, `method`,
 *   `headers` and `uri` are overwritten for the submission.
 * @param {object} response - Captcha response carrying `captcha` and `request.uri`.
 * @returns {*} The callback's return value when the submission is rejected,
 *   otherwise undefined.
 */
function onSubmitCaptcha (options, response) {
  const callback = options.callback;
  const uri = response.request.uri;
  const captcha = response.captcha;
  const isRecaptchaVer2 = captcha.version === 'ver2';

  if (!captcha.form['g-recaptcha-response']) {
    const cause = 'Form submission without g-recaptcha-response';
    return callback(new CaptchaError(cause, options, response));
  }

  if (isRecaptchaVer2) {
    // The token lives in the form action. If it is absent, report the
    // failure through the callback rather than crashing with a TypeError.
    const token = captcha.formActionUri.match(/__cf_chl_captcha_tk__=(.*)/);
    if (!token) {
      const cause = 'Challenge form action is missing __cf_chl_captcha_tk__';
      return callback(new ParserError(cause, options, response));
    }

    options.qs = {
      __cf_chl_captcha_tk__: token[1]
    };

    options.form = captcha.form;
  } else {
    options.qs = captcha.form;
  }

  options.method = captcha.formMethod || 'GET';

  // Prevent reusing the headers object to simplify unit testing.
  options.headers = Object.assign({}, options.headers);
  // Use the original uri as the referer and to construct the form action.
  options.headers.Referer = uri.href;

  const origin = uri.protocol + '//' + uri.host;
  if (isRecaptchaVer2) {
    options.uri = origin + captcha.formActionUri;
  } else {
    options.uri = origin + '/cdn-cgi/l/chk_captcha';
  }

  performRequest(options, false);
}
|
|
|
|
/**
 * Solves a redirect ("You are being redirected" / sucuri_cloudproxy_js)
 * challenge: decodes the embedded cookie-setting script, evaluates it in the
 * sandbox, stores the resulting cookie in the jar and repeats the request.
 *
 * @param {object} options - Request options; `jar` receives the cookie and
 *   `challengesToSolve` is decremented for every solved challenge.
 * @param {object} response - Challenge response; the decoded script is
 *   stored on `response.challenge`.
 * @param {string} body - Challenge page text.
 * @returns {*} The callback's return value on failure, otherwise undefined.
 */
function onRedirectChallenge (options, response, body) {
  const callback = options.callback;
  const uri = response.request.uri;

  // Same guard as onChallenge(). This handler decrements the counter, so it
  // also has to honor it; otherwise a server that keeps answering with the
  // redirect challenge would be retried without bound.
  if (options.challengesToSolve === 0) {
    const loopError = new CloudflareError('Cloudflare challenge loop', options, response);
    loopError.errorType = 4;

    return callback(loopError);
  }

  const match = body.match(/S='([^']+)'/);
  if (!match) {
    const cause = 'Cookie code extraction failed';
    return callback(new ParserError(cause, options, response));
  }

  const base64EncodedCode = match[1];
  response.challenge = Buffer.from(base64EncodedCode, 'base64').toString('ascii');

  try {
    // Evaluate cookie setting code
    const ctx = new sandbox.Context();
    sandbox.eval(response.challenge, ctx);

    options.jar.setCookie(ctx.document.cookie, uri.href, { ignoreError: true });
  } catch (error) {
    error.message = 'Cookie code evaluation failed: ' + error.message;
    return callback(new ParserError(error, options, response));
  }

  options.challengesToSolve -= 1;

  performRequest(options, false);
}
|
|
|
|
/**
 * Final stage: converts the body to the encoding the caller asked for and
 * invokes the callback with the finished response.
 *
 * @param {object} options - Request options; `realEncoding`, `decodeEmails`
 *   and `callback` are used.
 * @param {object} response - Response object; `body` is replaced when the
 *   body is converted to a string.
 * @param {Buffer} body - Raw response body.
 */
function onRequestComplete (options, response, body) {
  const { callback, realEncoding } = options;
  let result = body;

  // A non-string encoding (for example null) means the buffer is returned untouched.
  if (typeof realEncoding === 'string') {
    const text = body.toString(realEncoding);
    const shouldDecodeEmails = response.isHTML && options.decodeEmails;

    result = shouldDecodeEmails ? decodeEmails(text) : text;

    // With resolveWithFullResponse the caller receives the response object,
    // so response.body has to reflect the converted body as well.
    response.body = result;
  }

  callback(null, response, result);
}
|