'use strict';

const requestModule = require('request-promise');
const sandbox = require('./lib/sandbox');
const decodeEmails = require('./lib/email-decode.js');
const { getDefaultHeaders, caseless } = require('./lib/headers');
const brotli = require('./lib/brotli');
const { getChromeRuntimeMock } = require('./lib/chromeRuntime');
const { deprecate } = require('util');
const puppeteer = require('puppeteer');

const {
  RequestError,
  CaptchaError,
  CloudflareError,
  ParserError
} = require('./errors');

// Module-wide debug flag, toggled through the `debug` property of any instance.
let debugging = false;

// Placeholder for the Host header; performRequest swaps it for the real host.
const HOST = Symbol('host');

module.exports = defaults.call(requestModule);

/**
 * Creates a cloudscraper instance with the given default options.
 *
 * Must be called with `this` bound to either the request module (first call)
 * or an existing cloudscraper instance (derived instances).
 *
 * @param {object} [params] - Option overrides merged over the defaults.
 * @returns {Function} A request-like function with cloudscraper behaviour.
 */
function defaults (params) {
  // isCloudScraper === !isRequestModule
  const isRequestModule = this === requestModule;

  let defaultParams = (!isRequestModule && this.defaultParams) || {
    requester: requestModule,
    // Cookies should be enabled
    jar: requestModule.jar(),
    headers: getDefaultHeaders({ Host: HOST }),
    // Reduce Cloudflare's timeout to cloudflareMaxTimeout if it is excessive
    cloudflareMaxTimeout: 30000,
    // followAllRedirects - follow non-GET HTTP 3xx responses as redirects
    followAllRedirects: true,
    // Support only this max challenges in row. If CF returns more, throw an error
    challengesToSolve: 3,
    // Remove Cloudflare's email protection
    decodeEmails: false,
    // Support gzip encoded responses
    gzip: true,
    agentOptions: {
      // Removes a few problematic TLSv1.0 ciphers to avoid CAPTCHA
      sigalgs: 'ECDSA+SHA256'
      // ciphers: crypto.constants.defaultCipherList + ':!ECDHE+SHA:!AES128-SHA'
    }
  };

  // Object.assign requires at least nodejs v4, request only test/supports v6+
  defaultParams = Object.assign({}, defaultParams, params);

  const cloudscraper = requestModule.defaults
    .call(this, defaultParams, function (options) {
      validateRequest(options);
      return performRequest(options, true);
    });

  // There's no safety net here, any changes apply to all future requests
  // that are made with this instance and derived instances.
  cloudscraper.defaultParams = defaultParams;

  // Ensure this instance gets a copy of our custom defaults function
  // and afterwards, it will be copied over automatically.
  if (isRequestModule) {
    cloudscraper.defaults = defaults;
  }

  // Expose the debug option
  Object.defineProperty(cloudscraper, 'debug', {
    configurable: true,
    enumerable: true,
    set (value) {
      // Honour the assigned value so that `debug = false` switches it off.
      requestModule.debug = debugging = Boolean(value);
    },
    get () {
      return debugging;
    }
  });

  return cloudscraper;
}

/**
 * Validates and normalizes the options of a new request (mutates `options`).
 *
 * The caller's encoding is stashed in `options.realEncoding` and
 * `options.encoding` is forced to null so that bodies arrive as buffers.
 *
 * @param {object} options - The request options.
 * @throws {TypeError} If `challengesToSolve` or `cloudflareMaxTimeout` is not
 *   numeric, or `requester` is not a function.
 */
function validateRequest (options) {
  // Prevent overwriting realEncoding in subsequent calls
  if (!('realEncoding' in options)) {
    // Can't just do the normal options.encoding || 'utf8'
    // because null is a valid encoding.
    if ('encoding' in options) {
      options.realEncoding = options.encoding;
    } else {
      options.realEncoding = 'utf8';
    }
  }

  options.encoding = null;

  // The coercing global isNaN is kept on purpose: numeric strings are accepted.
  if (isNaN(options.challengesToSolve)) {
    throw new TypeError('Expected `challengesToSolve` option to be a number, ' +
      'got ' + typeof (options.challengesToSolve) + ' instead.');
  }

  if (isNaN(options.cloudflareMaxTimeout)) {
    throw new TypeError('Expected `cloudflareMaxTimeout` option to be a number, ' +
      'got ' + typeof (options.cloudflareMaxTimeout) + ' instead.');
  }

  if (typeof options.requester !== 'function') {
    throw new TypeError('Expected `requester` option to be a function, got ' +
      typeof (options.requester) + ' instead.');
  }
}

/**
 * Issues a request through `options.requester` and routes its outcome to
 * onRequestResponse.
 *
 * This function is wrapped to ensure that we get new options on first call.
 * The options object is reused in subsequent calls when calling it directly.
 *
 * @param {object} options - The request options (mutated on first request).
 * @param {boolean} isFirstRequest - True for the user initiated request.
 * @returns {object} The request instance created by the requester.
 * @throws {TypeError} If the request instance has no callback function.
 */
function performRequest (options, isFirstRequest) {
  // This should be the default export of either request or request-promise.
  const requester = options.requester;

  // Note that request is always an instanceof ReadableStream, EventEmitter
  // If the requester is request-promise, it is also thenable.
  const request = requester(options);

  // We must define the host header ourselves to preserve case and order.
  if (request.getHeader('host') === HOST) {
    request.setHeader('host', request.uri.host);
  }

  // If the requester is not request-promise, ensure we get a callback.
  if (typeof request.callback !== 'function') {
    throw new TypeError('Expected a callback function, got ' +
      typeof (request.callback) + ' instead.');
  }

  // We only need the callback from the first request.
  // The other callbacks can be safely ignored.
  if (isFirstRequest) {
    // This should be a user supplied callback or request-promise's callback.
    // The callback is always wrapped/bound to the request instance.
    options.callback = request.callback;
  }

  request.removeAllListeners('error')
    .once('error', function (error) {
      onRequestResponse(options, error);
    });

  request.removeAllListeners('complete')
    .once('complete', function (response, body) {
      onRequestResponse(options, null, response, body);
    });

  // Indicate that this is a cloudscraper request
  request.cloudscraper = true;
  return request;
}

/**
 * Inspects a finished request and dispatches it either to the Cloudflare
 * handling path or straight to completion.
 *
 * The argument convention is options first where possible, options
 * always before response, and body always after response.
 *
 * @param {object} options - The request options.
 * @param {?Error} error - The transport error, if any.
 * @param {object} [response] - The response object.
 * @param {Buffer|*} [body] - The response body.
 */
function onRequestResponse (options, error, response, body) {
  const callback = options.callback;

  // Encoding is null so body should be a buffer object
  if (error || !body || !body.toString) {
    // Pure request error (bad connection, wrong url, etc)
    return callback(new RequestError(error, options, response));
  }

  const headers = caseless(response.headers);

  response.responseStartTime = Date.now();
  response.isCloudflare = /^(cloudflare|sucuri)/i.test('' + headers.server);
  response.isHTML = /text\/html/i.test('' + headers['content-type']);

  // If body isn't a buffer, this is a custom response body.
  if (!Buffer.isBuffer(body)) {
    return callback(null, response, body);
  }

  // Decompress brotli compressed responses
  if (/\bbr\b/i.test('' + headers['content-encoding'])) {
    if (!brotli.isAvailable) {
      const cause = 'Received a Brotli compressed response. Please install brotli';
      return callback(new RequestError(cause, options, response));
    }

    try {
      response.body = body = brotli.decompress(body);
    } catch (error) {
      return callback(new RequestError(error, options, response));
    }

    // Request doesn't handle brotli and would've failed to parse JSON.
    if (options.json) {
      try {
        response.body = body = JSON.parse(body, response.request._jsonReviver);
        // If successful, this isn't a challenge.
        return callback(null, response, body);
      } catch (error) {
        // Request's debug will log the failure, no need to duplicate.
      }
    }
  }

  if (response.isCloudflare && response.isHTML) {
    onCloudflareResponse(options, response, body);
  } else {
    onRequestComplete(options, response, body);
  }
}

/**
 * Classifies an HTML response served by Cloudflare/Sucuri and hands it to the
 * matching handler (captcha, JS challenge, redirect challenge or completion).
 *
 * @param {object} options - The request options.
 * @param {object} response - The response object.
 * @param {Buffer} body - The raw response body.
 */
function onCloudflareResponse (options, response, body) {
  const callback = options.callback;

  if (response.statusCode !== 200 && body.length < 1) {
    // This is a 4xx-5xx Cloudflare response with an empty body.
    return callback(new CloudflareError(response.statusCode, options, response));
  }

  const stringBody = body.toString('utf8');

  try {
    validateResponse(options, response, stringBody);
  } catch (error) {
    if (error instanceof CaptchaError && typeof options.onCaptcha === 'function') {
      // Give users a chance to solve the reCAPTCHA via services such as anti-captcha.com
      return onCaptcha(options, response, stringBody);
    }

    return callback(error);
  }

  const isChallenge = stringBody.includes('a = document.getElementById(\'jschl-answer\');');

  if (isChallenge) {
    return onChallenge(options, response, stringBody);
  }

  const isRedirectChallenge = stringBody.includes('You are being redirected') ||
    stringBody.includes('sucuri_cloudproxy_js');

  if (isRedirectChallenge) {
    return onRedirectChallenge(options, response, stringBody);
  }

  // 503 status is always a challenge
  if (response.statusCode === 503) {
    return onChallenge(options, response, stringBody);
  }

  // All is good
  onRequestComplete(options, response, body);
}

/**
 * Detects which reCAPTCHA page flavour, if any, the body contains.
 *
 * @param {string} body - The response body.
 * @returns {string|boolean} 'ver2', 'ver1', or false when no captcha is found.
 */
function detectRecaptchaVersion (body) {
  // New version > Dec 2019
  // ver2 is tested first; presumably ver2 pages also carry the ver1 markers.
  if (/__cf_chl_captcha_tk__=(.*)/i.test(body)) {
    return 'ver2';
  }

  // Old version < Dec 2019
  if (body.includes('why_captcha') || /cdn-cgi\/l\/chk_captcha/i.test(body)) {
    return 'ver1';
  }

  return false;
}

/**
 * Throws when the body is a captcha page or a Cloudflare error page.
 *
 * @param {object} options - The request options.
 * @param {object} response - The response (gets `isCaptcha` set on captcha).
 * @param {string} body - The response body.
 * @returns {boolean} Always false when nothing was detected.
 * @throws {CaptchaError} If a reCAPTCHA page is detected.
 * @throws {CloudflareError} If a `cf-error-code` element is present.
 */
function validateResponse (options, response, body) {
  // Finding captcha (either version)
  const recaptchaVer = detectRecaptchaVersion(body);
  if (recaptchaVer) {
    // Convenience boolean
    response.isCaptcha = true;
    throw new CaptchaError('captcha', options, response);
  }

  // Trying to find '1006'
  const match = body.match(/<\w+\s+class="cf-error-code">(.*)<\/\w+>/i);
  if (match) {
    const code = parseInt(match[1], 10);
    throw new CloudflareError(code, options, response);
  }

  return false;
}

/**
 * Solves a JavaScript challenge by loading the page in a headless browser and
 * feeding the resulting response back into onRequestResponse.
 *
 * The outcome is reported at most once, the browser is closed before it is
 * reported, and every attempt is counted against `options.challengesToSolve`.
 *
 * @param {object} options - The request options.
 * @param {object} response - The challenge response.
 * @param {string} body - The challenge body (unused, kept for the convention).
 * @returns {Promise<void>}
 */
async function onChallenge (options, response, body) {
  const callback = options.callback;
  const uri = response.request.uri;

  if (options.challengesToSolve === 0) {
    const error = new CloudflareError('Cloudflare challenge loop', options, response);
    error.errorType = 4;
    return callback(error);
  }

  // Count this attempt, otherwise the loop guard above can never trigger.
  options.challengesToSolve -= 1;

  let browser;
  let settled = false;

  // Reports the outcome exactly once, after releasing the browser.
  const finish = async function (deliver) {
    if (settled) {
      return;
    }

    settled = true;

    if (browser) {
      try {
        await browser.close();
      } catch (closeError) {
        // Best effort: the outcome below matters more than a failed shutdown.
        if (debugging) {
          console.warn('Failed to close the browser:', closeError.message);
        }
      }
    }

    deliver();
  };

  const fail = function (error) {
    return finish(function () {
      callback(new RequestError(error, options, response));
    });
  };

  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();

    // The page function runs inside the browser and cannot close over values
    // from this process, so the mock is shipped as source text and rebuilt
    // there. The evaluated text is our own function, not external input.
    await page.evaluateOnNewDocument(
      args => {
        if (args && args.fns) {
          for (const fn of Object.keys(args.fns)) {
            eval(`var ${fn} = ${args.fns[fn]}`) // eslint-disable-line
          }
        }

        window.chrome = getChromeRuntimeMock(window);
      },
      {
        fns: {
          getChromeRuntimeMock: `${getChromeRuntimeMock.toString()}`
        }
      }
    );

    const requestHeaders = response.request.headers;
    const uaKey = Object.keys(requestHeaders)
      .find(key => key.toLowerCase() === 'user-agent');

    await page.setUserAgent(requestHeaders[uaKey] || 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.0 Safari/537.36');

    let isFirstRequest = true;
    await page.setRequestInterception(true);

    // NOTE(review): if the challenge never resolves, nothing settles this
    // attempt and the browser stays open; consider adding a deadline.
    page.on('requestfinished', async function (request) {
      if (settled || request.url() !== uri.href) {
        return;
      }

      let res;
      let pageBody;

      try {
        res = request.response();
        pageBody = await res.buffer();
      } catch (error) {
        return fail(error);
      }

      if (!pageBody.includes('Just a moment...') || !isFirstRequest) {
        // The browser hands back a decoded body, so the content-encoding
        // header must not trigger a second decompression downstream.
        const headers = Object.assign({}, res.headers());
        delete headers['content-encoding'];

        const pageResponse = {
          request: response.request,
          statusCode: res.status(),
          headers,
          body: pageBody
        };

        return finish(function () {
          onRequestResponse(options, null, pageResponse, pageBody);
        });
      }

      if (request.isNavigationRequest() && !request.url().includes('youtube')) {
        isFirstRequest = false;
      }
    });

    page.on('request', function (request) {
      request.continue().catch(function (error) {
        // Requests still in flight while the browser closes reject here.
        if (debugging) {
          console.warn('Failed to continue request:', error.message);
        }
      });
    });

    await page.goto(uri.href);
  } catch (error) {
    // No-op when the response was already delivered (closing the browser
    // aborts a pending navigation).
    await fail(error);
  }
}

/**
 * Parses the reCAPTCHA form and hands control over to the user.
 *
 * Populates `response.captcha` (siteKey, uri, form, version, submit, ...) and
 * invokes `options.onCaptcha(options, response, body)`. The handler resumes
 * the flow by calling `response.captcha.submit([error])` or by returning a
 * thenable.
 *
 * @param {object} options - The request options.
 * @param {object} response - The captcha response.
 * @param {string} body - The captcha page body.
 */
function onCaptcha (options, response, body) {
  const recaptchaVer = detectRecaptchaVersion(body);
  const isRecaptchaVer2 = recaptchaVer === 'ver2';
  const callback = options.callback;
  // UDF that has the responsibility of returning control back to cloudscraper
  const handler = options.onCaptcha;
  // The form data to send back to Cloudflare
  const payload = { /* r|s, g-recaptcha-response */ };

  let cause;
  let match;

  match = body.match(/<form(?: [^<>]*)? id=["']?challenge-form['"]?(?: [^<>]*)?>([\S\s]*?)<\/form>/);
  if (!match) {
    cause = 'Challenge form extraction failed';
    return callback(new ParserError(cause, options, response));
  }

  const form = match[1];

  let siteKey;
  let rayId; // only for ver 2

  if (isRecaptchaVer2) {
    match = body.match(/\sdata-ray=["']?([^\s"'<>&]+)/);
    if (!match) {
      cause = 'Unable to find cloudflare ray id';
      return callback(new ParserError(cause, options, response));
    }
    rayId = match[1];
  }

  match = body.match(/\sdata-sitekey=["']?([^\s"'<>&]+)/);
  if (match) {
    siteKey = match[1];
  } else {
    const keys = [];
    const re = /\/recaptcha\/api2?\/(?:fallback|anchor|bframe)\?(?:[^\s<>]+&(?:amp;)?)?[Kk]=["']?([^\s"'<>&]+)/g;

    while ((match = re.exec(body)) !== null) {
      // Prioritize the explicit fallback siteKey over other matches
      if (match[0].includes('fallback')) {
        keys.unshift(match[1]);
        if (!debugging) break;
      } else {
        keys.push(match[1]);
      }
    }

    siteKey = keys[0];

    if (!siteKey) {
      cause = 'Unable to find the reCAPTCHA site key';
      return callback(new ParserError(cause, options, response));
    }

    if (debugging) {
      console.warn('Failed to find data-sitekey, using a fallback:', keys);
    }
  }

  // Everything that is needed to solve the reCAPTCHA
  response.captcha = {
    siteKey,
    uri: response.request.uri,
    form: payload,
    version: recaptchaVer
  };

  if (isRecaptchaVer2) {
    response.rayId = rayId;

    match = body.match(/id="challenge-form" action="(.+?)" method="(.+?)"/);
    if (!match) {
      cause = 'Challenge form action and method extraction failed';
      return callback(new ParserError(cause, options, response));
    }

    response.captcha.formMethod = match[2];

    // The action must contain a path; everything from the first slash is kept.
    const actionPath = match[1].match(/\/(.*)/);
    if (!actionPath) {
      cause = 'Challenge form action and method extraction failed';
      return callback(new ParserError(cause, options, response));
    }

    response.captcha.formActionUri = actionPath[0];
    payload.id = rayId;
  }

  Object.defineProperty(response.captcha, 'url', {
    configurable: true,
    enumerable: false,
    get: deprecate(function () {
      return response.request.uri.href;
    }, 'captcha.url is deprecated. Please use captcha.uri instead.')
  });

  // Adding formData
  const inputs = form.match(/<input(?: [^<>]*)? name=[^<>]+>/g);
  if (!inputs) {
    cause = 'Challenge form is missing inputs';
    return callback(new ParserError(cause, options, response));
  }

  // Only adding inputs that have both a name and value defined
  for (const input of inputs) {
    const name = input.match(/name=["']?([^\s"'<>]*)/);
    if (name) {
      const value = input.match(/value=["']?([^\s"'<>]*)/);
      if (value) {
        payload[name[1]] = value[1];
      }
    }
  }

  // Sanity check
  if (!payload.s && !payload.r) {
    cause = 'Challenge form is missing secret input';
    return callback(new ParserError(cause, options, response));
  }

  if (debugging) {
    console.warn('Captcha:', response.captcha);
  }

  // The callback used to green light form submission
  const submit = function (error) {
    if (error) {
      // Pass an user defined error back to the original request call
      return callback(new CaptchaError(error, options, response));
    }

    onSubmitCaptcha(options, response);
  };

  // This seems like an okay-ish API (fewer arguments to the handler)
  response.captcha.submit = submit;

  // We're handing control over to the user now.
  const thenable = handler(options, response, body);
  // Handle the case where the user returns a promise
  if (thenable && typeof thenable.then === 'function') {
    // eslint-disable-next-line promise/catch-or-return
    thenable.then(submit, function (error) {
      if (!error) {
        // The user broke their promise with a falsy error
        submit(new Error('Falsy error'));
      } else {
        submit(error);
      }
    });
  }
}

/**
 * Submits the solved captcha form back to Cloudflare by re-issuing the request
 * with the form data (mutates `options`).
 *
 * @param {object} options - The request options.
 * @param {object} response - The captcha response carrying `response.captcha`.
 */
function onSubmitCaptcha (options, response) {
  const callback = options.callback;
  const uri = response.request.uri;
  const isRecaptchaVer2 = response.captcha.version === 'ver2';

  if (!response.captcha.form['g-recaptcha-response']) {
    const cause = 'Form submission without g-recaptcha-response';
    return callback(new CaptchaError(cause, options, response));
  }

  if (isRecaptchaVer2) {
    const token = response.captcha.formActionUri.match(/__cf_chl_captcha_tk__=(.*)/);
    if (!token) {
      // The token can appear elsewhere in the page but not in the action.
      const cause = 'Unable to find the captcha token in the form action';
      return callback(new ParserError(cause, options, response));
    }

    options.qs = {
      __cf_chl_captcha_tk__: token[1]
    };

    options.form = response.captcha.form;
  } else {
    options.qs = response.captcha.form;
  }

  options.method = response.captcha.formMethod || 'GET';

  // Prevent reusing the headers object to simplify unit testing.
  options.headers = Object.assign({}, options.headers);
  // Use the original uri as the referer and to construct the form action.
  options.headers.Referer = uri.href;

  if (isRecaptchaVer2) {
    options.uri = uri.protocol + '//' + uri.host + response.captcha.formActionUri;
  } else {
    options.uri = uri.protocol + '//' + uri.host + '/cdn-cgi/l/chk_captcha';
  }

  performRequest(options, false);
}

/**
 * Handles a redirect challenge: evaluates the embedded cookie-setting script
 * in the sandbox, stores the cookie in the jar and retries the request.
 *
 * @param {object} options - The request options.
 * @param {object} response - The challenge response (gets `challenge` set).
 * @param {string} body - The challenge page body.
 */
function onRedirectChallenge (options, response, body) {
  const callback = options.callback;
  const uri = response.request.uri;

  const match = body.match(/S='([^']+)'/);
  if (!match) {
    const cause = 'Cookie code extraction failed';
    return callback(new ParserError(cause, options, response));
  }

  const base64EncodedCode = match[1];
  response.challenge = Buffer.from(base64EncodedCode, 'base64').toString('ascii');

  try {
    // Evaluate cookie setting code
    const ctx = new sandbox.Context();
    sandbox.eval(response.challenge, ctx);

    options.jar.setCookie(ctx.document.cookie, uri.href, { ignoreError: true });
  } catch (error) {
    error.message = 'Cookie code evaluation failed: ' + error.message;
    return callback(new ParserError(error, options, response));
  }

  options.challengesToSolve -= 1;

  performRequest(options, false);
}

/**
 * Finalizes a request: decodes the body with the caller's original encoding,
 * optionally decodes protected e-mail addresses, and invokes the callback.
 *
 * @param {object} options - The request options.
 * @param {object} response - The response object.
 * @param {Buffer} body - The raw response body.
 */
function onRequestComplete (options, response, body) {
  const callback = options.callback;

  if (typeof options.realEncoding === 'string') {
    body = body.toString(options.realEncoding);
    // The resolveWithFullResponse option will resolve with the response
    // object. This changes the response.body so it is as expected.

    if (response.isHTML && options.decodeEmails) {
      body = decodeEmails(body);
    }

    response.body = body;
  }

  callback(null, response, body);
}