// -*- indent-tabs-mode: nil; js-indent-level: 2 -*- /* This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this file, * You can obtain one at http://mozilla.org/MPL/2.0/. */ "use strict"; this.EXPORTED_SYMBOLS = ["ReaderMode"]; const { classes: Cc, interfaces: Ci, utils: Cu } = Components; // Class names to preserve in the readerized output. We preserve these class // names so that rules in aboutReader.css can match them. const CLASSES_TO_PRESERVE = [ "caption", "emoji", "hidden", "invisible", "sr-only", "visually-hidden", "visuallyhidden", "wp-caption", "wp-caption-text", "wp-smiley", ]; Cu.import("resource://gre/modules/Services.jsm"); Cu.import("resource://gre/modules/XPCOMUtils.jsm"); Cu.importGlobalProperties(["XMLHttpRequest"]); XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js"); XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm"); XPCOMUtils.defineLazyModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm"); this.ReaderMode = { // Version of the cache schema. CACHE_VERSION: 1, DEBUG: 0, /** * Enter the reader mode by going forward one step in history if applicable, * if not, append the about:reader page in the history instead. */ enterReaderMode(docShell, win) { let url = win.document.location.href; let readerURL = "about:reader?url=" + encodeURIComponent(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); let sh = webNav.sessionHistory; if (webNav.canGoForward) { let forwardEntry = sh.getEntryAtIndex(sh.index + 1, false); let forwardURL = forwardEntry.URI.spec; if (forwardURL && (forwardURL == readerURL || !readerURL)) { webNav.goForward(); return; } } win.document.location = readerURL; }, /** * Exit the reader mode by going back one step in history if applicable, * if not, append the original page in the history instead. */ leaveReaderMode(docShell, win) { let url = win.document.location.href; let originalURL = this.getOriginalUrl(url); let webNav = docShell.QueryInterface(Ci.nsIWebNavigation); let sh = webNav.sessionHistory; if (webNav.canGoBack) { let prevEntry = sh.getEntryAtIndex(sh.index - 1, false); let prevURL = prevEntry.URI.spec; if (prevURL && (prevURL == originalURL || !originalURL)) { webNav.goBack(); return; } } win.document.location = originalURL; }, /** * Returns original URL from an about:reader URL. * * @param url An about:reader URL. * @return The original URL for the article, or null if we did not find * a properly formatted about:reader URL. */ getOriginalUrl(url) { if (!url.startsWith("about:reader?")) { return null; } let outerHash = ""; try { let uriObj = Services.io.newURI(url); url = uriObj.specIgnoringRef; outerHash = uriObj.ref; } catch (ex) { /* ignore, use the raw string */ } let searchParams = new URLSearchParams(url.substring("about:reader?".length)); if (!searchParams.has("url")) { return null; } let originalUrl = searchParams.get("url"); if (outerHash) { try { let uriObj = Services.io.newURI(originalUrl); uriObj = Services.io.newURI("#" + outerHash, null, uriObj); originalUrl = uriObj.spec; } catch (ex) {} } return originalUrl; }, getOriginalUrlObjectForDisplay(url) { let originalUrl = this.getOriginalUrl(url); if (originalUrl) { let uriObj; try { uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE); } catch (ex) { return null; } try { return Services.uriFixup.createExposableURI(uriObj); } catch (ex) { return null; } } return null; }, /** * Gets an article from a loaded browser's document. This method will not attempt * to parse certain URIs (e.g. about: URIs). * * @param doc A document to parse. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ parseDocument(doc) { if (!Readerable.shouldCheckUri(doc.documentURIObject) || !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } return this._readerParse(doc); }, /** * Downloads and parses a document from a URL. * * @param url URL to download and parse. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ async downloadAndParseDocument(url) { let doc = await this._downloadDocument(url); if (!doc) { return null; } if (!Readerable.shouldCheckUri(doc.documentURIObject) || !Readerable.shouldCheckUri(doc.baseURIObject, true)) { this.log("Reader mode disabled for URI"); return null; } return await this._readerParse(doc); }, _downloadDocument(url) { try { if (!Readerable.shouldCheckUri(Services.io.newURI(url))) { return null; } } catch (ex) { Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`)); return null; } return new Promise((resolve, reject) => { let xhr = new XMLHttpRequest(); xhr.open("GET", url, true); xhr.onerror = evt => reject(evt.error); xhr.responseType = "document"; xhr.onload = evt => { if (xhr.status !== 200) { reject("Reader mode XHR failed with status: " + xhr.status); return; } let doc = xhr.responseXML; if (!doc) { reject("Reader mode XHR didn't return a document"); return; } // Manually follow a meta refresh tag if one exists. let meta = doc.querySelector("meta[http-equiv=refresh]"); if (meta) { let content = meta.getAttribute("content"); if (content) { let urlIndex = content.toUpperCase().indexOf("URL="); if (urlIndex > -1) { let baseURI = Services.io.newURI(url); let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI); let newURL = newURI.spec; let ssm = Services.scriptSecurityManager; let flags = ssm.LOAD_IS_AUTOMATIC_DOCUMENT_REPLACEMENT | ssm.DISALLOW_INHERIT_PRINCIPAL; try { ssm.checkLoadURIStrWithPrincipal(doc.nodePrincipal, newURL, flags); } catch (ex) { let errorMsg = "Reader mode disallowed meta refresh (reason: " + ex + ")."; if (Services.prefs.getBoolPref("reader.errors.includeURLs")) errorMsg += " Refresh target URI: '" + newURL + "'."; reject(errorMsg); return; } // Otherwise, pass an object indicating our new URL: if (!baseURI.equalsExceptRef(newURI)) { reject({newURL}); return; } } } } let responseURL = xhr.responseURL; let givenURL = url; // Convert these to real URIs to make sure the escaping (or lack // thereof) is identical: try { responseURL = Services.io.newURI(responseURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } try { givenURL = Services.io.newURI(givenURL).specIgnoringRef; } catch (ex) { /* Ignore errors - we'll use what we had before */ } if (responseURL != givenURL) { // We were redirected without a meta refresh tag. // Force redirect to the correct place: reject({newURL: xhr.responseURL}); return; } resolve(doc); }; xhr.send(); }); }, /** * Retrieves an article from the cache given an article URI. * * @param url The article URL. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. * @rejects OS.File.Error */ async getArticleFromCache(url) { let path = this._toHashedPath(url); try { let array = await OS.File.read(path); return JSON.parse(new TextDecoder().decode(array)); } catch (e) { if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile) throw e; return null; } }, /** * Stores an article in the cache. * * @param article JS object representing article. * @return {Promise} * @resolves When the article is stored. * @rejects OS.File.Error */ async storeArticleInCache(article) { let array = new TextEncoder().encode(JSON.stringify(article)); let path = this._toHashedPath(article.url); await this._ensureCacheDir(); return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" }) .then(success => { OS.File.stat(path).then(info => { return EventDispatcher.instance.sendRequest({ type: "Reader:AddedToCache", url: article.url, size: info.size, path: path, }); }); }); }, /** * Removes an article from the cache given an article URI. * * @param url The article URL. * @return {Promise} * @resolves When the article is removed. * @rejects OS.File.Error */ async removeArticleFromCache(url) { let path = this._toHashedPath(url); await OS.File.remove(path); }, log(msg) { if (this.DEBUG) dump("Reader: " + msg); }, /** * Attempts to parse a document into an article. Heavy lifting happens * in readerWorker.js. * * @param doc The document to parse. * @return {Promise} * @resolves JS object representing the article, or null if no article is found. */ async _readerParse(doc) { if (this.parseNodeLimit) { let numTags = doc.getElementsByTagName("*").length; if (numTags > this.parseNodeLimit) { this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found"); return null; } } // Fetch this here before we send `doc` off to the worker thread, as later on the // document might be nuked but we will still want the URI. let {documentURI} = doc; let uriParam = { spec: doc.baseURIObject.spec, host: doc.baseURIObject.host, prePath: doc.baseURIObject.prePath, scheme: doc.baseURIObject.scheme, pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec }; // convert text/plain document, if any, to XHTML format if (this._isDocumentPlainText(doc)) { doc = this._convertPlainTextDocument(doc); } let langAttributes = { charset: doc.characterSet, lang: doc.documentElement.lang }; let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"]. createInstance(Ci.nsIDOMSerializer); let serializedDoc = serializer.serializeToString(doc); let options = { classesToPreserve: CLASSES_TO_PRESERVE, }; let article = null; try { article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]); } catch (e) { Cu.reportError("Error in ReaderWorker: " + e); } // Explicitly null out doc to make it clear it might not be available from this // point on. doc = null; if (!article) { this.log("Worker did not return an article"); return null; } // Readability returns a URI object based on the baseURI, but we only care // about the original document's URL from now on. This also avoids spoofing // attempts where the baseURI doesn't match the domain of the documentURI article.url = documentURI; delete article.uri; let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks; article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils) .convertToPlainText(article.title, flags, 0); await this._assignLanguage(article, langAttributes); this._maybeAssignTextDirection(article); this._assignReadTime(article); return article; }, get _cryptoHash() { delete this._cryptoHash; return this._cryptoHash = Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash); }, get _unicodeConverter() { delete this._unicodeConverter; this._unicodeConverter = Cc["@mozilla.org/intl/scriptableunicodeconverter"] .createInstance(Ci.nsIScriptableUnicodeConverter); this._unicodeConverter.charset = "utf8"; return this._unicodeConverter; }, /** * Calculate the hashed path for a stripped article URL. * * @param url The article URL. This should have referrers removed. * @return The file path to the cached article. */ _toHashedPath(url) { let value = this._unicodeConverter.convertToByteArray(url); this._cryptoHash.init(this._cryptoHash.MD5); this._cryptoHash.update(value, value.length); let hash = CommonUtils.encodeBase32(this._cryptoHash.finish(false)); let fileName = hash.substring(0, hash.indexOf("=")) + ".json"; return OS.Path.join(OS.Constants.Path.profileDir, "readercache", fileName); }, /** * Ensures the cache directory exists. * * @return Promise * @resolves When the cache directory exists. * @rejects OS.File.Error */ _ensureCacheDir() { let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache"); return OS.File.exists(dir).then(exists => { if (!exists) { return OS.File.makeDir(dir); } return undefined; }); }, /** * Sets a global language string value if possible. If langauge detection is * available, use that. Otherwise, revert to a simpler mechanism using the * document's lang attribute or charset. * * @return Promise * @resolves when the language is detected */ _assignLanguage(article, attributes) { try { Cu.import("resource://modules/translation/LanguageDetector.jsm"); return LanguageDetector.detectLanguage(article.textContent).then(result => { article.language = result.confident ? result.language : null; }); } catch(ex) { return new Promise((resolve) => { resolve(this._assignSimpleLanguage(attributes)); }).then(result => { article.language = result; }); } }, _assignSimpleLanguage(attributes) { var lang = attributes.lang.substring(0,2); if (lang) { return lang; } // If there is no lang attribute, try the charset. // We can only use this for charsets that are specific to one language. const charsetLang = new Map([ [ "us-ascii", "en" ], [ "iso-8859-6", "ar" ], [ "iso-8859-7", "el" ], [ "iso-8859-8", "he" ], [ "iso-8859-9", "tr" ], [ "iso-8859-11", "th" ], [ "jis_x0201", "ja" ], [ "shift_jis", "ja" ], [ "euc-jp", "ja" ] ]); return charsetLang.get(attributes.charset); }, _maybeAssignTextDirection(article) { // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved. if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) { article.dir = "rtl"; } }, /** * Assigns the estimated reading time range of the article to the article object. * * @param article the article object to assign the reading time estimate to. */ _assignReadTime(article) { let lang = article.language || "en"; const readingSpeed = this._getReadingSpeedForLanguage(lang); const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance; const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance; const length = article.length; article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow); article.readingTimeMinsFast = Math.ceil(length / charactersPerMinuteHigh); }, /** * Returns the reading speed of a selection of languages with likely variance. * * Reading speed estimated from a study done on reading speeds in various languages. * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061 * * @return object with characters per minute and variance. Defaults to English * if no suitable language is found in the collection. */ _getReadingSpeedForLanguage(lang) { const readingSpeed = new Map([ [ "en", {cpm: 987, variance: 118 } ], [ "ar", {cpm: 612, variance: 88 } ], [ "de", {cpm: 920, variance: 86 } ], [ "es", {cpm: 1025, variance: 127 } ], [ "fi", {cpm: 1078, variance: 121 } ], [ "fr", {cpm: 998, variance: 126 } ], [ "he", {cpm: 833, variance: 130 } ], [ "it", {cpm: 950, variance: 140 } ], [ "jw", {cpm: 357, variance: 56 } ], [ "nl", {cpm: 978, variance: 143 } ], [ "pl", {cpm: 916, variance: 126 } ], [ "pt", {cpm: 913, variance: 145 } ], [ "ru", {cpm: 986, variance: 175 } ], [ "sk", {cpm: 885, variance: 145 } ], [ "sv", {cpm: 917, variance: 156 } ], [ "tr", {cpm: 1054, variance: 156 } ], [ "zh", {cpm: 255, variance: 29 } ], ]); return readingSpeed.get(lang) || readingSpeed.get("en"); }, /** * * Check if the document to be parsed is text document. * @param doc the doc object to be parsed. * @return boolean * */ _isDocumentPlainText(doc) { return doc.contentType == "text/plain"; }, /** * * The document to be parsed is text document and is converted to HTML format. * @param doc the doc object to be parsed. * @return doc * */ _convertPlainTextDocument(doc) { let preTag = doc.querySelector("pre"); let docFrag = doc.createDocumentFragment(); let content = preTag.textContent; let paragraphs = content.split(/\r?\n\r?\n/); for (let para of paragraphs) { let pElem = doc.createElement("p"); let lines = para.split(/\n/); for (let line of lines) { pElem.append(line); let brElem = doc.createElement("br"); pElem.append(brElem); } docFrag.append(pElem); } preTag.parentNode.replaceChild(docFrag, preTag); return doc; }, }; // Don't try to parse the page if it has too many elements (for memory and // performance reasons) XPCOMUtils.defineLazyPreferenceGetter( ReaderMode, "parseNodeLimit", "reader.parse-node-limit", 0);