Mypal/toolkit/components/reader/ReaderMode.jsm

// -*- indent-tabs-mode: nil; js-indent-level: 2 -*-
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 * You can obtain one at http://mozilla.org/MPL/2.0/. */
"use strict";

this.EXPORTED_SYMBOLS = ["ReaderMode"];

const { classes: Cc, interfaces: Ci, utils: Cu } = Components;

// Class names to preserve in the readerized output. We preserve these class
// names so that rules in aboutReader.css can match them.
const CLASSES_TO_PRESERVE = [
  "caption",
  "emoji",
  "hidden",
  "invisible",
  "sr-only",
  "visually-hidden",
  "visuallyhidden",
  "wp-caption",
  "wp-caption-text",
  "wp-smiley",
];

Cu.import("resource://gre/modules/Services.jsm");
Cu.import("resource://gre/modules/XPCOMUtils.jsm");

Cu.importGlobalProperties(["XMLHttpRequest"]);

XPCOMUtils.defineLazyModuleGetter(this, "CommonUtils", "resource://services-common/utils.js");
XPCOMUtils.defineLazyModuleGetter(this, "EventDispatcher", "resource://gre/modules/Messaging.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "OS", "resource://gre/modules/osfile.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "ReaderWorker", "resource://gre/modules/reader/ReaderWorker.jsm");
XPCOMUtils.defineLazyModuleGetter(this, "Readerable", "resource://gre/modules/Readerable.jsm");

this.ReaderMode = {
  // Version of the cache schema.
  CACHE_VERSION: 1,

  DEBUG: 0,

  /**
   * Enter the reader mode by going forward one step in history if applicable,
   * if not, append the about:reader page in the history instead.
   */
  enterReaderMode(docShell, win) {
    let url = win.document.location.href;
    let readerURL = "about:reader?url=" + encodeURIComponent(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
    let sh = webNav.sessionHistory;
    if (webNav.canGoForward) {
      let forwardEntry = sh.getEntryAtIndex(sh.index + 1, false);
      let forwardURL = forwardEntry.URI.spec;
      if (forwardURL && (forwardURL == readerURL || !readerURL)) {
        webNav.goForward();
        return;
      }
    }

    win.document.location = readerURL;
  },

  /**
   * Exit the reader mode by going back one step in history if applicable,
   * if not, append the original page in the history instead.
   */
  leaveReaderMode(docShell, win) {
    let url = win.document.location.href;
    let originalURL = this.getOriginalUrl(url);
    let webNav = docShell.QueryInterface(Ci.nsIWebNavigation);
    let sh = webNav.sessionHistory;
    if (webNav.canGoBack) {
      let prevEntry = sh.getEntryAtIndex(sh.index - 1, false);
      let prevURL = prevEntry.URI.spec;
      if (prevURL && (prevURL == originalURL || !originalURL)) {
        webNav.goBack();
        return;
      }
    }

    win.document.location = originalURL;
  },

  /**
   * Returns original URL from an about:reader URL.
   *
   * @param url An about:reader URL.
   * @return The original URL for the article, or null if we did not find
   *         a properly formatted about:reader URL.
   */
  getOriginalUrl(url) {
    if (!url.startsWith("about:reader?")) {
      return null;
    }

    let outerHash = "";
    try {
      let uriObj = Services.io.newURI(url);
      url = uriObj.specIgnoringRef;
      outerHash = uriObj.ref;
    } catch (ex) { /* ignore, use the raw string */ }

    let searchParams = new URLSearchParams(url.substring("about:reader?".length));
    if (!searchParams.has("url")) {
      return null;
    }
    let originalUrl = searchParams.get("url");
    if (outerHash) {
      try {
        let uriObj = Services.io.newURI(originalUrl);
        uriObj = Services.io.newURI("#" + outerHash, null, uriObj);
        originalUrl = uriObj.spec;
      } catch (ex) {}
    }
    return originalUrl;
  },

  getOriginalUrlObjectForDisplay(url) {
    let originalUrl = this.getOriginalUrl(url);
    if (originalUrl) {
      let uriObj;
      try {
        uriObj = Services.uriFixup.createFixupURI(originalUrl, Services.uriFixup.FIXUP_FLAG_NONE);
      } catch (ex) {
        return null;
      }
      try {
        return Services.uriFixup.createExposableURI(uriObj);
      } catch (ex) {
        return null;
      }
    }
    return null;
  },

  /**
   * Gets an article from a loaded browser's document. This method will not attempt
   * to parse certain URIs (e.g. about: URIs).
   *
   * @param doc A document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  parseDocument(doc) {
    if (!Readerable.shouldCheckUri(doc.documentURIObject) ||
        !Readerable.shouldCheckUri(doc.baseURIObject, true)) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return this._readerParse(doc);
  },

  /**
   * Downloads and parses a document from a URL.
   *
   * @param url URL to download and parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  async downloadAndParseDocument(url) {
    let doc = await this._downloadDocument(url);
    if (!doc) {
      return null;
    }
    if (!Readerable.shouldCheckUri(doc.documentURIObject) ||
        !Readerable.shouldCheckUri(doc.baseURIObject, true)) {
      this.log("Reader mode disabled for URI");
      return null;
    }

    return await this._readerParse(doc);
  },

  _downloadDocument(url) {
    try {
      if (!Readerable.shouldCheckUri(Services.io.newURI(url))) {
        return null;
      }
    } catch (ex) {
      Cu.reportError(new Error(`Couldn't create URI from ${url} to download: ${ex}`));
      return null;
    }
    return new Promise((resolve, reject) => {
      let xhr = new XMLHttpRequest();
      xhr.open("GET", url, true);
      xhr.onerror = evt => reject(evt.error);
      xhr.responseType = "document";
      xhr.onload = evt => {
        if (xhr.status !== 200) {
          reject("Reader mode XHR failed with status: " + xhr.status);
          return;
        }

        let doc = xhr.responseXML;
        if (!doc) {
          reject("Reader mode XHR didn't return a document");
          return;
        }

        // Manually follow a meta refresh tag if one exists.
        let meta = doc.querySelector("meta[http-equiv=refresh]");
        if (meta) {
          let content = meta.getAttribute("content");
          if (content) {
            let urlIndex = content.toUpperCase().indexOf("URL=");
            if (urlIndex > -1) {
              let baseURI = Services.io.newURI(url);
              let newURI = Services.io.newURI(content.substring(urlIndex + 4), null, baseURI);
              let newURL = newURI.spec;
              let ssm = Services.scriptSecurityManager;
              let flags = ssm.LOAD_IS_AUTOMATIC_DOCUMENT_REPLACEMENT |
                          ssm.DISALLOW_INHERIT_PRINCIPAL;
              try {
                ssm.checkLoadURIStrWithPrincipal(doc.nodePrincipal, newURL, flags);
              } catch (ex) {
                let errorMsg = "Reader mode disallowed meta refresh (reason: " + ex + ").";

                if (Services.prefs.getBoolPref("reader.errors.includeURLs"))
                  errorMsg += " Refresh target URI: '" + newURL + "'.";
                reject(errorMsg);
                return;
              }
              // Otherwise, pass an object indicating our new URL:
              if (!baseURI.equalsExceptRef(newURI)) {
                reject({newURL});
                return;
              }
            }
          }
        }
        let responseURL = xhr.responseURL;
        let givenURL = url;
        // Convert these to real URIs to make sure the escaping (or lack
        // thereof) is identical:
        try {
          responseURL = Services.io.newURI(responseURL).specIgnoringRef;
        } catch (ex) { /* Ignore errors - we'll use what we had before */ }
        try {
          givenURL = Services.io.newURI(givenURL).specIgnoringRef;
        } catch (ex) { /* Ignore errors - we'll use what we had before */ }

        if (responseURL != givenURL) {
          // We were redirected without a meta refresh tag.
          // Force redirect to the correct place:
          reject({newURL: xhr.responseURL});
          return;
        }
        resolve(doc);
      };
      xhr.send();
    });
  },


  /**
   * Retrieves an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   * @rejects OS.File.Error
   */
  async getArticleFromCache(url) {
    let path = this._toHashedPath(url);
    try {
      let array = await OS.File.read(path);
      return JSON.parse(new TextDecoder().decode(array));
    } catch (e) {
      if (!(e instanceof OS.File.Error) || !e.becauseNoSuchFile)
        throw e;
      return null;
    }
  },

  /**
   * Stores an article in the cache.
   *
   * @param article JS object representing article.
   * @return {Promise}
   * @resolves When the article is stored.
   * @rejects OS.File.Error
   */
  async storeArticleInCache(article) {
    let array = new TextEncoder().encode(JSON.stringify(article));
    let path = this._toHashedPath(article.url);
    await this._ensureCacheDir();
    return OS.File.writeAtomic(path, array, { tmpPath: path + ".tmp" })
      .then(success => {
        OS.File.stat(path).then(info => {
          return EventDispatcher.instance.sendRequest({
            type: "Reader:AddedToCache",
            url: article.url,
            size: info.size,
            path: path,
          });
        });
      });
  },

  /**
   * Removes an article from the cache given an article URI.
   *
   * @param url The article URL.
   * @return {Promise}
   * @resolves When the article is removed.
   * @rejects OS.File.Error
   */
  async removeArticleFromCache(url) {
    let path = this._toHashedPath(url);
    await OS.File.remove(path);
  },

  log(msg) {
    if (this.DEBUG)
      dump("Reader: " + msg);
  },

  /**
   * Attempts to parse a document into an article. Heavy lifting happens
   * in readerWorker.js.
   *
   * @param doc The document to parse.
   * @return {Promise}
   * @resolves JS object representing the article, or null if no article is found.
   */
  async _readerParse(doc) {
    if (this.parseNodeLimit) {
      let numTags = doc.getElementsByTagName("*").length;
      if (numTags > this.parseNodeLimit) {
        this.log("Aborting parse for " + doc.baseURIObject.spec + "; " + numTags + " elements found");
        return null;
      }
    }

    // Fetch this here before we send `doc` off to the worker thread, as later on the
    // document might be nuked but we will still want the URI.
    let {documentURI} = doc;

    let uriParam = {
      spec: doc.baseURIObject.spec,
      host: doc.baseURIObject.host,
      prePath: doc.baseURIObject.prePath,
      scheme: doc.baseURIObject.scheme,
      pathBase: Services.io.newURI(".", null, doc.baseURIObject).spec
    };

    // convert text/plain document, if any, to XHTML format
    if (this._isDocumentPlainText(doc)) {
      doc = this._convertPlainTextDocument(doc);
    }

    let langAttributes = {
      charset: doc.characterSet,
      lang: doc.documentElement.lang
    };

    let serializer = Cc["@mozilla.org/xmlextras/xmlserializer;1"].
                     createInstance(Ci.nsIDOMSerializer);
    let serializedDoc = serializer.serializeToString(doc);

    let options = {
      classesToPreserve: CLASSES_TO_PRESERVE,
    };

    let article = null;
    try {
      article = await ReaderWorker.post("parseDocument", [uriParam, serializedDoc, options]);
    } catch (e) {
      Cu.reportError("Error in ReaderWorker: " + e);
    }

    // Explicitly null out doc to make it clear it might not be available from this
    // point on.
    doc = null;

    if (!article) {
      this.log("Worker did not return an article");
      return null;
    }

    // Readability returns a URI object based on the baseURI, but we only care
    // about the original document's URL from now on. This also avoids spoofing
    // attempts where the baseURI doesn't match the domain of the documentURI
    article.url = documentURI;
    delete article.uri;

    let flags = Ci.nsIDocumentEncoder.OutputSelectionOnly | Ci.nsIDocumentEncoder.OutputAbsoluteLinks;
    article.title = Cc["@mozilla.org/parserutils;1"].getService(Ci.nsIParserUtils)
                                                    .convertToPlainText(article.title, flags, 0);

    await this._assignLanguage(article, langAttributes);
    this._maybeAssignTextDirection(article);

    this._assignReadTime(article);

    return article;
  },

  get _cryptoHash() {
    delete this._cryptoHash;
    return this._cryptoHash = Cc["@mozilla.org/security/hash;1"].createInstance(Ci.nsICryptoHash);
  },

  get _unicodeConverter() {
    delete this._unicodeConverter;
    this._unicodeConverter = Cc["@mozilla.org/intl/scriptableunicodeconverter"]
                              .createInstance(Ci.nsIScriptableUnicodeConverter);
    this._unicodeConverter.charset = "utf8";
    return this._unicodeConverter;
  },

  /**
   * Calculate the hashed path for a stripped article URL.
   *
   * @param url The article URL. This should have referrers removed.
   * @return The file path to the cached article.
   */
  _toHashedPath(url) {
    let value = this._unicodeConverter.convertToByteArray(url);
    this._cryptoHash.init(this._cryptoHash.MD5);
    this._cryptoHash.update(value, value.length);

    let hash = CommonUtils.encodeBase32(this._cryptoHash.finish(false));
    let fileName = hash.substring(0, hash.indexOf("=")) + ".json";
    return OS.Path.join(OS.Constants.Path.profileDir, "readercache", fileName);
  },

  /**
   * Ensures the cache directory exists.
   *
   * @return Promise
   * @resolves When the cache directory exists.
   * @rejects OS.File.Error
   */
  _ensureCacheDir() {
    let dir = OS.Path.join(OS.Constants.Path.profileDir, "readercache");
    return OS.File.exists(dir).then(exists => {
      if (!exists) {
        return OS.File.makeDir(dir);
      }
      return undefined;
    });
  },

  /**
   * Sets a global language string value if possible. If langauge detection is
   * available, use that. Otherwise, revert to a simpler mechanism using the
   * document's lang attribute or charset.
   *
   * @return Promise
   * @resolves when the language is detected
   */
  _assignLanguage(article, attributes) {
    try {
      Cu.import("resource://modules/translation/LanguageDetector.jsm");
      return LanguageDetector.detectLanguage(article.textContent).then(result => {
        article.language = result.confident ? result.language : null;
      });
    } catch(ex) {
      return new Promise((resolve) => {
        resolve(this._assignSimpleLanguage(attributes));
      }).then(result => {
        article.language = result;
      });
    }
  },

  _assignSimpleLanguage(attributes) {
    var lang = attributes.lang.substring(0,2);
    if (lang) {
      return lang;
    }

    // If there is no lang attribute, try the charset.
    // We can only use this for charsets that are specific to one language.
    const charsetLang = new Map([
      [ "us-ascii",    "en" ],
      [ "iso-8859-6",  "ar" ],
      [ "iso-8859-7",  "el" ],
      [ "iso-8859-8",  "he" ],
      [ "iso-8859-9",  "tr" ],
      [ "iso-8859-11", "th" ],
      [ "jis_x0201",   "ja" ],
      [ "shift_jis",   "ja" ],
      [ "euc-jp",      "ja" ]
    ]);

    return charsetLang.get(attributes.charset);
  },

  _maybeAssignTextDirection(article) {
    // TODO: Remove the hardcoded language codes below once bug 1320265 is resolved.
    if (!article.dir && ["ar", "fa", "he", "ug", "ur"].includes(article.language)) {
      article.dir = "rtl";
    }
  },

  /**
   * Assigns the estimated reading time range of the article to the article object.
   *
   * @param article the article object to assign the reading time estimate to.
   */
  _assignReadTime(article) {
    let lang = article.language || "en";
    const readingSpeed = this._getReadingSpeedForLanguage(lang);
    const charactersPerMinuteLow = readingSpeed.cpm - readingSpeed.variance;
    const charactersPerMinuteHigh = readingSpeed.cpm + readingSpeed.variance;
    const length = article.length;

    article.readingTimeMinsSlow = Math.ceil(length / charactersPerMinuteLow);
    article.readingTimeMinsFast  = Math.ceil(length / charactersPerMinuteHigh);
  },

  /**
   * Returns the reading speed of a selection of languages with likely variance.
   *
   * Reading speed estimated from a study done on reading speeds in various languages.
   * study can be found here: http://iovs.arvojournals.org/article.aspx?articleid=2166061
   *
   * @return object with characters per minute and variance. Defaults to English
   *         if no suitable language is found in the collection.
   */
  _getReadingSpeedForLanguage(lang) {
    const readingSpeed = new Map([
      [ "en", {cpm: 987,  variance: 118 } ],
      [ "ar", {cpm: 612,  variance: 88 } ],
      [ "de", {cpm: 920,  variance: 86 } ],
      [ "es", {cpm: 1025, variance: 127 } ],
      [ "fi", {cpm: 1078, variance: 121 } ],
      [ "fr", {cpm: 998,  variance: 126 } ],
      [ "he", {cpm: 833,  variance: 130 } ],
      [ "it", {cpm: 950,  variance: 140 } ],
      [ "jw", {cpm: 357,  variance: 56 } ],
      [ "nl", {cpm: 978,  variance: 143 } ],
      [ "pl", {cpm: 916,  variance: 126 } ],
      [ "pt", {cpm: 913,  variance: 145 } ],
      [ "ru", {cpm: 986,  variance: 175 } ],
      [ "sk", {cpm: 885,  variance: 145 } ],
      [ "sv", {cpm: 917,  variance: 156 } ],
      [ "tr", {cpm: 1054, variance: 156 } ],
      [ "zh", {cpm: 255,  variance: 29 } ],
    ]);

    return readingSpeed.get(lang) || readingSpeed.get("en");
  },

  /**
   *
   * Check if the document to be parsed is text document.
   * @param doc the doc object to be parsed.
   * @return boolean
   *
   */
  _isDocumentPlainText(doc) {
    return doc.contentType == "text/plain";
  },

  /**
   *
   * The document to be parsed is text document and is converted to HTML format.
   * @param doc the doc object to be parsed.
   * @return doc
   *
   */
  _convertPlainTextDocument(doc) {
    let preTag = doc.querySelector("pre");
    let docFrag = doc.createDocumentFragment();
    let content = preTag.textContent;
    let paragraphs = content.split(/\r?\n\r?\n/);
    for (let para of paragraphs) {
      let pElem = doc.createElement("p");
      let lines = para.split(/\n/);
      for (let line of lines) {
        pElem.append(line);
        let brElem = doc.createElement("br");
        pElem.append(brElem);
      }
      docFrag.append(pElem);
    }
    preTag.parentNode.replaceChild(docFrag, preTag);
    return doc;
  },
};

// Don't try to parse the page if it has too many elements (for memory and
// performance reasons)
XPCOMUtils.defineLazyPreferenceGetter(
  ReaderMode, "parseNodeLimit", "reader.parse-node-limit", 0);