Get Google Document as HTML

后端 未结 8 2008
无人及你
无人及你 2020-11-27 17:28

I had a wild idea that I could build a website blog for an unsophisticated user friend using Google Drive Documents to back it. I was able to create a contentService that c

8条回答
  •  伪装坚强ぢ
    2020-11-27 18:12

    I've had this problem as well. The HTML that the Document HTML Export spits out is really ugly, so this was my solution:

    /**
     * Fetches a Google Doc as HTML through the Docs export endpoint, strips most
     * of the generated markup, and returns the cleaned HTML string.
     *
     * Cleaned output is stored in the script cache under the doc ID for 15 minutes.
     *
     * @param {string} id the id of the google doc
     * @param {boolean} [useCaching] enable or disable reading from the cache. default true.
     *     A fresh fetch still refreshes the cached copy.
     * @return {string} the doc's body in html format
     * @throws {string} if `id` is missing or cannot be opened, if `useCaching`
     *     is not a boolean, or if the export request does not return HTTP 200.
     *     Plain strings are thrown (not Error objects) to stay compatible with
     *     existing callers.
     */
    function getContent(id, useCaching) {

      if (!id) {
        throw "Please call this API with a valid Google Doc ID";
      }

      // `== null` deliberately matches both null and undefined.
      if (useCaching == null) {
        useCaching = true;
      }

      if (typeof useCaching !== "boolean") {
        throw "If you're going to specify useCaching, it must be boolean.";
      }

      var cache = CacheService.getScriptCache();
      // Only consult the cache when the caller allows it.
      var html = useCaching ? cache.get(id) : null;

      if (html) {
        Logger.log("Pulling doc html from cache...");
      } else {

        Logger.log("Grabbing and parsing fresh html from the doc...");

        // Opening the file first turns a bad ID into a readable error message.
        try {
          DriveApp.getFileById(id);
        } catch (err) {
          throw "Please call this API with a valid Google Doc ID. " + err.message;
        }

        DriveApp.getStorageUsed(); // needed to get Drive Scope requested in ScriptApp.getOAuthToken();
        var url = "https://docs.google.com/feeds/download/documents/export/Export?id=" + id + "&exportFormat=html";
        var param = {
          method: "get",
          headers: {"Authorization": "Bearer " + ScriptApp.getOAuthToken()},
          muteHttpExceptions: true
        };

        var response = UrlFetchApp.fetch(url, param);

        // With muteHttpExceptions on, failures come back as ordinary responses;
        // reject them here so an error page is never cleaned up and cached.
        var status = response.getResponseCode();
        if (status !== 200) {
          throw "Unable to export Google Doc as HTML (HTTP " + status + ")";
        }

        html = response.getContentText();

        // nuke the whole head section, including the stylesheet and meta tag
        // ([\s\S] instead of "." so a head spanning several lines is removed too)
        html = html.replace(/^[\s\S]*?<\/head>/, '');
        // remove almost all html attributes
        html = html.replace(/ (id|class|style|start|colspan|rowspan)="[^"]*"/g, '');
        // remove all of the spans, as well as the outer html and body
        html = html.replace(/<(span|\/span|body|\/body|html|\/html)>/g, '');
        // normalise line breaks to the self-closing form
        html = html.replace(/<br>/g, '<br />');

        // cache doc contents for 15 minutes, in case we get a lot of requests.
        // Caching is best-effort: presumably put() can fail (e.g. value over the
        // cache's size limit), and that should not discard the html we just built.
        try {
          cache.put(id, html, 900);
        } catch (err) {
          Logger.log("Could not cache doc html: " + err.message);
        }

      }

      Logger.log(html);

      return html;

    }

    https://gist.github.com/xd1936/cc229d14a89e6327336177bb07ac2980

提交回复
热议问题