Extract text from HTML while preserving block-level element newlines

后端 未结 5 1637
难免孤独
难免孤独 2020-12-13 18:28

Background

Most questions about extracting text from HTML (i.e., stripping the tags) use:

jQuery( htmlString ).text();

While this

5条回答
  •  不知归路
    2020-12-13 19:08

    Based on https://stackoverflow.com/a/20384452/3338098, fixed so that
    `TEXT1<div>TEXT2</div>` is converted to `TEXT1\nTEXT2`, and extended to
    allow non-DOM nodes.

    /**
     * Looks up a single style property of a node.
     *
     * When the node exposes a `currentStyle` object, the property is read
     * straight from it; otherwise the value comes from
     * `document.defaultView.getComputedStyle`.
     *
     * @param n The node to inspect.
     * @param p The name of the style property to read (usually 'display').
     * @returns The value of property `p` as reported by whichever lookup was used.
     * @link http://www.quirksmode.org/dom/getstyles.html
     */
    function getNodeStyle( n, p ) {
      // Prefer the node's own `currentStyle` object whenever it is present.
      if ( n.currentStyle ) {
        return n.currentStyle[p];
      }

      // Otherwise ask the document's window for the computed style.
      var computed = document.defaultView.getComputedStyle( n, null );
      return computed.getPropertyValue( p );
    }
    
    // NOTE: if the node is not actually attached to the DOM, its computed
    // `display` value is not meaningful, so block detection relies on the
    // tag-name fallback list in `isNodeBlock`. For simple things like
    // `contenteditable` this is sufficient; for arbitrary HTML it will not
    // work reliably.

    /**
     * Decides whether a node should be treated as block-level, i.e. whether
     * it should be separated from its siblings by a newline.
     *
     * @param node The DOM node to classify.
     * @returns {boolean} true for elements whose computed `display` starts
     *   with "block" or contains "list" or "row", and for BR, HR and DIV
     *   elements regardless of style; false for everything else.
     */
    function isNodeBlock( node ) {
      // Only elements carry a computed style. Text nodes, comments, etc. are
      // never blocks and must not be handed to getComputedStyle.
      if (node.nodeType !== document.ELEMENT_NODE) {
        return false;
      }

      // The style lookup is irrelevant if the node isn't in the current DOM;
      // fall back to an empty string so the regex tests below stay safe.
      var display = getNodeStyle( node, 'display' ) || '';

      return /^block/.test( display ) ||
        /list/.test( display ) ||
        /row/.test( display ) ||
        node.tagName === 'BR' ||
        node.tagName === 'HR' ||
        node.tagName === 'DIV'; // div,p,... add as needed to support non-DOM nodes
    }

    /**
     * Converts HTML to text, preserving semantic newlines for block-level
     * elements.
     *
     * @param htmlOrNode An HTML string, or a DOM node when `isNode` is truthy.
     * @param isNode Pass a truthy value when `htmlOrNode` is already a node;
     *   otherwise `htmlOrNode` is parsed as HTML with jQuery.
     * @returns {string} The extracted text. Runs of whitespace inside a text
     *   node collapse to a single space; a '\n' is inserted between siblings
     *   around block-level nodes.
     */
    function htmlToText( htmlOrNode, isNode ) {
      var node = htmlOrNode;
      if (!isNode) {
        // Wrap the markup in a container so that plain text is not parsed as
        // a selector and every top-level node of the fragment is visited.
        node = jQuery( '<span>' + htmlOrNode + '</span>' )[0];
      }
      //TODO: inject "unsafe" HTML into current DOM while guaranteeing that it won't
      //      change the visible DOM so that `isNodeBlock` will work reliably
      var result = '';

      if (node.nodeType === document.TEXT_NODE) {
        // Replace repeated spaces, newlines, and tabs with a single space.
        result = node.nodeValue.replace( /\s+/g, ' ' );
      } else {
        for (var i = 0, j = node.childNodes.length; i < j; i++) {
          result += htmlToText( node.childNodes[i], true );

          // Newlines are only ever inserted between siblings, never after
          // the last child.
          if (i < j - 1) {
            if (isNodeBlock( node.childNodes[i] )) {
              // A block-level child is always followed by a newline.
              result += '\n';
            } else if (isNodeBlock( node.childNodes[i + 1] ) &&
                       node.childNodes[i + 1].tagName !== 'BR' &&
                       node.childNodes[i + 1].tagName !== 'HR') {
              // Inline content followed by a block: break before the block.
              // BR/HR are skipped here because they emit their own newline
              // when they become the current child.
              result += '\n';
            }
          }
        }
      }

      return result;
    }

    The main change was:

          // Excerpt from the child loop of htmlToText: decide whether a newline
          // goes between child i and child i+1 (never after the last child).
          if (i < j-1) {
            if (isNodeBlock(node.childNodes[i])) {
              // The current child is block-level: always end it with a newline.
              result += '\n';
            } else if (isNodeBlock(node.childNodes[i+1]) &&
                       node.childNodes[i+1].tagName != 'BR' &&
                       node.childNodes[i+1].tagName != 'HR') {
              // The current child is inline but the next one is a block: break
              // before it. BR/HR are excluded because they get their newline from
              // the first branch once they are the current child.
              result += '\n';
            }
          }
    

    to check neighboring blocks to determine the appropriateness of adding a newline.

提交回复
热议问题