Extract text from HTML while preserving block-level element newlines

有些话、适合烂在心里 提交于 2019-11-30 02:57:49
svidgen

Consider:

/**
 * Returns the style for a node.
 *
 * @param n The node to check.
 * @param p The property to retrieve (usually 'display').
 * @link http://www.quirksmode.org/dom/getstyles.html
 */
this.getStyle = function( n, p ) {
  return n.currentStyle ?
    n.currentStyle[p] :
    document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}

/**
 * Converts HTML to text, preserving semantic newlines for block-level
 * elements.
 *
 * @param node - The HTML node to perform text extraction.
 */
this.toText = function( node ) {
  var result = '';

  if( node.nodeType == document.TEXT_NODE ) {
    // Replace repeated spaces, newlines, and tabs with a single space.
    result = node.nodeValue.replace( /\s+/g, ' ' );
  }
  else {
    for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
      result += _this.toText( node.childNodes[i] );
    }

    var d = _this.getStyle( node, 'display' );

    if( d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
        node.tagName == 'BR' || node.tagName == 'HR' ) {
      result += '\n';
    }
  }

  return result;
}

http://jsfiddle.net/3mzrV/2/

That is to say, with an exception or two, iterate through each node and print its contents, letting the browser's computed style tell you when to insert newlines.

This seems to be (nearly) doing what you want:

function getText($node) {
    return $node.contents().map(function () {
        if (this.nodeName === 'BR') {
            return '\n';
        } else if (this.nodeType === 3) {
            return this.nodeValue;
        } else {
            return getText($(this));
        }
    }).get().join('');
}

DEMO

It just recursively concatenates the values of all text nodes and replaces <br> elements with line breaks.

But there is no semantics in this, it completely relies the original HTML formatting (the leading and trailing white spaces seem to come from how jsFiddle embeds the HTML, but you can easily trim those). For example, notice how it indents the definition term.

If you really want to do this on a semantic level, you need a list of block level elements, recursively iterate over the elements and indent them accordingly. You treat different block elements differently with respect to indentation and line breaks around them. This should not be too difficult.

user3338098

based on https://stackoverflow.com/a/20384452/3338098 and fixed to support TEXT1<div>TEXT2</div>=>TEXT1\nTEXT2 and allow non-DOM nodes

/**
 * Returns the style for a node.
 *
 * @param n The node to check.
 * @param p The property to retrieve (usually 'display').
 * @link http://www.quirksmode.org/dom/getstyles.html
 */
function getNodeStyle( n, p ) {
  return n.currentStyle ?
    n.currentStyle[p] :
    document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}

//IF THE NODE IS NOT ACTUALLY IN THE DOM then this won't take into account <div style="display: inline;">text</div>
//however for simple things like `contenteditable` this is sufficient, however for arbitrary html this will not work
function isNodeBlock(node) {
  if (node.nodeType == document.TEXT_NODE) {return false;}
  var d = getNodeStyle( node, 'display' );//this is irrelevant if the node isn't currently in the current DOM.
  if (d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
      node.tagName == 'BR' || node.tagName == 'HR' ||
      node.tagName == 'DIV' // div,p,... add as needed to support non-DOM nodes
     ) {
    return true;
  }
  return false;
}

/**
 * Converts HTML to text, preserving semantic newlines for block-level
 * elements.
 *
 * @param node - The HTML node to perform text extraction.
 */
function htmlToText( htmlOrNode, isNode ) {
  var node = htmlOrNode;
  if (!isNode) {node = jQuery("<span>"+htmlOrNode+"</span>")[0];}
  //TODO: inject "unsafe" HTML into current DOM while guaranteeing that it won't
  //      change the visible DOM so that `isNodeBlock` will work reliably
  var result = '';
  if( node.nodeType == document.TEXT_NODE ) {
    // Replace repeated spaces, newlines, and tabs with a single space.
    result = node.nodeValue.replace( /\s+/g, ' ' );
  } else {
    for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
      result += htmlToText( node.childNodes[i], true );
      if (i < j-1) {
        if (isNodeBlock(node.childNodes[i])) {
          result += '\n';
        } else if (isNodeBlock(node.childNodes[i+1]) &&
                   node.childNodes[i+1].tagName != 'BR' &&
                   node.childNodes[i+1].tagName != 'HR') {
          result += '\n';
        }
      }
    }
  }
  return result;
}

the main change was

      if (i < j-1) {
        if (isNodeBlock(node.childNodes[i])) {
          result += '\n';
        } else if (isNodeBlock(node.childNodes[i+1]) &&
                   node.childNodes[i+1].tagName != 'BR' &&
                   node.childNodes[i+1].tagName != 'HR') {
          result += '\n';
        }
      }

to check neighboring blocks to determine the appropriateness of adding a newline.

I would like to suggest a little edit from the code of svidgen:

function getText(n, isInnerNode) {
  var rv = '';
  if (n.nodeType == 3) {
      rv = n.nodeValue;
  } else {
      var partial = "";
      var d = getComputedStyle(n).getPropertyValue('display');
      if (isInnerNode && d.match(/^block/) || d.match(/list/) || n.tagName == 'BR') {
          partial += "\n";
      }

      for (var i = 0; i < n.childNodes.length; i++) {
          partial += getText(n.childNodes[i], true);
      }
      rv = partial;
  }
  return rv;
 };

I just added the line break before the for loop, in this way we have a newline before the block, and also a variable to avoid the newline for the root element.

The code should be invocated:

getText(document.getElementById("divElement"))
易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!