Most questions about extracting text from HTML (i.e., stripping the tags) use:
jQuery( htmlString ).text();
While this
based on https://stackoverflow.com/a/20384452/3338098
and fixed to support TEXT1=>TEXT1\nTEXT2 and allow non-DOM nodes
/**
* Returns the style for a node.
*
* @param n The node to check.
* @param p The property to retrieve (usually 'display').
* @link http://www.quirksmode.org/dom/getstyles.html
*/
function getNodeStyle( n, p ) {
return n.currentStyle ?
n.currentStyle[p] :
document.defaultView.getComputedStyle(n, null).getPropertyValue(p);
}
//IF THE NODE IS NOT ACTUALLY IN THE DOM then this won't take into account text
//however for simple things like `contenteditable` this is sufficient, however for arbitrary html this will not work
function isNodeBlock(node) {
if (node.nodeType == document.TEXT_NODE) {return false;}
var d = getNodeStyle( node, 'display' );//this is irrelevant if the node isn't currently in the current DOM.
if (d.match( /^block/ ) || d.match( /list/ ) || d.match( /row/ ) ||
node.tagName == 'BR' || node.tagName == 'HR' ||
node.tagName == 'DIV' // div,p,... add as needed to support non-DOM nodes
) {
return true;
}
return false;
}
/**
* Converts HTML to text, preserving semantic newlines for block-level
* elements.
*
* @param node - The HTML node to perform text extraction.
*/
function htmlToText( htmlOrNode, isNode ) {
var node = htmlOrNode;
if (!isNode) {node = jQuery(""+htmlOrNode+"")[0];}
//TODO: inject "unsafe" HTML into current DOM while guaranteeing that it won't
// change the visible DOM so that `isNodeBlock` will work reliably
var result = '';
if( node.nodeType == document.TEXT_NODE ) {
// Replace repeated spaces, newlines, and tabs with a single space.
result = node.nodeValue.replace( /\s+/g, ' ' );
} else {
for( var i = 0, j = node.childNodes.length; i < j; i++ ) {
result += htmlToText( node.childNodes[i], true );
if (i < j-1) {
if (isNodeBlock(node.childNodes[i])) {
result += '\n';
} else if (isNodeBlock(node.childNodes[i+1]) &&
node.childNodes[i+1].tagName != 'BR' &&
node.childNodes[i+1].tagName != 'HR') {
result += '\n';
}
}
}
}
return result;
}
the main change was
if (i < j-1) {
if (isNodeBlock(node.childNodes[i])) {
result += '\n';
} else if (isNodeBlock(node.childNodes[i+1]) &&
node.childNodes[i+1].tagName != 'BR' &&
node.childNodes[i+1].tagName != 'HR') {
result += '\n';
}
}
to check neighboring blocks to determine the appropriateness of adding a newline.