How to wrap part of a text in a node with JavaScript

后端 未结 5 1344
-上瘾入骨i
-上瘾入骨i 2020-11-28 23:36

I have a challenging problem to solve. I'm working on a script which takes a regex as an input. This script then finds all matches for this regex in a document and wraps each…

5条回答
  •  旧时难觅i
    2020-11-29 00:22

    Here are two ways to deal with this.

    I don't know if the following will exactly match your needs. It's a simple enough solution to the problem, but at least it doesn't use RegEx to manipulate HTML tags. It performs pattern matching against the raw text and then uses the DOM to manipulate the content.


    First approach

    This approach creates only one tag per match, leveraging some less common browser APIs.
    (See the main problem of this approach below the demo, and if not sure, use the second approach).

    The Range class represents a text fragment. It has a surroundContents function that lets you wrap a range in an element. Except it has a caveat:

    This method is nearly equivalent to newNode.appendChild(range.extractContents()); range.insertNode(newNode). After surrounding, the boundary points of the range include newNode.

    An exception will be thrown, however, if the Range splits a non-Text node with only one of its boundary points. That is, unlike the alternative above, if there are partially selected nodes, they will not be cloned and instead the operation will fail.

    Well, the workaround is provided in the MDN, so all's good.

    So here's an algorithm:

    • Make a list of Text nodes and keep their start indices in the text
    • Concatenate these nodes' values to get the text
    • Find matches over the text, and for each match:

      • Find the start and end nodes of the match, comparing the nodes' start indices to the match position
      • Create a Range over the match
      • Let the browser do the dirty work using the trick above
      • Rebuild the node list since the last action changed the DOM

    Here's my implementation with a demo:

    /**
     * Wraps every match of `regex` in the text content of `element` in a
     * single <span class="highlight">.
     *
     * The values of all descendant text nodes are concatenated and matched as
     * one string, so a match may start in one text node and end in another.
     * Each match is turned into a Range whose contents are extracted and moved
     * into the new span.
     *
     * Caveat: extractContents() clones partially selected ancestor elements, so
     * a match that crosses a block-level element boundary duplicates that
     * element.
     *
     * @param {Element} element - Root element whose text is searched.
     * @param {RegExp} regex - Pattern to highlight. Its lastIndex is moved by
     *     exec(). Without the "g" (or "y") flag only the first match is wrapped.
     * @returns {undefined}
     */
    function highlight(element, regex) {
        var document = element.ownerDocument;

        // exec() only moves lastIndex forward for global or sticky patterns. For
        // any other pattern it keeps returning the same first match, so the loop
        // below has to stop after one pass instead of spinning forever.
        var advances = regex.global || regex.sticky;

        // Lists the text nodes under `element` in document order, each paired
        // with the offset at which its value starts in the concatenated text.
        var getNodes = function() {
            var nodes = [],
                offset = 0,
                node,
                nodeIterator = document.createNodeIterator(element, NodeFilter.SHOW_TEXT, null, false);

            while (node = nodeIterator.nextNode()) {
                nodes.push({
                    textNode: node,
                    start: offset,
                    length: node.nodeValue.length
                });
                offset += node.nodeValue.length;
            }
            return nodes;
        };

        var nodes = getNodes();
        if (!nodes.length)
            return;

        var text = "";
        for (var i = 0; i < nodes.length; ++i)
            text += nodes[i].textNode.nodeValue;

        var match;
        while (match = regex.exec(text)) {
            // Prevent empty matches causing infinite loops
            if (!match[0].length)
            {
                // Bumping lastIndex only helps when exec() actually honours it.
                if (!advances)
                    break;
                regex.lastIndex++;
                continue;
            }

            // Find the start and end text node
            var startNode = null, endNode = null;
            for (i = 0; i < nodes.length; ++i) {
                var node = nodes[i];

                // Node ends before the match begins
                if (node.start + node.length <= match.index)
                    continue;

                if (!startNode)
                    startNode = node;

                // First node that reaches the end of the match
                if (node.start + node.length >= match.index + match[0].length)
                {
                    endNode = node;
                    break;
                }
            }

            // Range offsets are relative to their own text node, not to `text`
            var range = document.createRange();
            range.setStart(startNode.textNode, match.index - startNode.start);
            range.setEnd(endNode.textNode, match.index + match[0].length - endNode.start);

            var spanNode = document.createElement("span");
            spanNode.className = "highlight";

            // Same effect as surroundContents(), but also works when the range
            // only partially selects a non-text node.
            spanNode.appendChild(range.extractContents());
            range.insertNode(spanNode);

            // The DOM changed, so the node list and its offsets are stale
            nodes = getNodes();

            if (!advances)
                break;
        }
    }
    
    // Demo harness: remembers the pristine test markup so every run starts
    // from un-highlighted HTML.
    var testDiv = document.getElementById("test-cases");
    var originalHtml = testDiv.innerHTML;

    /**
     * Restores the original markup, builds a global regex from the "regex"
     * input's value and highlights its matches. Any error (for example an
     * invalid pattern) is shown in place of the test cases.
     */
    function test() {
        testDiv.innerHTML = originalHtml;
        try {
            var pattern = document.getElementById("regex").value;
            highlight(testDiv, new RegExp(pattern, "g"));
        } catch (err) {
            testDiv.innerText = err;
        }
    }

    // Re-run on demand, and once immediately.
    document.getElementById("runBtn").onclick = test;
    test();
    /* Look of the elements carrying the "highlight" class: yellow fill with a
       rounded orange outline. */
    .highlight {
      background-color: yellow;
      border: 1px solid orange;
      border-radius: 5px;
    }
    
    /* Boxed container with a gray outline; presumably wraps each demo test
       case (the markup is not shown here, so confirm against the HTML). */
    .section {
      border: 1px solid gray;
      padding: 10px;
      margin: 10px;
    }
    RegEx:
    foo bar baz

    HTML is a language used to make websites. It was developed by CERN employees in the early 90s.

    This program is not stable yet. Do not use this in production yet.

    foo bar baz

    Ok, that was the lazy approach which, unfortunately, doesn't work for some cases. It works well if you only highlight across inline elements, but breaks when there are block elements along the way because of the following property of the extractContents function:

    Partially selected nodes are cloned to include the parent tags necessary to make the document fragment valid.

    That's bad. It'll just duplicate block-level nodes. Try the previous demo with the baz\s+HTML regex if you want to see how it breaks.


    Second approach

    This approach iterates over the matching nodes, creating tags along the way.

    The overall algorithm is straightforward as it just wraps each matching node in its own <span>. But this means we have to deal with partially matching text nodes, which requires some more effort.

    If a text node matches partially, it's split with the splitText function:

    After the split, the current node contains all the content up to the specified offset point, and a newly created node of the same type contains the remaining text. The newly created node is returned to the caller.

    /**
     * Wraps the text matched by `regex` inside `element` in
     * <span class="highlight"> elements, one span per matched text node.
     *
     * The values of all descendant text nodes are concatenated and matched as
     * one string. For each match, every text node overlapping it is wrapped in
     * its own span; a text node that is only partly covered is first cut with
     * splitText() so that only the matching part is wrapped. No element is
     * cloned or moved, so matches may cross element boundaries.
     *
     * @param {Element} element - Root element whose text is searched.
     * @param {RegExp} regex - Pattern to highlight. Its lastIndex is moved by
     *     exec(). Without the "g" (or "y") flag only the first match is wrapped.
     * @returns {undefined}
     */
    function highlight(element, regex) {
        var document = element.ownerDocument;

        // exec() only moves lastIndex forward for global or sticky patterns. For
        // any other pattern it keeps returning the same first match, so the loop
        // below has to stop after one pass instead of re-wrapping it forever.
        var advances = regex.global || regex.sticky;

        // One record per text node: the node and the offset of its value in `text`
        var nodes = [],
            text = "",
            node,
            nodeIterator = document.createNodeIterator(element, NodeFilter.SHOW_TEXT, null, false);

        while (node = nodeIterator.nextNode()) {
            nodes.push({
                textNode: node,
                start: text.length
            });
            text += node.nodeValue;
        }

        if (!nodes.length)
            return;

        var match;
        while (match = regex.exec(text)) {
            var matchLength = match[0].length;

            // Prevent empty matches causing infinite loops
            if (!matchLength)
            {
                // Bumping lastIndex only helps when exec() actually honours it.
                if (!advances)
                    break;
                regex.lastIndex++;
                continue;
            }

            for (var i = 0; i < nodes.length; ++i) {
                var entry = nodes[i];
                var nodeLength = entry.textNode.nodeValue.length;

                // Skip nodes before the match
                if (entry.start + nodeLength <= match.index)
                    continue;

                // Break after the match
                if (entry.start >= match.index + matchLength)
                    break;

                // Split the start node if required. The tail becomes the next
                // record, so the following iteration handles the matching part.
                if (entry.start < match.index) {
                    nodes.splice(i + 1, 0, {
                        textNode: entry.textNode.splitText(match.index - entry.start),
                        start: match.index
                    });
                    continue;
                }

                // Split the end node if required. The current node keeps the
                // matching part; the remainder is recorded for later matches.
                if (entry.start + nodeLength > match.index + matchLength) {
                    nodes.splice(i + 1, 0, {
                        textNode: entry.textNode.splitText(match.index + matchLength - entry.start),
                        start: match.index + matchLength
                    });
                }

                // Highlight the current node
                var spanNode = document.createElement("span");
                spanNode.className = "highlight";

                // Put the span where the text node was, then move the node inside
                entry.textNode.parentNode.replaceChild(spanNode, entry.textNode);
                spanNode.appendChild(entry.textNode);
            }

            if (!advances)
                break;
        }
    }
    
    // Demo harness: remembers the pristine test markup so every run starts
    // from un-highlighted HTML.
    var testDiv = document.getElementById("test-cases");
    var originalHtml = testDiv.innerHTML;

    /**
     * Restores the original markup, builds a global regex from the "regex"
     * input's value and highlights its matches. Any error (for example an
     * invalid pattern) is shown in place of the test cases.
     */
    function test() {
        testDiv.innerHTML = originalHtml;
        try {
            var pattern = document.getElementById("regex").value;
            highlight(testDiv, new RegExp(pattern, "g"));
        } catch (err) {
            testDiv.innerText = err;
        }
    }

    // Re-run on demand, and once immediately.
    document.getElementById("runBtn").onclick = test;
    test();
    /* Look of the elements carrying the "highlight" class: plain yellow fill. */
    .highlight {
      background-color: yellow;
    }
    
    /* Boxed container with a gray outline; presumably wraps each demo test
       case (the markup is not shown here, so confirm against the HTML). */
    .section {
      border: 1px solid gray;
      padding: 10px;
      margin: 10px;
    }
    RegEx:
    foo bar baz

    HTML is a language used to make websites. It was developed by CERN employees in the early 90s.

    This program is not stable yet. Do not use this in production yet.

    foo bar baz

    This should be good enough for most cases I hope. If you need to minimize the number of tags it can be done by extending this function, but I wanted to keep it simple for now.

提交回复
热议问题