I'm using JavaScript to do some regular-expression work. Given that I'm working with well-formed source, I want to remove any space before [,.] and keep only one space after it.
As stated above and many times before, HTML is not a regular language and thus cannot be parsed with regular expressions.
You will have to do this recursively; I'd suggest crawling the DOM object.
Try something like this...
/**
 * Tidies the spacing around commas and periods in every text node at or
 * below `curr_element`, rewriting each text node's `data` in place.
 *
 * Inside a text node, any spaces before a "," or "." are removed and the
 * spaces after it are collapsed to exactly one, provided the next character
 * is neither a space nor a digit (so a decimal such as "3.14" is untouched).
 * Punctuation that is the very last character of a node is not matched,
 * because the pattern requires a following character.
 *
 * The text of <script> and <style> elements is skipped: it is code, not
 * prose, and re-spacing "." or "," there would alter selectors/expressions.
 *
 * @param {Node|null|undefined} curr_element Root of the subtree to process.
 *     `null`/`undefined` (e.g. a failed `getElementById`) is a no-op.
 * @returns {void}
 */
function regexReplaceInnerText(curr_element) {
    // A missing root means there is nothing to do; without this guard the
    // `.childNodes` access below would throw a TypeError.
    if (curr_element === null || curr_element === undefined) {
        return;
    }

    // Element names are upper-case in HTML documents but may be lower-case
    // in XML/XHTML ones, so normalise before comparing.
    var tagName = typeof curr_element.nodeName === "string"
        ? curr_element.nodeName.toUpperCase()
        : "";
    if (tagName === "SCRIPT" || tagName === "STYLE") {
        return;
    }

    var children = curr_element.childNodes;

    if (children.length <= 0) {
        // Termination case: a leaf node. Only text nodes carry `data` that
        // should be rewritten; empty elements, comments, etc. are ignored.
        var isTextNode = curr_element.nodeName === "#text" || curr_element.nodeType === 3;

        // Whitespace-only nodes cannot contain punctuation; skip them cheaply.
        if (isTextNode && /\S/.test(curr_element.data)) {
            curr_element.data = curr_element.data.replace(/ *(,|\.) *([^ 0-9])/g, "$1 $2");
        }
        return;
    }

    // Recursive case: visit every child in document order.
    for (var i = 0; children[i]; i++) {
        regexReplaceInnerText(children[i]);
    }
}
// Example usage: pass the node whose descendant text nodes should be rewritten.
// Lookups are guarded because either one can come back empty, and the result
// is kept inside a function scope so no extra globals are created.
(function () {
    // Whole document body (undefined if this runs before <body> is parsed).
    var bodyElement = document.getElementsByTagName("body")[0];
    if (bodyElement) {
        regexReplaceInnerText(bodyElement);
    }

    // ...or, if you don't want to do the whole document, a single element's
    // subtree. getElementById() returns null when no element has that id.
    var targetElement = document.getElementById("ElementToRegEx");
    if (targetElement) {
        regexReplaceInnerText(targetElement);
    }
}());