How can I allow only a specific set of HTML tags and a specific set of attributes using a general regex?
Allowed HTML Tags:
p|body|b
Here is a Perl solution using a PCRE-compatible regex. It is not aware of comments, doctype, CDATA, etc. Those should be added for a more complete solution.
# Allowed tag and attribute names.
# Each list is joined into a regex alternation because it is interpolated
# verbatim into $re_tags below. Names must therefore not contain regex
# metacharacters.
my $allowed_tags_open = join '|', qw(
    p body b u em strong ul ol li h1 h2 h3 h4 h5 h6
    a tr td table tbody label div sup sub caption
);
my $allowed_tags_self_closing = join '|', qw( img br hr );
my $allowed_attributes = join '|', qw(
    alt href tcmuri title height width align valign rowspan colspan
    src summary class id name target nowrap scope axis
    cellpadding cellspacing dir lang rel
);
$allowed_attributes .= '|style'; # for testing

# Definitions for matching allowed tag and attribute names.
# This pattern contains only a (DEFINE) block, so it never consumes input by
# itself; interpolate it into another pattern and call the groups by name:
#   (?&tags_open)          allowed open/close tag name, with optional leading /
#   (?&tags_self_closing)  allowed self-closing tag name
#   (?&tags)               either of the two above
#   (?&attribs)            allowed attribute name
# The negative lookaheads make sure the whole name was matched, so that an
# allowed name followed by further name characters is rejected.
my $re_tags = qr~(?(DEFINE)
    (?<tags_open>
        /?+
        (?>
            (?: $allowed_tags_open )
            (?! [^\s>/] )   # from (?&tagname)
        )
    )
    (?<tags_self_closing>
        (?>
            (?: $allowed_tags_self_closing )
            (?! [^\s>/] )   # from (?&tagname)
        )
    )
    (?<tags> (?> (?&tags_open) | (?&tags_self_closing) ) )
    (?<attribs>
        (?>
            (?: $allowed_attributes )
            (?! [^\s=/>] )  # from (?&attname)
        )
    )
)~xi;
# Definitions for matching the tags.
# Trying to follow compatible tokenization characteristics of modern browsers.
# This pattern contains only a (DEFINE) block; interpolate it into another
# pattern and call the groups by name:
#   (?&tagname)  tag name, including the leading / of a closing tag
#   (?&attname)  attribute name
#   (?&attval)   attribute value: double-quoted, single-quoted or unquoted
#   (?&attrib)   attribute name with an optional = value part
#   (?&crap)     one stray non-space character inside a tag
#   (?&tag)      a complete tag, from the opening angle bracket to the closing one
my $re_defs = qr~(?(DEFINE)
    (?<tagname> [a-z/][^\s>/]*+ )    # will match the leading / in closing tags
    (?<attname> [^\s>/][^\s=/>]*+ )  # first char can be pretty much anything, including =
    (?<attval>
        (?>
            "[^"]*+" |
            \'[^\']*+\' |
            [^\s>]*+                 # unquoted values can contain quotes, = and /
        )
    )
    (?<attrib>
        (?&attname)
        (?: \s*+
            = \s*+
            (?&attval)
        )?+
    )
    (?<crap> (?!/>)[^\s>] )          # most crap inside tag is ignored, but don't eat the last / in self closing tags
    (?<tag>
        <(?&tagname)
        (?: \s*+                     # spaces between attributes are not required, e.g. a="1"b="2"
            (?>
                (?&attrib) |         # order matters
                (?&crap)             # if not an attribute, eat the crap
            )
        )*+
        \s*+ /?+
        >
    )
)~xi;
# Sanitize every tag found in an HTML string.
# Takes the HTML text as its only argument and returns a copy in which each
# substring matching (?&tag) has been replaced by the result of
# sanitize_tag(); text outside tags is left untouched.
sub sanitize_html {
    my ($html) = @_;

    # $& is the complete tag text matched by (?&tag).
    $html =~ s/(?&tag) $re_defs/ sanitize_tag($&) /gexo;

    return $html;
}
# Sanitize a single tag.
# Takes the full text of one tag (from the opening angle bracket to the
# closing one) and returns:
#   - the empty string when the tag name is not matched by (?&tags);
#   - a bare closing tag, with everything after the name dropped, when the
#     name starts with /;
#   - otherwise the tag rebuilt from its name, the attributes kept by
#     sanitize_attributes(), and its original ending (either > or />).
sub sanitize_tag {
    my ($tag) = @_;

    # /s lets (.*?) cross newlines: (?&tag) accepts tags whose attributes
    # span several lines, and without /s such tags would fail to match here
    # and be removed even though their name is allowed.
    my ($name, $attr, $end) =
        $tag =~ /\A < ((?&tags)) (.*?) ( \/?+ > ) \z $re_tags/xso
        or return ''; # return empty string if not allowed tag

    # return a new clean closing tag if it's a closing tag
    return "<$name>" if substr($name, 0, 1) eq '/';

    # clean attributes
    return "<$name" . sanitize_attributes($attr) . $end;
}
# Filter the attribute part of a tag.
# Takes the text between the tag name and the tag ending, walks it from the
# start one token at a time, and returns the attributes whose name is matched
# by (?&attribs), each preceded by a single space. Returns the empty string
# when nothing is kept.
sub sanitize_attributes {
    my ($attr) = @_;
    my @kept;

    # \G makes every iteration resume exactly where the previous token
    # ended; the loop stops at the first position where neither an
    # attribute nor a stray character can be matched.
    while ($attr =~ m{
        \G
        \s*+ # spaces between attributes not required
        (?>
        ( (?&attrib) ) | # order matters
        (?&crap) # if not an attribute, eat the crap
        )
        $re_defs
    }gxo) {
        my $candidate = $1;

        # $1 is only set when the attribute branch matched.
        next unless $candidate;
        next unless $candidate =~ /^(?&attribs) $re_tags/xo;

        push @kept, " $candidate";
    }

    return join '', @kept;
}
# Demo driver: print the raw sample, a 70-character rule, then the sanitized
# sample.
# NOTE(review): the sample lines below look as if their HTML markup was
# stripped at some point; confirm against the original test input.
my $test = <<'_TEST_';
simple
self
closing
new tag and known attribute
one unknown attr
attr in closing tag
="">crap be gone not bold
tricky"//> not bold
_TEST_
my $rule = '-' x 70;
print "$test\n";
print "$rule\n";
print sanitize_html($test);
Output:
simple
self
closing
new tag and known attribute
one unknown attr
attr in closing tag
="">crap be gone not bold
tricky"//> not bold
----------------------------------------------------------------------
simple
self
closing
new tag and known attribute
one unknown attr
attr in closing tag
crap be gone not bold
tricky not bold
See how your browser parses the tricky tags: jsFiddle
Possibly relevant: