I'm building an XML file from scratch and need to know if htmlentities() converts every character that could potentially break an XML file (and possibly UTF-8 data)?
Thought I'd add this for those who need to sanitize XML content without losing the elements' attributes.
/**
 * Escapes the text content of XML elements while keeping the tags and their
 * attributes intact, so the result can be handed to an XML parser.
 *
 * Every `<name attrs>...</name>` pair found in the input is re-emitted with
 * its opening tag verbatim. Content that itself contains such a pair is
 * processed recursively; leaf content goes through htmlspecialchars() with
 * ENT_NOQUOTES, so `&`, `<` and `>` are escaped and quotes are left alone.
 * Input that contains no such pair at all is escaped as a whole.
 *
 * Limitations of this regex-based approach:
 *  - text located between sibling elements is not copied to the output;
 *  - elements with empty content, and self-closing elements, are not matched
 *    as elements;
 *  - entities already present in the content are escaped again.
 *
 * @param string $xml_content     Raw XML (or plain text) to sanitize.
 * @param bool   $xml_followdepth True for the top-level call; the function
 *                                passes false when it recurses.
 * @return string The sanitized markup.
 */
function sanitizeXML($xml_content, $xml_followdepth=true){
    // Group 1: tag name plus attributes, group 2: tag name (back-referenced by
    // the closing tag), group 3: everything between opening and closing tag.
    $elementPattern = '%<((\w+)\s?.*?)>(.+?)</\2>%si';

    if (!preg_match_all($elementPattern, $xml_content, $xmlElements, PREG_SET_ORDER)) {
        // No element pair found: treat the whole input as plain text.
        return htmlspecialchars($xml_content, ENT_NOQUOTES);
    }

    $xmlSafeContent = '';
    foreach ($xmlElements as $xmlElem) {
        // Opening tag is copied verbatim so attributes survive untouched.
        $xmlSafeContent .= '<'.$xmlElem[1].'>';

        if (preg_match($elementPattern, $xmlElem[3])) {
            // Content holds child elements: descend instead of escaping them.
            $xmlSafeContent .= sanitizeXML($xmlElem[3], false);
        } else {
            $xmlSafeContent .= htmlspecialchars($xmlElem[3], ENT_NOQUOTES);
        }

        $xmlSafeContent .= '</'.$xmlElem[2].'>';
    }

    // NOTE(review): the top-level branch only prepended an empty string, so
    // $xml_followdepth does not change the result. Presumably a document
    // prefix was intended for the top-level call - confirm before adding one.
    return $xmlSafeContent;
}
Usage:
$body = <<<EG
-
2016 & Au Rendez-Vous Des Enfoir&
EG;
$newXml = sanitizeXML($body);
var_dump($newXml);
Returns:
-
2016 &amp; Au Rendez-Vous Des Enfoir&amp;