I'm using SimpleXML to load in some XML files (which I didn't write/provide and can't really change the format of).
Occasionally (eg one or two files out of ever
I think a workaround for having to write a compute_position function is to flatten the XML string (strip its line breaks) before processing. Here is a rewrite of the code posted by Josh:
/**
 * Parse an XML string with SimpleXML, attempting one repair pass when the
 * first parse fails.
 *
 * If the string parses as-is, the resulting element is returned unchanged.
 * Otherwise line breaks are stripped from the string, the character found at
 * each reported libxml error column is passed through htmlspecialchars(), and
 * the rebuilt string is parsed a second time.
 *
 * The caller's libxml_use_internal_errors() setting is put back on every
 * return path. The second parse runs under that restored setting.
 *
 * NOTE(review): the error columns come from parsing the original
 * (multi-line) string but are applied to the flattened copy, and the column
 * is used both as a character offset (mb_substr) and as a byte offset
 * (substr). That positional logic is kept as written; whether the column
 * really lands on the offending character needs to be confirmed against
 * real input before relying on the repair.
 *
 * @param string $xml Raw XML document.
 *
 * @return SimpleXMLElement|false Parsed document, or false when the rebuilt
 *                                string still fails to parse.
 */
function load_invalid_xml($xml)
{
    $use_internal_errors = libxml_use_internal_errors(true);
    // libxml_clear_errors() takes no parameters; passing one is an
    // ArgumentCountError on PHP 8.
    libxml_clear_errors();

    $sxe = simplexml_load_string($xml);
    // Compare against false explicitly: a valid but empty element such as
    // <root/> is falsy in a boolean context.
    if ($sxe !== false) {
        libxml_use_internal_errors($use_internal_errors);

        return $sxe;
    }

    // Take the errors of the failed parse and empty the buffer so they do not
    // get mixed with whatever the second parse reports.
    $errors = libxml_get_errors();
    libxml_clear_errors();

    // make string flat
    $xml = str_replace(array("\r\n", "\r", "\n"), "", $xml);
    $length = strlen($xml);

    // get file encoding; fall back to the mbstring default when detection fails
    $encoding = mb_detect_encoding($xml);
    if ($encoding === false) {
        $encoding = mb_internal_encoding();
    }

    $fixed_xml = '';
    $last_pos = 0;
    foreach ($errors as $error) {
        $pos = $error->column;
        // Skip positions that fall outside the string or before text that
        // has already been copied: a negative length would make substr()
        // copy the tail of the document a second time.
        if ($pos < $last_pos || $pos >= $length) {
            continue;
        }

        $invalid_char = mb_substr($xml, $pos, 1, $encoding);
        $fixed_xml .= substr($xml, $last_pos, $pos - $last_pos) . htmlspecialchars($invalid_char);
        $last_pos = $pos + 1;
    }
    $fixed_xml .= substr($xml, $last_pos);

    libxml_use_internal_errors($use_internal_errors);

    return simplexml_load_string($fixed_xml);
}
I've added the encoding handling because I had problems with the plain array[index] way of getting a character from a string.
This should all work, but for some reason $error->column gives me a different number than it should. I tried to debug this by simply adding some invalid characters to the XML and checking what value it returns, but had no luck. I hope someone can tell me what is wrong with this approach.
What you need is something that will use libxml's internal errors to locate invalid characters and escape them accordingly. Here's a mockup of how I'd write it. Take a look at the result of libxml_get_errors()
for error info.
/**
 * Parse an XML string with SimpleXML, attempting one repair pass when the
 * first parse fails.
 *
 * If the string parses as-is, the resulting element is returned. Otherwise
 * every libxml error is mapped to an offset in $xml through
 * compute_position(), the single byte at that offset is passed through
 * htmlspecialchars(), and the rebuilt string is parsed a second time.
 *
 * The caller's libxml_use_internal_errors() setting is put back on every
 * return path. The second parse runs under that restored setting.
 *
 * NOTE(review): compute_position() is not defined alongside this function;
 * it has to be supplied by the caller's code and is presumably expected to
 * turn a (line, column) pair into a 0-based byte offset into $xml - confirm.
 *
 * @param string $xml Raw XML document.
 *
 * @return SimpleXMLElement|false Parsed document, or false when the rebuilt
 *                                string still fails to parse.
 */
function load_invalid_xml($xml)
{
    $use_internal_errors = libxml_use_internal_errors(true);
    // libxml_clear_errors() takes no parameters; passing one is an
    // ArgumentCountError on PHP 8.
    libxml_clear_errors();

    $sxe = simplexml_load_string($xml);
    // Compare against false explicitly: a valid but empty element such as
    // <root/> is falsy in a boolean context.
    if ($sxe !== false) {
        libxml_use_internal_errors($use_internal_errors);

        return $sxe;
    }

    // Take the errors of the failed parse and empty the buffer so they do not
    // get mixed with whatever the second parse reports.
    $errors = libxml_get_errors();
    libxml_clear_errors();

    $length = strlen($xml);
    $fixed_xml = '';
    $last_pos = 0;
    foreach ($errors as $error) {
        // $pos is the position of the faulty character,
        // you have to compute it yourself
        $pos = compute_position($error->line, $error->column);
        // Skip offsets that fall outside the string or before text that has
        // already been copied: $xml[$pos] would be undefined, and a negative
        // length would make substr() copy the document tail a second time.
        if ($pos < $last_pos || $pos >= $length) {
            continue;
        }

        $fixed_xml .= substr($xml, $last_pos, $pos - $last_pos) . htmlspecialchars($xml[$pos]);
        $last_pos = $pos + 1;
    }
    $fixed_xml .= substr($xml, $last_pos);

    libxml_use_internal_errors($use_internal_errors);

    return simplexml_load_string($fixed_xml);
}