I can't parse this URL: http://foldmunka.net
$ch = curl_init(\"http://foldmunka.net\");
//curl_setopt($ch, CURLOPT_NOBODY, true);
curl_setopt($ch, CURLOPT_RE
Here is a Simple HTML DOM Parser solution, just for comparison. Its output is similar to that of the DomDocument solution, but this one is more complicated and runs much slower (~2300ms against DomDocument's ~100ms), so I don't recommend using it:
Updated to work with elements inside elements.
tag == 'a')
$element->innertext = makePlainText($element->innertext, LOAD_FROM_STRING);
}
/**
 * Serialization callback that appends the plain-text contribution of a single
 * node to the global $processed_plain_text buffer.
 *
 * - 'text' nodes add their inner text, unless their parent is an anchor
 *   (anchors are handled as a whole below) or a script/style element.
 * - 'img' nodes add their alt and title attributes.
 * - 'a' nodes add their alt and title attributes followed by their inner text.
 * - every other tag adds nothing.
 *
 * Each appended piece is followed by a single space.
 *
 * @param object $element Node exposing tag, parent, innertext, alt and title.
 * @return void
 */
function callback_buildPlainText($element)
{
    global $processed_plain_text;

    $tag_name = $element->tag;

    if ($tag_name == 'text') {
        $owner_tag = $element->parent->tag;

        // Text inside an anchor is emitted together with the anchor itself,
        // and script/style content is not wanted in the output at all.
        if ($owner_tag == 'a' || in_array($owner_tag, array('script', 'style'))) {
            return;
        }

        $processed_plain_text .= $element->innertext . ' ';
        return;
    }

    if ($tag_name == 'img') {
        $processed_plain_text .= $element->alt . ' ' . $element->title . ' ';
        return;
    }

    if ($tag_name == 'a') {
        $processed_plain_text .= $element->alt . ' ' . $element->title . ' ' . $element->innertext . ' ';
    }
}
/**
 * Builds a plain-text rendering of an HTML document using Simple HTML DOM.
 *
 * The markup is processed in two passes, each driven by a callback that runs
 * while the DOM is serialized with save():
 *   1. 'callback_cleanNestedAnchorContent' is installed, the DOM is saved and
 *      the resulting markup is parsed again (cleans up the anchor tags);
 *   2. 'callback_buildPlainText' is installed and the DOM is saved once more;
 *      that callback appends its output to the global $processed_plain_text.
 *
 * @param string $source URL to load (LOAD_FROM_URL) or raw markup (LOAD_FROM_STRING).
 * @param mixed  $mode   LOAD_FROM_URL (default) or LOAD_FROM_STRING.
 * @return string The collected plain text; an empty string when no DOM could
 *                be built from the input; or a 'Wrong mode defined ...'
 *                message when $mode is neither of the two supported values.
 */
function makePlainText($source, $mode = LOAD_FROM_URL)
{
    global $processed_plain_text;

    if ($mode == LOAD_FROM_URL) {
        $html = file_get_html($source);
    } elseif ($mode == LOAD_FROM_STRING) {
        // str_get_html() rather than the deprecated str_get_dom() alias, so
        // both parsing steps of this function use the same loader.
        $html = str_get_html($source);
    } else {
        return 'Wrong mode defined in makePlainText: ' . $mode;
    }

    // The loaders hand back false instead of a DOM when they cannot parse the
    // input (for instance an empty string, such as the content of an empty
    // anchor). There is no text to extract in that case.
    if (!is_object($html)) {
        return '';
    }

    // First pass: clean up the anchor tags.
    $html->set_callback('callback_cleanNestedAnchorContent');
    $cleaned_markup = $html->save();
    // Release the first tree before replacing it; Simple HTML DOM trees hold
    // circular references that are only broken by clear().
    $html->clear();

    $html = str_get_html($cleaned_markup);
    if (!is_object($html)) {
        return '';
    }

    // Start from a defined, empty accumulator so the callback never appends
    // to an undefined variable or to leftovers of an earlier run.
    $processed_plain_text = '';

    // Second pass: build the full plain text with the required attributes of
    // the 'img' and 'a' tags, excluding unnecessary content such as script
    // and style tags.
    $html->set_callback('callback_buildPlainText');
    $html->save();
    $html->clear();

    $return = $processed_plain_text;
    // Leave the global clean for the next call.
    $processed_plain_text = '';

    return $return;
}
// Sample markup for trying the LOAD_FROM_STRING mode (disabled). Every line of
// the multi-line literal is commented out so the script still parses.
//$html = 'Hello Hello this site
//click HERE
//Some text.';

// Load the page from its URL and print the extracted plain text.
echo makePlainText('http://foldmunka.net');
//echo makePlainText($html, LOAD_FROM_STRING);