What's the easiest way to strip HTML tags in Perl? I am using a regular expression to parse HTML from a URL, which works great, but how can I strip the HTML tags off?
There is also a nice Perl module, HTML::Scrubber.
#!/usr/bin/perl
# Scrub an HTML file with HTML::Scrubber.
#
# Usage: pass the path of an HTML file as the first argument. All tags are
# allowed by default except <script> and <style>, which are denied. The
# result is written to "<file>.scrubbed.html".
use warnings;
use strict;
use HTML::Scrubber;

# The input path is the first command-line argument. No system call has
# failed at this point, so $! is deliberately left out of the message.
my $file = shift;
die "need a file\n" unless defined $file && length $file;

# Slurp the whole input. Three-arg open with a lexical handle prevents a
# filename such as ">foo" or "cmd |" from being interpreted as a mode.
open my $in, '<', $file
    or die "cannot open '$file' for reading: $!\n";
my $html = do { local $/; <$in> };    # undef $/ => read everything at once
defined $html
    or die "cannot read '$file': $!\n";
close $in;
#print "$html\n";

my $scrubber = HTML::Scrubber->new;
$scrubber->default(1); ## default to allow HTML
#$scrubber->script(0); ## no script
#$scrubber->style(0); ## no style
# OR
$scrubber->deny(qw[script style]);

my $clean_html = $scrubber->scrub($html);

# Write the scrubbed markup next to the input file. Buffered write errors
# may only surface at close, so print and close are both checked.
my $out_file = $file . '.scrubbed.html';
open my $out, '>', $out_file
    or die "cannot open '$out_file' for writing: $!\n";
print {$out} $clean_html
    or die "cannot write to '$out_file': $!\n";
close $out
    or die "cannot close '$out_file': $!\n";

exit;