3 # This script cleans up an HTML document
8 # configure these values
10 qw(bgcolor background color face style link alink vlink text
11 onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
12 onmousedown onmousemove onmouseout onmouseover onmouseup
13 onreset onselect onunload
# Tag names handed to HTML::Parser through its ignore_tags option (see the
# constructor call further down).
15 my @ignore_tags = qw(font big small b i);
# Element names handed to HTML::Parser through its ignore_elements option.
16 my @ignore_elements = qw(script style);
18 # make it easier to look up attributes
# Maps each name in @ignore_attr to 1; it is consulted with the lower-cased
# attribute name when deciding whether to strip an attribute.
19 my %ignore_attr = map { $_ => 1} @ignore_attr;
25 # kill some attributes
# NOTE(review): this is an excerpt -- the enclosing sub header, the loop around
# the splice below, the place where $edited is set, and the closing braces are
# not visible here.
# $pos is presumably the "tokenpos" array requested in the start_h argspec;
# its last four entries are read as (key offset, key length, value offset,
# value length) of the final attribute, all relative to $text.
26 my($k_offset, $k_len, $v_offset, $v_len) = @{$pos}[-4 .. -1];
# End of the final attribute: end of its value when a value offset is set,
# otherwise end of its key.
27 my $next_attr = $v_offset ? $v_offset + $v_len : $k_offset + $k_len;
# Remove the last four entries from @$pos, i.e. walk the attributes from the
# end of the tag towards its start so earlier offsets stay valid after edits.
30 ($k_offset, $k_len, $v_offset, $v_len) = splice @$pos, -4;
# Attribute names are compared case-insensitively (lower-cased before lookup).
31 if ($ignore_attr{lc substr($text, $k_offset, $k_len)}) {
# Delete from the start of this attribute up to $next_attr.
32 substr($text, $k_offset, $next_attr - $k_offset) = "";
# The attribute that precedes this one extends up to where this one started.
35 $next_attr = $k_offset;
37 # if we killed all attributes, kill any extra whitespace too
# Only rewrites a tag of the form "<name   >" (name followed solely by
# whitespace) and only when $edited is true.
38 $text =~ s/^(<\w+)\s+>$/$1>/ if $edited;
46 print shift if $type eq "doctype";
# Build a version-3-API parser, wire up the handlers, and parse the file whose
# name is taken with shift.
# NOTE(review): the closing paren of the new(...) call is not visible in this
# excerpt.
54 HTML::Parser->new(api_version => 3,
# Start tags: tag() receives the token positions and the raw tag text.
55 start_h => [\&tag, "tokenpos, text"],
# Empty-string handlers: per the HTML::Parser documentation these events are
# ignored, so processing instructions and comments are presumably dropped
# from the output -- TODO confirm against the installed version.
56 process_h => ["", ""],
57 comment_h => ["", ""],
# Declarations: decl() receives the declaration's tag name and raw text.
58 declaration_h => [\&decl, "tagname, text"],
# Every event without a handler of its own: text() receives the raw text.
59 default_h => [\&text, "text"],
# Tags and elements to skip, taken from the configuration lists above.
61 ignore_tags => \@ignore_tags,
62 ignore_elements => \@ignore_elements,
# Abort with the OS error message if parse_file returns a false value.
64 ->parse_file(shift) || die "Can't open file: $!\n";