-> , etc..
blink No blink tags.
contenttype Remove default contenttype.
comments Remove excess comments.
entities " -> ", etc.
dequote remove quotes from tag parameters where possible.
defcolor recode colors in shorter form. (#ffffff -> white, etc.)
javascript remove excess spaces and newlines in javascript code.
htmldefaults remove default values for some html tags
lowercasetags translate all HTML tags to lowercase
=item parameterized values
meta Takes a space separated list of meta tags to remove,
default "GENERATOR FORMATTER"
emptytags Takes a space separated list of tags to remove when there is no
content between the start and end tag, like this: .
The default is 'b i font center'
=back
Please note that if your HTML includes preformatted regions (this means, if
it includes ...
, we do not suggest removing whitespace, as it will
alter the rendered defaults.
HTML::Clean will print out a warning if it finds a preformatted region and is
requested to strip whitespace. In order to prevent this, specify that you don't
want to strip whitespace - i.e.
$h->strip( {whitespace => 0} );
=cut
use vars qw/
$do_whitespace
$do_shortertags
$do_meta
$do_blink
$do_contenttype
$do_comments
$do_entities
$do_dequote
$do_defcolor
$do_emptytags
$do_javascript
$do_htmldefaults
$do_lowercasetags
$do_defbaseurl
/;
$do_whitespace = 1;
$do_shortertags = 1;
$do_meta = "generator formatter";
$do_blink = 1;
$do_contenttype = 1;
$do_comments = 1;
$do_entities = 1;
$do_dequote = 1;
$do_defcolor = 1;
$do_emptytags = 'b i font center';
$do_javascript = 1;
$do_htmldefaults = 1;
$do_lowercasetags = 1;
$do_defbaseurl = '';
sub strip {
my($self, $options) = @_;
my $h = $self->{'DATA'};
my $level = $self->{'LEVEL'};
# Select a set of options based on $level, and then modify based on
# user supplied options.
_level_defaults($level);
if(defined($options)) {
no strict 'refs';
for (keys(%$options)) {
${"do_" . lc($_)} = $options->{$_} if defined ${"do_" . lc($_)};
}
}
if ($do_shortertags) {
$$h =~ s,,,sgi;
$$h =~ s,,,sgi;
$$h =~ s,,,sgi;
$$h =~ s,,,sgi;
}
if ($do_whitespace) {
if ($$h =~ / region in your HTML, which depends on the whitespace not
being modified. You requested to strip the whitespace - The rendered results
will be affected.
Hint: Use $h->strip({whitespace => 0}); instead.
EOF
}
$$h =~ s,[\r\n]+,\n,sg; # Carriage/LF -> LF
$$h =~ s,\s+\n,\n,sg; # empty line
$$h =~ s,\n\s+<,\n<,sg; # space before tag
$$h =~ s,\n\s+,\n,sg; # other spaces
$$h =~ s,>\n\s*<,><,sg; # LF/spaces between tags..
# Remove excess spaces within tags.. note, we could parse out the elements
# and rewrite for excess spaces between elements. perhaps next version.
# removed due to problems with > and < in tag elements..
#$$h =~ s,\s+>,>,sg;
#$$h =~ s,<\s+,<,sg;
# do this again later..
}
if ($do_entities) {
$$h =~ s,",\",sg;
# Simplify long entity names if using default charset...
$$h =~ m,charset=([^\"]+)\",;
if (!defined($1) || ($1 eq 'iso-8859-1')) {
$$h =~ s,&([A-z]+);,($_ENTITIES{$1}) ? chr($_ENTITIES{$1}) : $&,sige;
}
}
if ($do_meta) {
foreach my $m (split(/\s+/, $do_meta)) {
$$h =~ s,]*?>,,sig;
}
}
if ($do_contenttype) {
# Don't need this, since it is the default for most web servers
# Also gets rid of 'blinking pages' in older versions of netscape.
$$h =~ s,,,sig;
}
if ($do_defcolor) {
$$h =~ s,(<[^<]+?color=['"]?\#[0-9A-Fa-f]+["']?),_defcolorcheck($&),sige;
}
if ($do_comments) {
# don't strip server side includes..
# try not to get javascript, or styles...
$$h =~ s,,_commentcheck($&),sige;
# Remove javascript comments
$$h =~ s,,_jscomments($&),sige;
}
if ($do_javascript) {
#
$$h =~ s,,_javascript($&),sige;
}
if ($do_blink) {
$$h =~ s,,,sgi;
}
if ($do_dequote) {
while ($$h =~ s,<([A-z]+ [A-z]+=)(['"])([A-z0-9]+)\2(\s*?[^>]*?>),<$1$3$4,sig)
{
# Remove alphanumeric quotes. Note, breaks DTD..
;
}
}
# remove , etc..
if ($do_emptytags) {
my $pat = $do_emptytags;
$pat =~ s/\s+/|/g;
while ($$h =~ s,<($pat)(\s+[^>]*?)?>\s*\1>,,siog){}
}
if ($do_htmldefaults) {
# Tables
# seems to break things..
#$$h =~ s,(
]*)\s+border=0([^>]*>),$1$2,sig;
$$h =~ s,(]*)\s+rowspan=1([^>]*>),$1$2,sig;
$$h =~ s,( | ]*)\s+colspan=1([^>]*>),$1$2,sig;
#
# P, TABLE tags are default left aligned..
# lynx is inconsistent in this manner though..
$$h =~ s,<(P|table|td)( [^>]*)align=\"?left\"?([^>]*)>,<$1$2$3>,sig;
# OL start=1
$$h =~ s,(]*)start=\"?1\"?([^>]*>),$1$2,sig;
# FORM
$$h =~ s,( |