![]() |
[Inhalt] -> [HTML] |
[info] |
|
Der aus Word 97 oder Word 2000 exportierte HTML-Code kann in der Regel nicht so verwendet werden, wie er vorliegt. Eine Reihe von Tools bietet dem Webdesigner hier die nötige Unterstützung:
Die hier gezeigte Lösung verwendet das Perl-Modul HTML::TreeBuilder, um den HTML-Code aus Word zu parsen, unerwünschte Elemente zu entfernen und den Code in bereinigter Form zurückzugeben.
#### configuration ####
# Attributes that are stripped from every element.
my @ignore_attr =
    qw(bgcolor background color face style link alink
       vlink text onblur onchange onclick ondblclick
       onfocus onkeydown onkeyup onload onmousedown
       onmousemove onmouseout onmouseover onmouseup
       onreset onselect onunload class xmlns:w xmlns:o
       xmlns
    );
# Tags that are removed while their content is kept.
my @ignore_tags =
    qw(font big small body dir html div span);
# Elements that are removed together with their content.
my @ignore_elements =
    qw(script style head o:p);

# clean_up_htmltree() reads its settings from the Conf package
# (@Conf::ignore_attr, @Conf::ignore_tags, @Conf::ignore_elements), not
# from the lexicals above.  Publish the lists there so the configuration
# actually takes effect; values already present in Conf are left alone.
@Conf::ignore_attr     = @ignore_attr     unless @Conf::ignore_attr;
@Conf::ignore_tags     = @ignore_tags     unless @Conf::ignore_tags;
@Conf::ignore_elements = @ignore_elements unless @Conf::ignore_elements;

# call sub
# NOTE(review): $input is not defined in the visible snippet; it is
# presumably the HTML exported from Word - confirm against the caller.
my $output = clean_up_htmltree($input);
############################################################
sub clean_up_htmltree {
############################################################
    # Parse an HTML string with HTML::TreeBuilder, strip the tags,
    # elements and attributes listed in the Conf package and return the
    # re-serialised HTML.
    #
    # Parameters:
    #   $input - HTML source as a string (undef is treated as "")
    # Reads:
    #   @Conf::ignore_tags     - tags to remove, keeping their content
    #   @Conf::ignore_elements - elements to remove including content
    #   @Conf::ignore_attr     - attributes to remove from all elements
    # Returns:
    #   the cleaned HTML as a string
    my $input = shift;
    $input = "" unless defined $input;
    my $warn = 0;
    my $htmlex;

    use HTML::TreeBuilder;
    my $h = HTML::TreeBuilder->new;
    $h->ignore_unknown(0);    # keep unknown tags (e.g. o:p) in the tree
    $h->warn($warn);
    $h->parse($input);
    # parse() does not finish the tree on its own; eof() must be called
    # before the tree is inspected or modified.
    $h->eof;

    # drop all unwanted tags (their content stays in place)
    foreach my $tag (@Conf::ignore_tags) {
        # the root element cannot be replaced by its content; only
        # remember that "html" was requested
        if (lc($tag) eq "html") {
            $htmlex = 1;
            next;
        }
        while (my $el = $h->look_down('_tag', "$tag")) {
            $el->replace_with_content;
        }
    }

    # drop all unwanted elements (tags w/ content)
    foreach my $tag (@Conf::ignore_elements) {
        while (my $el = $h->look_down('_tag', "$tag")) {
            # delete() detaches the element and also destroys the subtree
            $el->delete;
        }
    }

    # drop all unwanted attributes; one tree scan per attribute suffices
    # because removing an attribute does not change the tree structure
    foreach my $attr (@Conf::ignore_attr) {
        foreach my $el ( $h->look_down( sub { defined($_[0]->attr($attr)) } ) ) {
            $el->attr($attr, undef);    # undef removes the attribute
        }
    }

    # drop unwanted script code: empty every element that directly
    # contains a text node of the form <![ ... ]>
    foreach my $el ( $h->look_down( sub { grep { /^<\s*!\[.+?\]\s*>$/ } $_[0]->content_list } ) ) {
        $el->detach_content;
    }

    my $output = $h->as_HTML(undef, " ", {});
    # params = entities to encode, indent, optional endtags
    $h = $h->delete();    # nuke it!

    if ($htmlex) {
        # trim the whole string; \A and \z anchor at the real start and
        # end, whereas ^ and $ under /m only ever touched the first line
        $output =~ s/\A\s+//;
        $output =~ s/\s+\z//;
    }
    return $output;
}
# Example call: prints whatever link_filter() returns for $text.
# NOTE(review): $text is not defined in the visible snippet.
print link_filter($text);
sub link_filter
{
    # Walk through an HTML string token by token and collect every
    # <a href="..."> ... </a> anchor as one unit, so that it can be
    # rewritten at the marked place.  As written, anchors are passed
    # through unchanged.
    #
    # Parameters:
    #   $text - HTML source as a string
    # Returns:
    #   the (currently unmodified) HTML; "" for empty or undefined input
    my ($text) = @_;
    return "" unless defined $text && length $text;

    # start with an empty string so the result is never undef
    my $return = "";

    use HTML::TokeParser::Simple;
    my $p = HTML::TokeParser::Simple->new(\$text);

    while ( my $token = $p->get_token ) {
        if ( $token->is_start_tag('a') && $token->return_attr->{href} ) {
            # gather everything up to and including the closing </a>
            my $anchor = $token->as_is;
            while ( my $inner = $p->get_token ) {
                $anchor .= $inner->as_is;
                last if $inner->is_end_tag('a');
            }
            $return .= $anchor; ### do something here!!!
            ### example
            # my $text = "";
            # my $url = $token->return_attr->{href};
            # while (my $token = $p->get_token) {
            #     last if $token->is_end_tag('a');
            #     $text .= $token->as_is;
            # }
            # $return .= link_replace($url,$text);
        }
        else
        {
            $return .= $token->as_is;
        }
    }
    return $return;
}
# Example call: prints whatever filter() returns for $text.
# NOTE(review): $text is not defined in the visible snippet.
print filter($text);
sub filter
{
    # Regex-based alternative to link_filter(): replace every
    # <a href=...>TEXT</a> whose content does not start with an <img>
    # tag by the result of link_replace(URL, TEXT).
    #
    # Parameters:
    #   $text - HTML source as a string
    # Returns:
    #   the rewritten HTML; "" for empty or undefined input
    my $text = shift;
    # test definedness and length instead of truth, so that the valid
    # text "0" is not mistaken for empty input
    return "" unless defined $text && length $text;

    $text =~ s|<[aA]\b # <a
               [^<>]*\b # any attributes
               [hH][rR][eE][fF]=\"? # href=
               ([^\"\s<>]+) # URL
               \"? # href closed
               [^<>]* # any attributes
               > # anchor closed
               (?!<[iI][mM][gG]) # not <img ...>
               (.+?) # TEXT
               </[aA]> # </a>
              |link_replace($1,$2)|sgex;
    return $text;
}
sub link_replace
{
    # Build the replacement markup for one anchor.
    #
    # mailto: links are handed to obscure().  All other links get a
    # leading icon (internal or external) that is tied to the first
    # visible character of the link text with <nobr>, so that no line
    # break can occur between icon and text.  External links
    # additionally open in a new window.
    #
    # Parameters:
    #   $url  - value of the href attribute (empty/undef becomes "/")
    #   $text - inner HTML of the anchor
    # Returns:
    #   the HTML string for the rewritten anchor
    my ($url, $text) = @_;
    $url  = "/" unless defined $url && length $url;   # keeps a literal "0"
    $text = ""  unless defined $text;

    ## host
    my $host = "www.map-forum.de";

    ## images
    my $intern = '<img src="/images/link_int.png" width="14" height="10" alt="interner Link" border="0">';
    my $extern = '<img src="/images/link_ext.png" width="14" height="10" alt="externer Link" border="0">';

    ## e-email (URL schemes are case-insensitive)
    return obscure("<a href=\"$url\">$text</a>", $text) if $url =~ /^mailto:/i;

    ## ignore image within anchor [see filter()]
    # if ($text =~ /^<img\b/) {
    #     return "<a href=\"$url\">$text</a>" if ($url =~ /$host/ || $url !~ /^https?:\/\//);
    #     return "<a href=\"$url\" target=\"_blank\">$text</a>";
    # }

    ## http
    # Close <nobr> after the first visible character.  Leading tags are
    # skipped and an entity counts as one character, so the closing tag
    # never lands inside "<b>" or "&amp;".  The atomic group keeps the
    # engine from backtracking into a tag.
    if ($text =~ m{\A (?> (?: <[^<>]*> )* ) (?: &\#?\w+; | [^<] ) }xs) {
        substr($text, $+[0], 0) = "</nobr>";
    }
    else {
        # empty text or tags only: still close the element
        $text .= "</nobr>";
    }

    # Internal means: relative / non-http URL, or an http(s) URL whose
    # host part is exactly $host.  \Q..\E keeps the dots literal and the
    # anchors prevent matches elsewhere in the URL (e.g. query string).
    my $is_internal = $url !~ m{\Ahttps?://}i
                   || $url =~ m{\Ahttps?://\Q$host\E(?:[:/?#]|\z)}i;

    return "<a href=\"$url\"><nobr>$intern$text</a>" if $is_internal;
    return "<a href=\"$url\" target=\"_blank\"><nobr>$extern$text</a>";
}
sub obscure
{
    # Hide a piece of markup (used for mailto: links) from simple
    # address harvesters: the markup is emitted through JavaScript
    # document.write() in pieces of at most four characters, with a
    # <noscript> fallback in which every "@" is replaced by "{at}".
    #
    # Parameters:
    #   $code - markup to be written by JavaScript
    #   $text - plain fallback text for the <noscript> part
    # Returns:
    #   the HTML string containing the <script> and <noscript> elements
    my ($code, $text) = @_;
    $code = "" unless defined $code;
    $text = "" unless defined $text;

    ## replace @ by something other (maybe image)
    $text =~ s/\@/{at}/g;

    ## split $code at every 4th char; /s keeps newlines in the pieces
    my @chunks = $code =~ /(.{1,4})/gs;

    ## escape each piece for a single-quoted JavaScript string literal.
    ## This happens after splitting so that an escape sequence can never
    ## be torn apart by the "'+'" separators.
    for my $chunk (@chunks) {
        $chunk =~ s/([\\'])/\\$1/g;
        $chunk =~ s/\n/\\n/g;
        $chunk =~ s/\r/\\r/g;
    }

    ## join the pieces again
    return "<script language=\"JavaScript\">document.write('"
         . join("'+'", @chunks)
         . "');</script><noscript>$text</noscript>";
}
Zusammengestellt von Alex Pleiner
© 2001-2003 zeitform Internet Dienste. Bei Problemen wenden Sie sich bitte an den Webmaster.
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.1.