[Inhalt] -> [HTML] |
[info] |
||
Der aus Word 97 oder Word 2000 exportierte HTML-Code kann in der Regel nicht so verwendet werden, wie er vorliegt. Eine Reihe von Tools bietet dem Webdesigner hier die nötige Unterstützung:
Die hier gezeigte Lösung verwendet das Perl-Modul HTML::TreeBuilder, um den HTML-Code aus Word zu parsen, unerwünschte Elemente zu entfernen und ihn in bereinigter Form zurückzugeben.
#### configuration ####
# The lists are stored in package Conf because clean_up_htmltree() reads
# @Conf::ignore_attr, @Conf::ignore_tags and @Conf::ignore_elements.

# attributes to ignore
@Conf::ignore_attr = qw(
    bgcolor background color face style link alink vlink text
    onblur onchange onclick ondblclick onfocus onkeydown onkeyup onload
    onmousedown onmousemove onmouseout onmouseover onmouseup onreset
    onselect onunload class xmlns:w xmlns:o xmlns
);

# tags to ignore (the tag is removed, its content is kept)
@Conf::ignore_tags = qw(font big small body dir html div span);

# tags to drop with content
@Conf::ignore_elements = qw(script style head o:p);

# call sub ($input is expected to hold the HTML exported from Word)
my $output = clean_up_htmltree($input);

############################################################
sub clean_up_htmltree {
############################################################
    # Parse an HTML string with HTML::TreeBuilder, strip the markup
    # configured in package Conf and return the re-serialized HTML.
    #
    # Parameter:
    #   $input - HTML source as a string (undef is treated as "")
    #
    # Reads:
    #   @Conf::ignore_tags     - tags replaced by their own content
    #   @Conf::ignore_elements - tags removed together with their content
    #   @Conf::ignore_attr     - attributes removed from every element
    #
    # Returns the cleaned HTML as produced by as_HTML().

    my $input = shift;
    $input = '' unless defined $input;

    my $warn               = 0;
    my $strip_html_wrapper = 0;

    use HTML::TreeBuilder;
    my $h = HTML::TreeBuilder->new;
    $h->ignore_unknown(0);    # keep unknown tags (e.g. o:p) in the tree
    $h->warn($warn);
    $h->parse($input);
    $h->eof;                  # required to signal the end of the document

    # drop all unwanted tags, keeping their content
    foreach my $tag (@Conf::ignore_tags) {

        # <html> is the root element and cannot be replaced by its
        # content; its tags are stripped from the output string below
        if (lc($tag) eq "html") {
            $strip_html_wrapper = 1;
            next;
        }

        # the tree changes with every replacement, so search again each time
        while (my $node = $h->look_down('_tag', $tag)) {
            $node->replace_with_content;
        }
    }

    # drop all unwanted elements (tags w/ content)
    foreach my $tag (@Conf::ignore_elements) {

        # the root element has no parent to be removed from; trying to
        # drop it would find it again on every pass and never terminate
        next if lc($tag) eq "html";

        while (my $node = $h->look_down('_tag', $tag)) {
            $node->delete;    # detaches the element and destroys its subtree
        }
    }

    # drop all unwanted attributes
    foreach my $attr (@Conf::ignore_attr) {
        foreach my $node ($h->look_down(sub { defined($_[0]->attr($attr)) })) {
            $node->attr($attr, undef);    # undef deletes the attribute
        }
    }

    # drop unwanted script code: text nodes of the form <![ ... ]>
    # NOTE: detach_content() removes ALL children of the element that
    # holds such a node, not only the matching node itself.
    foreach my $node (
        $h->look_down(
            sub { grep { /^<\s*!\[.+?\]\s*>$/ } $_[0]->content_list }
        )
      )
    {
        $node->detach_content;
    }

    # params = entities to encode, indent, optional endtags
    my $output = $h->as_HTML(undef, " ", {});
    $h = $h->delete();    # nuke it!

    if ($strip_html_wrapper) {

        # NOTE(review): the published listing only stripped whitespace
        # here; the <html> tag names appear to have been lost from the
        # patterns. Restored so the root wrapper is actually removed.
        $output =~ s{\A\s*<html\b[^>]*>\s*}{}i;
        $output =~ s{\s*</html>\s*\z}{}i;
    }

    return $output;
}
print link_filter($text);

# link_filter($text)
#
# Walk through an HTML string token by token with HTML::TokeParser::Simple
# and hand every complete anchor (<a href="..."> ... </a>) to a hook.
# As listed, the hook copies the anchor unchanged; the commented example
# shows how to collect URL and link text and pass them to link_replace().
#
# Parameter:
#   $text - HTML source as a string
#
# Returns the (possibly rewritten) HTML; "" for undefined or empty input.
sub link_filter {
    my ($text) = @_;
    return '' unless defined $text && length $text;

    my $return = '';

    use HTML::TokeParser::Simple;
    my $p = HTML::TokeParser::Simple->new(\$text);

    while (my $token = $p->get_token) {

        if ($token->is_start_tag('a') && $token->return_attr->{href}) {

            # collect everything up to and including the closing </a>
            my $anchor = $token->as_is;
            while (my $inner = $p->get_token) {
                $anchor .= $inner->as_is;
                last if $inner->is_end_tag('a');
            }
            $return .= $anchor;

            ### do something here!!!
            ### example
            # my $text = "";
            # my $url  = $token->return_attr->{href};
            # while (my $token = $p->get_token) {
            #     last if $token->is_end_tag('a');
            #     $text .= $token->as_is;
            # }
            # $return .= link_replace($url,$text);
        }
        else {
            # everything outside of linked anchors is copied verbatim
            $return .= $token->as_is;
        }
    }

    return $return;
}
print filter($text);

# filter($text)
#
# Regex-based link filter: every <a ... href=...>TEXT</a> whose content
# does not start with an <img> tag is replaced by the result of
# link_replace(URL, TEXT).
#
# Parameter:
#   $text - HTML source as a string
#
# Returns the rewritten HTML; "" for undefined or empty input.
sub filter {
    my $text = shift;

    # test definedness/length, not truth, so that "0" is passed through
    return "" unless defined $text && length $text;

    $text =~ s{
        <a\b                # opening anchor tag
        [^<>]*\b            # any attributes
        href="?             # href=
        ([^"\s<>]+)         # URL
        "?                  # href closed
        [^<>]*              # any attributes
        >                   # anchor closed
        (?!<img)            # not <img ...>
        (.+?)               # TEXT
        </a>                # closing anchor tag
    }{link_replace($1, $2)}gsexi;

    return $text;
}

# link_replace($url, $text)
#
# Build the replacement markup for one link:
#   - mailto: links are passed to obscure()
#   - all other links get an icon in front of the text (internal or
#     external); external links additionally get target="_blank"
# The icon and the first character of the text are wrapped in <nobr>.
#
# Parameters:
#   $url  - link target (defaults to "/" when empty)
#   $text - link content (HTML)
#
# Returns the HTML for the anchor.
sub link_replace {
    my ($url, $text) = @_;
    $url  = "/" unless defined $url && length $url;
    $text = ""  unless defined $text;

    ## host
    my $host = "www.map-forum.de";

    ## images
    my $intern = '<img src="/images/link_int.png" width="14" height="10" alt="interner Link" border="0">';
    my $extern = '<img src="/images/link_ext.png" width="14" height="10" alt="externer Link" border="0">';

    ## e-mail
    return obscure("<a href=\"$url\">$text</a>", $text) if $url =~ /^mailto:/i;

    ## ignore image within anchor [see filter()]
    # if ($text =~ /^<img\b/) {
    #     return "<a href=\"$url\">$text</a>" if ($url =~ /$host/ || $url !~ /^https?:\/\//);
    #     return "<a href=\"$url\" target=\"_blank\">$text</a>";
    # }

    ## http
    # Only the first visible unit of the text is kept inside <nobr>:
    # a complete entity (e.g. &amp;) or a single character. If the text
    # is empty or starts with a tag, <nobr> is closed right after the
    # icon so that no tag or entity is ever cut in half.
    my ($glued, $rest) = ("", $text);
    if ($text =~ /\A(&#?\w+;|[^<])(.*)\z/s) {
        ($glued, $rest) = ($1, $2);
    }

    # internal = not an absolute http(s) URL, or an http(s) URL whose host
    # part is exactly $host (\Q...\E: the dots in $host are literal)
    my $is_internal = $url !~ m{\Ahttps?://}i
        || $url =~ m{\Ahttps?://\Q$host\E(?::\d+)?(?:[/?\#]|\z)}i;

    return "<a href=\"$url\"><nobr>$intern$glued</nobr>$rest</a>" if $is_internal;
    return "<a href=\"$url\" target=\"_blank\"><nobr>$extern$glued</nobr>$rest</a>";
}

# obscure($code, $text)
#
# Wrap $code in a JavaScript document.write() call, split into chunks of
# at most four characters that are concatenated at run time, and provide
# $text (with every "@" replaced by "{at}") as <noscript> fallback.
#
# Parameters:
#   $code - HTML to be written by the script
#   $text - fallback text
#
# Returns the <script>...</script><noscript>...</noscript> markup.
sub obscure {
    my ($code, $text) = @_;
    $code = "" unless defined $code;
    $text = "" unless defined $text;

    ## replace @ by something other (maybe image)
    $text =~ s/\@/{at}/g;

    ## split $code at every 4th char and join again
    # /s keeps newlines inside the chunks instead of silently dropping them
    my @chunks = $code =~ /(.{1,4})/gs;

    # escape AFTER chunking, so an escape sequence is never split across
    # two JavaScript string literals
    for my $chunk (@chunks) {
        $chunk =~ s/([\\'])/\\$1/g;
        $chunk =~ s/\n/\\n/g;
        $chunk =~ s/\r/\\r/g;
    }

    $code = "<script language=\"JavaScript\">document.write('"
          . join("'+'", @chunks)
          . "');</script><noscript>$text</noscript>";

    return $code;
}
Zusammengestellt von Alex Pleiner
© 2001-2003 zeitform Internet Dienste. Bei Problemen wenden Sie sich bitte an den Webmaster.
Permission is granted to copy, distribute and/or modify this document
under the terms of the GNU Free Documentation License, Version 1.1.