[Inhalt] -> [HTML]	[info]

HTML: Manipulation von HTML-Code

Konvertierung und Säuberung des HTML-Exports von MS Word
Links in HTML filtern (HTML::TokeParser)
Links in HTML filtern (RegEx)

Konvertierung und Säuberung des HTML-Exports von MS Word

Der aus Word 97 oder Word 2000 exportierte HTML-Code kann in der Regel nicht so verwendet werden, wie er vorliegt. Eine Reihe von Tools bietet dem Webdesigner hier die nötige Unterstützung:

Die hier gezeigte Lösung verwendet das Perl-Modul HTML::TreeBuilder, um den HTML-Code aus Word zu parsen, unerwünschte Elemente zu entfernen und den Code in bereinigter Form zurückzugeben.

  #### configuration ####
  # The lists are stored in package Conf because clean_up_htmltree()
  # reads @Conf::ignore_attr, @Conf::ignore_tags and
  # @Conf::ignore_elements. File-scoped "my" arrays with the same short
  # names would never be seen by it.

  # attributes to strip from every element
  @Conf::ignore_attr =
      qw(bgcolor background color face style link alink
         vlink text onblur onchange onclick ondblclick
         onfocus onkeydown onkeyup onload onmousedown
         onmousemove onmouseout onmouseover onmouseup
         onreset onselect onunload class xmlns:w xmlns:o
         xmlns
        );

  # tags to remove while keeping their content
  @Conf::ignore_tags =
      qw(font big small body dir html div span);

  # elements to remove together with their content
  @Conf::ignore_elements =
      qw(script style head o:p);

  # call sub; $input is expected to hold the HTML text to be cleaned
  # (it is not defined in this snippet)
  my $output = clean_up_htmltree($input);

  ############################################################
  # clean_up_htmltree($input)
  #
  # Parses the HTML string $input with HTML::TreeBuilder, removes
  # everything named in the configuration lists and returns the
  # cleaned document as a string (serialised with as_HTML).
  #
  # Reads:
  #   @Conf::ignore_tags     - tags removed, content kept
  #   @Conf::ignore_elements - elements removed together with content
  #   @Conf::ignore_attr     - attributes removed from all elements
  #
  # If "html" is listed in @Conf::ignore_tags, the surrounding
  # <html>...</html> wrapper is stripped from the returned string.
  ############################################################
  sub clean_up_htmltree {

    my $input = shift;
    $input = "" unless defined $input;

    my $warn = 0;        # value handed to $h->warn() below
    my $strip_html = 0;  # set when "html" is listed in @Conf::ignore_tags
    use HTML::TreeBuilder;

    my $h = HTML::TreeBuilder->new;
    $h->ignore_unknown(0);  # keep unknown tags (e.g. o:p) so they can be matched
    $h->warn($warn);
    $h->parse($input);
    $h->eof;                # mandatory after parse(): finishes building the tree

    # drop all unwanted tags, keeping their content
    foreach my $tag (@Conf::ignore_tags) {
      # The root <html> element is not replaced inside the tree; its
      # tags are removed from the serialised output further down.
      if (lc($tag) eq "html") {
        $strip_html = 1;
        next;
      }
      while (my $el = $h->look_down('_tag', $tag)) {
        # replace_with_content returns the emptied element; delete it
        # so it does not linger on HTML::Element versions without
        # weak references
        $el->replace_with_content->delete;
      }
    }

    # drop all unwanted elements (tags w/ content)
    foreach my $tag (@Conf::ignore_elements) {
      while (my $el = $h->look_down('_tag', $tag)) {
        $el->delete;   # detaches the element and destroys its subtree
      }
    }

    # drop all unwanted attributes; one tree walk per attribute name
    foreach my $attr (@Conf::ignore_attr) {
      foreach my $el ($h->look_down( sub { defined($_[0]->attr($attr)) } )) {
        $el->attr($attr, undef);   # setting an attribute to undef deletes it
      }
    }

    # drop unwanted script code: elements that have a text child of the
    # form <![ ... ]> lose their complete content.
    # NOTE(review): sibling content of such a text node is dropped as
    # well - confirm that this is intended.
    foreach my $el ( $h->look_down( sub { grep { !ref($_) && /^<\s*!\[.+?\]\s*>$/ } $_[0]->content_list } ) ) {
      $el->delete_content;
    }

    # params = entities to encode, indent, optional endtags
    my $output = $h->as_HTML(undef, " ", {});
    $h = $h->delete(); # nuke it!

    if ($strip_html) {
      # remove the <html> wrapper and surrounding whitespace; the
      # earlier substitutions only trimmed whitespace and left the
      # tags in place
      $output =~ s{<html\b[^>]*>\s*}{}i;
      $output =~ s{\s*</html\s*>\s*\z}{}i;
      $output =~ s{\A\s+}{};
      $output =~ s{\s+\z}{};
    }
    return $output;
  }

Links in HTML filtern (HTML::TokeParser)

# Usage example: pass the HTML held in $text through link_filter() and
# print the result. ($text is not defined in this snippet.)
print link_filter($text);

sub link_filter {
    # Walks the HTML in $text token by token. Every <a> start tag that
    # carries a true href attribute is collected together with all
    # following tokens up to and including the next </a>; that complete
    # anchor is the hook where a replacement can be applied. All other
    # tokens are copied through unchanged. Returns the reassembled text.
    my ($text) = @_;
    use HTML::TokeParser::Simple;

    my $parser = HTML::TokeParser::Simple->new(\$text);
    my $return;

    TOKEN:
    while ( my $token = $parser->get_token ) {

        # anything that is not <a href="..."> is passed through as is
        unless ( $token->is_start_tag('a') && $token->return_attr->{href} ) {
            $return .= $token->as_is;
            next TOKEN;
        }

        # gather the whole anchor: start tag, content and end tag
        my $anchor = $token->as_is;
        while ( my $inner = $parser->get_token ) {
            $anchor .= $inner->as_is;
            last if $inner->is_end_tag('a');
        }
        $return .= $anchor; ### do something here!!!

        ### example: instead of the gathering loop above, collect URL
        ### and link text separately and hand them to link_replace()
        # my $text = "";
        # my $url = $token->return_attr->{href};
        # while (my $token = $parser->get_token) {
        #   last if $token->is_end_tag('a');
        #   $text .= $token->as_is;
        # }
        # $return .= link_replace($url,$text);
    }

    return $return;
}

Links in HTML filtern (RegEx)

# Usage example: pass the HTML held in $text through filter() and print
# the result. ($text is not defined in this snippet.)
print filter($text);

# filter($text)
#
# Replaces every <a ... href=URL ...>TEXT</a> in $text by the return
# value of link_replace(URL, TEXT) and returns the modified string.
# Anchors whose content starts with an <img> tag are left untouched.
# Tag and attribute names are matched case-insensitively.
# Returns "" for undefined or empty input.
sub filter
{
    my $text = shift;

    # test for definedness/length, not truth, so that the string "0"
    # is passed through instead of being turned into ""
    return "" unless defined $text && length $text;

    $text =~ s{
                 <a\b               # opening anchor tag
                 [^<>]*\b           # any attributes before href
                 href="?            # href= with optional opening quote
                   ([^"\s<>]+)      # capture 1: URL
                 "?                 # optional closing quote
                 [^<>]*             # any attributes after href
                 >                  # end of opening tag
                 (?!<img)           # content must not start with an image
                 (.+?)              # capture 2: link text, non-greedy
                 </a>               # closing anchor tag
              }{link_replace($1, $2)}gsexi;

    return $text;
}

# link_replace($url, $text)
#
# Builds the replacement markup for one link.
#   $url  - link target; an undefined or empty value becomes "/"
#   $text - link content (may contain markup)
#
# mailto: links are handed to obscure(). Every other link gets an icon
# in front of its text: the "internal" icon when the URL is not an
# http(s) URL or points to $host, otherwise the "external" icon plus
# target="_blank". The icon and the first character of the text are
# wrapped in <nobr> so that they are not separated by a line break.
# Returns the generated HTML string.
sub link_replace
{
    my ($url, $text) = @_;
    $url  = "/" unless defined $url && length $url;
    $text = ""  unless defined $text;

    ## host
    my $host = "www.map-forum.de";

    ## images
    my $intern = '<img src="/images/link_int.png" width="14" height="10" alt="interner Link" border="0">';
    my $extern = '<img src="/images/link_ext.png" width="14" height="10" alt="externer Link" border="0">';

    ## e-mail
    return obscure("<a href=\"$url\">$text</a>", $text) if $url =~ /\Amailto:/i;

    ## ignore image within anchor [see filter()]
    # if ($text =~ /^<img\b/) {
    #   return "<a href=\"$url\">$text</a>" if ($url =~ /$host/ || $url !~ /^https?:\/\//);
    #   return "<a href=\"$url\" target=\"_blank\">$text</a>";
    # }

    ## http
    # Split off the first visible unit of the text: a complete character
    # entity or a single character. Text that starts with a tag (or is
    # empty) contributes nothing, so <nobr> is always closed and never
    # cuts a tag or an entity in half.
    my $lead = "";
    if ($text =~ s/\A(&#?[0-9A-Za-z]+;|[^<])//s) {
        $lead = $1;
    }

    # Internal = relative / non-http URL, or an http(s) URL whose host
    # part is exactly $host. The host is quoted and anchored so that it
    # cannot match by accident somewhere else in the URL.
    my $is_internal = $url !~ m{\Ahttps?://}i
                   || $url =~ m{\Ahttps?://\Q$host\E(?:[:/?#]|\z)}i;

    my $icon   = $is_internal ? $intern : $extern;
    my $target = $is_internal ? ""      : " target=\"_blank\"";

    return "<a href=\"$url\"$target><nobr>$icon$lead</nobr>$text</a>";
}

# obscure($code, $text)
#
# Hides $code (the markup of a mailto: link) from simple address
# harvesters: the markup is cut into pieces of at most four characters
# that a JavaScript document.write() call concatenates again. For
# clients without JavaScript, $text is emitted inside <noscript> with
# every "@" replaced by "{at}".
# Returns the generated HTML string.
sub obscure
{
    my ($code, $text) = @_;
    $code = "" unless defined $code;
    $text = "" unless defined $text;

    ## replace @ by something other (maybe image)
    $text =~ s/\@/{at}/g;

    ## split $code at every 4th char; /s keeps newlines instead of
    ## silently dropping them
    my @chunks = $code =~ /(.{1,4})/gs;

    ## escape each piece for a single-quoted JavaScript string; this is
    ## done after splitting so that an escape sequence is never cut in
    ## two by the '+' joints
    for my $chunk (@chunks) {
        $chunk =~ s/([\\'])/\\$1/g;
        $chunk =~ s/\n/\\n/g;
        $chunk =~ s/\r/\\r/g;
    }

    return "<script language=\"JavaScript\">document.write('"
         . join("'+'", @chunks)
         . "');</script><noscript>$text</noscript>";
}

Zusammengestellt von Alex Pleiner
© 2001-2003 zeitform Internet Dienste. Bei Problemen wenden Sie sich bitte an den Webmaster.
Permission is granted to copy, distribute and/or modify this document under the terms of the GNU Free Documentation License, Version 1.1.