#!/usr/bin/perl
#
# Walk every message in an mbox, descend into each MIME part, and print the
# sorted, de-duplicated list of host names that appear in http/ftp URIs found
# in the text-like parts. Progress dots and diagnostics go to STDERR; the host
# list (one per line) goes to STDOUT.

### modules
use strict;
use warnings;

use MIME::Parser;
use URI::Find;
use URI;
use Email::Folder;

### vars
my $debug   = 1;
my $tmp_dir = "/tmp/spamcheck";             ### edit me
my $mbox    = "/home/ciphelp/spamcorpus";   ### edit me
#my $mbox = "/home/ciphelp/Mail/spam";

# Host name => number of times it was seen. Only the keys are reported.
my %seen_hosts;

### mime parser and entity object
# The parser writes decoded bodies below $tmp_dir, so make sure it is there
# before the first message is parsed.
unless (-d $tmp_dir) {
    mkdir($tmp_dir, 0700)
        or die "cannot create temp directory $tmp_dir: $!\n";
}

my $parser = MIME::Parser->new;
$parser->output_dir($tmp_dir);

### uri finder
# The callback's return value is what URI::Find substitutes into the scanned
# text, so the original text is always handed back unchanged.
my $finder = URI::Find->new(
    sub {
        my ($found, $orig_text) = @_;

        my $uri    = URI->new($found);
        my $scheme = $uri->scheme;

        # Schemes without an authority part (mailto:, news:, ...) have no
        # host() method, and a host may legitimately be missing or empty.
        if (defined $scheme && $scheme =~ m{\A(?:http|ftp)} && $uri->can('host')) {
            my $host = $uri->host;
            $seen_hosts{$host}++ if defined $host && length $host;
        }

        return $orig_text;
    }
);

### main
my $folder = Email::Folder->new($mbox);

while (my $message = $folder->next_message) {
    print STDERR "." if $debug;

    # One malformed message must not abort the scan of the whole corpus.
    my $entity = eval { $parser->parse_data($message->as_string) };
    my $error  = $@;

    if ($entity) {
        split_entity($entity);
    }
    else {
        warn "\nskipping unparseable message: ", ($error || "unknown error\n");
    }

    # Remove whatever the last parse wrote to disk, successful or not.
    $parser->filer->purge;
}

### output
print STDERR "\n" if $debug;
print $_, "\n" foreach sort keys %seen_hosts;

exit;

### sub land

# split_entity($entity)
#
# Recursively walk a MIME::Entity. Multipart entities are descended into part
# by part; leaf entities whose effective type is message/* or text/* have
# their decoded body scanned for URIs by $finder, which records hosts in
# %seen_hosts. Leaves of any other type, or without a body, are ignored.
sub split_entity {
    my ($entity) = @_;

    my @parts = $entity->parts;
    if (@parts) {
        split_entity($_) for @parts;
        return;
    }

    return unless $entity->effective_type =~ m{\A(?:message|text)/};

    # A leaf part can have no body handle at all (e.g. an empty part).
    my $body = $entity->bodyhandle or return;

    # URI::Find wants a reference to a modifiable string; scan a private copy.
    my $text = $body->as_string;
    $finder->find(\$text);

    return;
}

###-fin-