#!/usr/bin/perl

### modules
use strict;
use warnings;
use vars qw(%hosts $entity $finder);

use Email::Folder;
use MIME::Parser;
use URI;
use URI::Find;

### vars
my $debug = 1;                           ### progress dots on STDERR when true
%hosts = ();                             ### host name => number of URIs seen
my $tmp_dir = "/tmp/spamcheck";          ### edit me
my $mbox = "/home/ciphelp/spamcorpus";   ### edit me
#my $mbox = "/home/ciphelp/Mail/spam";

### mime parser and entity object
# The parser is pointed at $tmp_dir for its output files, so make sure the
# directory exists before the first message is parsed.
unless (-d $tmp_dir) {
  mkdir $tmp_dir, 0700
    or die "Cannot create temporary directory $tmp_dir: $!\n";
}

my $parser = MIME::Parser->new;
$parser->output_dir($tmp_dir);

### uri finder
# Callback invoked by URI::Find for every URI it locates.  It records the
# host of each http*/ftp* URI in %hosts and hands the original text back so
# the scanned text is left unchanged.
$finder = URI::Find->new(sub {
  my ($found, $orig_text) = @_;

  my $uri    = URI->new($found);
  my $scheme = $uri->scheme;

  # The pattern is deliberately not anchored at the end, so https and ftps
  # are accepted as well (same as the historical behaviour).
  if (defined $scheme && $scheme =~ /^(?:http|ftp)/ && $uri->can('host')) {
    my $host = $uri->host;

    # Host names are case-insensitive; fold case so each host is listed once.
    $hosts{lc $host}++ if defined $host && length $host;
  }

  return $orig_text;
});

### main
my $folder = Email::Folder->new($mbox);

while (my $message = $folder->next_message) {

  print STDERR "." if $debug;

  # A malformed message must not abort the whole corpus run: trap the
  # parser's exception, clean up whatever it wrote, and move on.
  my $entity = eval { $parser->parse_data($message->as_string) };
  unless ($entity) {
    warn "\nSkipping unparsable message: ", ($@ || "unknown error\n");
    $parser->filer->purge;
    next;
  }

  # Always purge the on-disk body files, even if the scan itself failed.
  eval { split_entity($entity); 1 }
    or warn "\nFailed to scan message: ", ($@ || "unknown error\n");
  $entity->purge();

}

### output
print STDERR "\n" if $debug;
print "$_\n" for sort keys %hosts;

exit;

### sub land
# split_entity($entity)
#
# Recursively walks a MIME entity.  Multipart entities are descended into
# part by part; leaf entities whose effective type is message/* or text/*
# have their decoded body scanned by the file-level $finder, whose callback
# records the hosts it sees.
#
#   $entity - the entity (or sub-part) to examine
#
# The return value is not meaningful; callers use this for its side effect
# on the URI finder only.
sub split_entity {
  my ($entity) = @_;
  return unless defined $entity;

  my @parts = $entity->parts;   # empty list for a leaf entity

  if (@parts) {
    split_entity($_) for @parts;
    return;
  }

  return unless $entity->effective_type =~ m{^(?:message|text)/};

  # A leaf entity is not guaranteed to carry a body; skip it rather than
  # die on an undefined body handle.
  my $body = $entity->bodyhandle
    or return;

  # The finder takes a reference to the text, so give it a real variable
  # instead of a reference to a temporary.
  my $text = $body->as_string;
  $finder->find(\$text) if defined $text;

  return;
}

###-fin-
