# # Explode.pm # Last Modification: Sun Jun 26 21:19:40 WEST 2011 # # Copyright (c) 2011 Henrique Dias . # All rights reserved. # This module is free software; you can redistribute it and/or modify # it under the same terms as Perl itself. # package MIME::Explode; use strict; use Carp; require Exporter; require DynaLoader; require AutoLoader; use SelfLoader; use vars qw($VERSION @ISA @EXPORT); @ISA = qw(Exporter DynaLoader); @EXPORT = qw(&rfc822_base64 &rfc822_qprint); $VERSION = '0.39'; use constant BUFFSIZE => 64; my %h_hash = ( 'content-type' => "", 'content-disposition' => "", 'content-transfer-encoding' => "", ); my @patterns = ( '^([^= ]+) *=[ \"]*([^\"]+)', '^(\w[\w\-\.]*):[\x20\x09]*([^\x0d\x0a\f]*)[\x0d\x0a\f]+', '^[\x0a\x0d]+$', '^begin\s*(\d\d\d)\s*(\S+)', '^From +[^ ]+ +[a-zA-Z]{3} [a-zA-Z]{3} [ \d]\d \d\d:\d\d:\d\d \d{4}( [\+\-]\d\d\d\d)?[\x0a\x0d]+', '^[\x20\x09]+(?=.*[^\x0a\x0d]+)', '^[\x20\x09]+\w+\=[^\=]+' ); my %content_type = ( "text/html" => ".html", "text/plain" => ".txt", "message/rfc822" => ".rfc822", "text/richtext" => ".richtext", ); SelfLoader->load_stubs(); sub new { my $proto = shift; my $class = ref($proto) || $proto; my $self = { output_dir => "/tmp", mkdir => 0755, decode_subject => 0, check_content_type => 0, content_types => [], types_action => "include", @_, }; bless($self, $class); $self->init(); return($self); } sub init { my $self = shift; return() if((-d $self->{'output_dir'}) || !$self->{'mkdir'}); mkdir($self->{'output_dir'}, $self->{'mkdir'}) or die(join("", "MIME::Explode: Failed to create directory \"", $self->{'output_dir'}, "\": $!")); return(); } sub clean_all { my $self = shift; my $dir = $self->{'output_dir'}; opendir(DIRECTORY, $dir) or return("Can't opendir \"$dir\": $!\n"); while(defined(my $file = readdir(DIRECTORY))) { next if($file =~ /^\.\.?$/); my $path = "$dir/$file"; if(my ($f) = ($path =~ /^(.+)$/)) { unlink($f) or return("Couldn't unlink \"$f\" file: $!"); } } closedir(DIRECTORY); rmdir($dir) or return("Couldn't rmdir \"$dir\" directory: $!"); return(); } sub parse { my $self = shift; local $/ = "\n"; my %headers = (); my %args = ( 'output_dir' => $self->{'output_dir'}, 'check_ctype' => $self->{'check_content_type'} || 0, 'decode_subject' => $self->{'decode_subject'}, 'ctypes' => {}, 'types_action' => $self->{'types_action'} eq "include" ? 1 : 0, ); $self->{'content_types'} = $self->{'exclude_types'} if(exists($self->{'exclude_types'}) && scalar(@{$self->{'exclude_types'}})); if(scalar(@{$self->{'content_types'}})) { my %ctypes = (); @ctypes{@{$self->{'content_types'}}} = (0 .. $#{$self->{'content_types'}}); $args{'ctypes'} = \%ctypes; } my $last = &_parse(\@_, 1, 0, "0", "", \%args, {}, \%headers); $self->{nmsgs} = ($last->[0]) ? (split(/\./, $last->[0]))[0] + 1 : 0; my ($fh_mail, $fh_tmp) = @_; if(defined($fh_tmp)) { while(<$fh_mail>) { print $fh_tmp $_; } } return(\%headers); } sub nmsgs { $_[0]->{'nmsgs'} } sub _parse { my $fhs = shift; my $header = shift; my $mbox = shift || 0; my $base = shift || "0"; my $origin = shift || ""; my $args = shift; my $files = shift; my ($fh_mail, $fh_tmp) = @{$fhs}; my ($tree, $key, $tmpbuff, $boundary, $ftmp) = (join("\.", $base, "0"), "", "", "", ""); my ($check_ctype, $ctlength) = (1, 0); my ($ph, $tmp, $exclude, $attcount, $checkhdr) = (0, 0, 0, 0, 0); my $fh; while(local $_ = <$fh_mail>) { defined($fh_tmp) and print $fh_tmp $_; if($header) { ($ph, $attcount, $exclude, $tmpbuff, $check_ctype, $ctlength, $ftmp) = (1, 0, 0, "", 1, 0, ""); if(!$mbox && $base eq "0" && /$patterns[4]/o) { $mbox = 1; next; } if(exists($_[0]->{$tree}->{$key})) { s/\x0d//og; if(s/$patterns[5]/ /o) { s/\s+$//o; if(ref($_[0]->{$tree}->{$key}) eq "ARRAY") { $_[0]->{$tree}->{$key}->[$#{$_[0]->{$tree}->{$key}}] .= $_; next; } if(ref($_[0]->{$tree}->{$key}) eq "HASH") { $_[0]->{$tree}->{$key}->{value} .= $_; } else { $key eq "subject" and $_[0]->{$tree}->{$key} =~ /\?\=$/o and s/^ (?=\=\?)//o; $_[0]->{$tree}->{$key} .= $_; } next; } if(exists($h_hash{$key}) && exists($_[0]->{$tree}->{$key}->{value})) { &header2hash($_[0]->{$tree}->{$key}, $_[0]->{$tree}->{$key}->{value}); } elsif($key eq "subject" && $args->{decode_subject}) { my @parts = &decode_mimewords($_[0]->{$tree}->{subject}); delete($_[0]->{$tree}->{subject}); $_[0]->{$tree}->{subject}->{value} = [map {$_->[0] || ""} @parts]; $_[0]->{$tree}->{subject}->{charset} = [map {$_->[1] || "us-ascii"} @parts]; } } elsif(/$patterns[6]/o) { next; } if(/$patterns[1]/o) { defined($fh) and &file_close($fh); ($header, $checkhdr) = (1, 1); $key = lc($1); if($key eq "received" || $key eq "x-received") { push(@{$_[0]->{$tree}->{$key}}, $2); next; } unless(exists($_[0]->{$tree}->{$key})) { $_[0]->{$tree}->{$key} = (exists($h_hash{$key})) ? {value => $2} : $2; } next; } next if(!$checkhdr && (length() <= 2) && /$patterns[2]/o); $header = 0; if(exists($_[0]->{$tree}->{'content-type'}) && exists($_[0]->{$tree}->{'content-type'}->{value})) { $_[0]->{$tree}->{'content-type'}->{value} = lc($_[0]->{$tree}->{'content-type'}->{value}); if(exists($_[0]->{$tree}->{'content-type'}->{boundary}) && $_[0]->{$tree}->{'content-type'}->{value} =~ /multipart\/\w+/o) { my $res = &_parse($fhs, $header, $mbox, $tree, $_[0]->{$tree}->{'content-type'}->{boundary}, $args, $files, $_[0]); if($res->[1]) { $mbox ? ($tmp = 1) : return([$tree, $res->[1]]); $_ = $res->[1]; } else { next; } } elsif($_[0]->{$tree}->{'content-type'}->{value} eq "message/rfc822") { my $res = &_parse($fhs, 1, $mbox, $tree, $origin, $args, $files, $_[0]); if($res->[1]) { $mbox ? ($tmp = 1) : return([$tree, $res->[1]]); $_ = $res->[1]; } else { next; } } } } $checkhdr = 0; $key = ""; defined($_) or next; if(/$patterns[3]/o) { my $file = &check_filename($files, $2); my $filepath = ($args->{output_dir}) ? join("/", $args->{output_dir}, $file) : $file; my $res = uu_file($fhs, $filepath, $1 || "644", { action => $args->{'types_action'}, mimetypes => $args->{'ctypes'} } ); $_[0]->{"$tree.$attcount"}->{'content-type'}->{value} = $res->[0]; $_[0]->{"$tree.$attcount"}->{'content-disposition'}->{filepath} = $filepath unless($res->[1]); $attcount++; next; } my $breakmsg = ""; unless(defined($fh)) { $boundary = $origin; if(exists($_[0]->{$tree}->{'content-type'}) && exists($_[0]->{$tree}->{'content-type'}->{value})) { $exclude = 1 if(($_[0]->{$tree}->{'content-type'}->{value} =~ /^multipart\/\w+$/o) || ($_[0]->{$tree}->{'content-type'}->{value} eq "message/rfc822")); } else { $check_ctype = 1; } unless($exclude) { if(exists($_[0]->{$tree}->{'content-transfer-encoding'}) && exists($_[0]->{$tree}->{'content-transfer-encoding'}->{value})) { $_[0]->{$tree}->{'content-transfer-encoding'}->{value} = lc($_[0]->{$tree}->{'content-transfer-encoding'}->{value}); if($_[0]->{$tree}->{'content-transfer-encoding'}->{value} eq "base64" || ($_[0]->{$tree}->{'content-transfer-encoding'}->{value} eq "quoted-printable" && $boundary)) { &set_filename($files, $_[0]->{$tree}); my $filepath = ($args->{output_dir}) ? join("/", $args->{output_dir}, $_[0]->{$tree}->{'content-disposition'}->{filename}) : $_[0]->{$tree}->{'content-disposition'}->{filename}; my $res = &decode_content($fhs, $_[0]->{$tree}->{'content-transfer-encoding'}->{value}, $filepath, $boundary ? "--$boundary" : "", { mimetype => $_[0]->{$tree}->{'content-type'}->{value} || "", checktype => $args->{'check_ctype'}, action => $args->{'types_action'}, mimetypes => $args->{'ctypes'}, mailbox => $mbox }); $_[0]->{$tree}->{'content-type'}->{value} = $res->[1] if($res->[1]); $_[0]->{$tree}->{'content-disposition'}->{filepath} = $filepath unless($res->[2]); $tmp = 1; unless($_ = $res->[0]) { $exclude = 1; next; } if($mbox && /$patterns[4]/o && scalar(@{[split(/\./o, $tree)]}) > 2) { $breakmsg = $_; $_ = "--$boundary--\r\n"; } } } } } if($mbox && /$patterns[4]/o) { if(scalar(@{[split(/\./o, $tree)]}) > 2) { $breakmsg = $_; $boundary ? ($_ = "--$boundary--\r\n") : return([$tree, $breakmsg]); } else { defined($fh) and &file_close($fh); $header = 1; my @ps = split(/\./o, $tree); $tree = join(".", ++$ps[0], "0"); next; } } $tmp = ((length() <= 2) && /$patterns[2]/o) ? 1 : 0; (defined($fh) || !$tmp) or next; if($boundary) { if(index($_, "--$boundary--") >= 0) { defined($fh) and &file_close($fh); if($mbox && scalar(@{[split(/\./o, $tree)]}) == 2) { ($tmp, $exclude) = (1, 1); $boundary = ""; next; } else { return([$tree, $breakmsg]); } } if(index($_, "--$boundary") >= 0) { defined($fh) and &file_close($fh); ($tmp, $header) = (1, 1); $boundary = ""; if($ph) { return([$tree]) if($_[0]->{$base}->{'content-type'}->{value} eq "message/rfc822"); my @ps = split(/\./o, $tree); $ps[$#ps]++; $tree = join("\.", @ps); } next; } } (!$exclude && $ph) or next; if($check_ctype && $args->{check_ctype}) { ($tmpbuff .= $_) =~ s/^[\n\r\t]+//o; if(length($tmpbuff) > BUFFSIZE) { $_[0]->{$tree}->{'content-type'}->{value} ||= ""; if(my $ct = set_content_type($tmpbuff, $_[0]->{$tree}->{'content-type'}->{value})) { $_[0]->{$tree}->{'content-type'}->{value} = $ct; $tmpbuff = ""; $check_ctype = 0; } if($exclude = exists($args->{'ctypes'}->{$_[0]->{$tree}->{'content-type'}->{value}}) ? ($args->{'types_action'} ? 0 : 1) : scalar(keys(%{$args->{'ctypes'}})) ? ($args->{'types_action'} ? 1 : 0) : ($args->{'types_action'} ? 0 : 1)) { if(defined($fh)) { &file_close($fh); unlink($_[0]->{$tree}->{'content-disposition'}->{filepath}); delete($_[0]->{$tree}->{'content-disposition'}->{filepath}); } next; } } } unless(defined($fh)) { &set_filename($files, $_[0]->{$tree}); $_[0]->{$tree}->{'content-disposition'}->{filepath} = ($args->{output_dir}) ? join("/", $args->{output_dir}, $_[0]->{$tree}->{'content-disposition'}->{filename}) : $_[0]->{$tree}->{'content-disposition'}->{filename}; defined($fh) and &file_close($fh); $fh = &file_open($_[0]->{$tree}->{'content-disposition'}->{filepath}); } if(defined($fh)) { if(!$ftmp && (length() <= 2) && /$patterns[2]/o) { $ftmp .= $_; next; } if($ftmp) { $_ = join("", $ftmp, $_); $ftmp = ""; } print $fh ($_[0]->{$tree}->{'content-transfer-encoding'}->{value} eq "quoted-printable") ? rfc822_qprint($_) : $_; exists($_[0]->{$tree}->{'content-length'}) or next; if(($ctlength += length()) >= $_[0]->{$tree}->{'content-length'}) { defined($fh) and &file_close($fh); $exclude = 1; next; } } } defined($fh) and &file_close($fh); return([$tree, ""]); } sub file_close { close($_[0]); undef($_[0]); } sub file_open { my $path = shift; local *FILE; if($path =~ /^(.+)$/) { $path = $1; } open(FILE, ">$path") or die("MIME::Explode: Couldn't open $path for writing: $!\n"); binmode(FILE); return *FILE; } sub header2hash { my $header = pop; my $params = semicolon_split($header); $_[0]->{value} = shift(@{$params}) || ""; map {/$patterns[0]/o and $_[0]->{lc($1)} = $2; } @{$params}; return(); } sub set_filename { my $files = shift; my $h = shift; my $file = "file"; if(exists($h->{'content-disposition'}->{filename})) { $file = $h->{'content-disposition'}->{filename}; } elsif(exists($h->{'content-type'}->{name})) { $file = $h->{'content-type'}->{name}; } elsif(exists($h->{'content-type'}->{value})) { my $ctype = lc($h->{'content-type'}->{value}); $file .= $content_type{$ctype} || ""; } $file =~ s/^[ \.]+$/file/o; $h->{'content-disposition'}->{filename} = &check_filename($files, $file); $h->{'content-transfer-encoding'}->{value} = "" unless(exists($h->{'content-transfer-encoding'}->{value})); return(); } bootstrap MIME::Explode $VERSION; 1; __DATA__ sub semicolon_split { my $str = shift || return([]); my @array = (); my $i = 0; for(split(/;/, $str)) { if(/\=/ or $i == 0) { s/^[\t ]+//; s/[\t ]+$//; $array[$i] = $_; $i++; } else { s/(?<=\")[\t ]+$//; $array[$i-1] .= "\;$_"; } } return(\@array); } sub check_filename { my $files = shift; my $rawfile = shift; my $file = &decode_mimewords($rawfile); $file =~ /[\/\\]?([^\/\\]+)$/o; $file = (length($1)) ? $1 : "file"; if(exists($files->{$file})) { my $n = $files->{$file}++; $file .= "-$n" unless($file =~ s/(\.[^\.]+)$/\-$n$1/o); } else { $files->{$file} = 1; } return($file); } sub decode_mimewords { my $encstr = shift; my @tokens = (); $@ = ''; $encstr =~ s/(\?\=)\r?\n[ \t](\=\?)/$1$2/ogs; pos($encstr) = 0; while (1) { last if(pos($encstr) >= length($encstr)); my $pos = pos($encstr); if($encstr =~ /\G=\?([^?]*)\?([bq])\?([^?]+)\?=/ogi) { my ($charset, $encoding, $enc) = ($1, lc($2), $3); my $dec = ($encoding eq "q") ? rfc822_qprint($enc) : rfc822_base64($enc); push(@tokens, [$dec, $charset]); next; } pos($encstr) = $pos; if($encstr =~ /\G=\?/g) { $@ .= qq|unterminated "=?..?..?=" in "$encstr" (pos $pos)\n|; push(@tokens, ['=?']); next; } pos($encstr) = $pos; if($encstr =~ /\G([\x00-\xFF]*?\n*)(?=(\Z|=\?))/og) { length($1) or die("MIME::Explode: internal logic err: empty token\n"); push(@tokens, [$1]); next; } die("MIME::Explode: unexpected case:\n($encstr) pos $pos\n"); } return (wantarray ? @tokens : join('',map {$_->[0]} @tokens)); } __END__ =head1 NAME MIME::Explode - Perl extension for explode MIME messages =head1 SYNOPSIS use MIME::Explode; my $explode = MIME::Explode->new( output_dir => "tmp", mkdir => 0755, decode_subject => 1, check_content_type => 1, content_types => ["image/gif", "image/jpeg", "image/bmp"], types_action => "exclude" ); print "Number of messages: ", $explode->nmsgs, "\n"; open(MAIL, "file.tmp") or die("Couldn't open file.tmp for writing: $!\n"); my $headers = $explode->parse(\*MAIL, \*OUTPUT); close(OUTPUT); close(MAIL); for my $part (sort{ $a cmp $b } keys(%{$headers})) { for my $k (keys(%{$headers->{$part}})) { if(ref($headers->{$part}->{$k}) eq "ARRAY") { for my $i (0 .. $#{$headers->{$part}->{$k}}) { print "$part => $k => $i => ", $headers->{$part}->{$k}->[$i], "\n"; } } elsif(ref($headers->{$part}->{$k}) eq "HASH") { for my $ks (keys(%{$headers->{$part}->{$k}})) { if(ref($headers->{$part}->{$k}->{$ks}) eq "ARRAY") { print "$part => $k => $ks => ", join(($ks eq "charset") ? " " : "", @{$headers->{$part}->{$k}->{$ks}}), "\n"; } else { print "$part => $k => $ks => ", $headers->{$part}->{$k}->{$ks}, "\n"; } print "$part => $k => $ks => ", $headers->{$part}->{$k}->{$ks}, "\n"; } } else { print "$part => $k => ", $headers->{$part}->{$k}, "\n"; } } } if(my $e = $explode->clean_all()) { print "Error: $e\n"; } =head1 DESCRIPTION MIME::Explode is perl module for parsing and decoding single or multipart MIME messages, and outputting its decoded components to a given directory ie, this module is designed to allows users to extract the attached files out of a MIME encoded email messages or mailboxes. =head1 METHODS =head2 new([, OPTION ...]) This method create a new MIME::Explode object. The following keys are available: =over 7 =item output_dir Directory where the decoded files are placed =item mkdir => octal_number If the value is set to octal number then make the output_dir directory (example: mkdir => 0755). =item check_content_type => 0 or 1 If the value is set to 1 the content-type of file is checked =item decode_subject => 0 or 1 If the value is set to 1 then the subject is decoded into a list. $header->{'0.0'}->{subject}->{value} = [ARRAYREF]; $header->{'0.0'}->{subject}->{charset} = [ARRAYREF]; $subject = join("", @{$header->{'0.0'}->{subject}->{value}}); =item exclude_types => [ARRAYREF] Not save files with specified content types (deprecated in next versions) =item content_types => [ARRAYREF] Array reference with content types for "include" or "exclude" =item types_action => "include" or "exclude" If the action is a "include", all attached files with specified content types are saved but if the action is a "exclude", no files are saved except if its in the array of content types. If no array is specified, but the action is a "include", all attached files are saved, otherwise all files are removed if action is a "exclude". The default action is "include". =back =head2 parse(FILEHANDLE, FILEHANDLE) This method parse the stream and splits it into its component entities. This method return a hash reference with all parts. The FILEHANDLE should be a reference to a GLOB. The second argument is optional. =head2 nmsgs Returns the number of parsed messages. =head2 clean_all Cleans all files from the "output_dir" directory and then removes the directory. If an error happens returns it. =head1 AUTHOR Henrique Dias =head1 CREDITS Thanks to Rui Castro for the revision. =head1 SEE ALSO MIME::Tools, perl(1). =cut