--- Words.pm.dist 2006-05-29 14:51:47.457609961 +0200 +++ Words.pm 2006-05-29 18:11:07.382179539 +0200 @@ -22,6 +22,11 @@ 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ', ); + ### Decode the string into a wide-character (utf-8) string: + $decoded = decode_mimewords_utf8( + 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ', + ); + ### Split string into array of decoded [DATA,CHARSET] pairs: @decoded = decode_mimewords( 'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= ', @@ -64,26 +69,25 @@ =cut -require 5.001; +require 5.006; ### Pragmas: use strict; -use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA); +use warnings; ### Exporting: -use Exporter; -%EXPORT_TAGS = (all => [qw(decode_mimewords - encode_mimeword - encode_mimewords +use base qw(Exporter); +our %EXPORT_TAGS = (all => [qw(decode_mimewords + decode_mimewords_utf8 + encode_mimeword + encode_mimewords )]); Exporter::export_ok_tags('all'); -### Inheritance: -@ISA = qw(Exporter); - ### Other modules: use MIME::Base64; use MIME::QuotedPrint; +use Encode; @@ -94,10 +98,10 @@ #------------------------------ ### The package version, both in 1.23 style *and* usable by MakeMaker: -$VERSION = "5.417"; +our $VERSION = "5.417"; ### Nonprintables (controls + x7F + 8bit): -my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF"; +my $NONPRINT = "\\x00-\\x1F\\x7F-\\x{FFFF}"; #------------------------------ @@ -117,7 +121,7 @@ # almost, but not exactly, quoted-printable. :-P sub _encode_Q { my $str = shift; - $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog; + $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog; $str; } @@ -239,6 +243,47 @@ #------------------------------ +=item decode_mimewords_utf8 ENCODED, [OPTS...] + +I +Go through the string looking for RFC-1522-style "Q" +(quoted-printable, sort of) or "B" (base64) encoding, and decode them. + +Returns an UTF-8 (wide-character) scalar (or, a single-element array in +an array context) with the decoded data. + + $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= '; + binmode STDOUT, ':utf8'; + print decode_mimewords_utf8($enc); + +Any arguments past the ENCODED string are taken to define a hash of options: + +=over 4 + +=item Field + +Name of the mail field this string came from. I + +=back + +=cut + +sub decode_mimewords_utf8 { + my $encstr = shift; + my %params = @_; + + return undef if !defined $encstr; + + my $rv = ''; + + foreach (decode_mimewords($encstr, %params)) { + $rv .= Encode::decode($_[1] || 'us-ascii', $_[0]); + } + return $rv; +} + +#------------------------------ + =item encode_mimeword RAW, [ENCODING], [CHARSET] I @@ -249,14 +294,22 @@ $encoded = encode_mimeword("\xABFran\xE7ois\xBB"); You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">. -You may specify the CHARSET, which defaults to C. +You may specify the CHARSET, which defaults to C, if the +word is a wide-character (utf-8) string, or C otherwise. =cut sub encode_mimeword { my $word = shift; my $encoding = uc(shift || 'Q'); - my $charset = uc(shift || 'ISO-8859-1'); + my $charset = uc(shift); + + if (Encode::is_utf8($word)) { + $charset ||= 'UTF-8'; + $word = Encode::encode($charset, $word); + } else { + $charset ||= 'ISO-8859-1'; + } my $encfunc = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B); "=?$charset?$encoding?" . &$encfunc($word) . "?="; } @@ -279,8 +332,9 @@ =item Charset -Encode all unsafe stuff with this charset. Default is 'ISO-8859-1', -a.k.a. "Latin-1". +Encode all unsafe stuff with this charset. Default is 'UTF-8' if +the input is a wide-character (utf-8) string, or 'ISO-8859-1', +a.k.a. "Latin-1" otherwise. =item Encoding @@ -303,20 +357,66 @@ sub encode_mimewords { my ($rawstr, %params) = @_; - my $charset = $params{Charset} || 'ISO-8859-1'; + my $charset = $params{Charset}; my $encoding = lc($params{Encoding} || 'q'); + return undef if !defined $rawstr; # Just to be orthogonal + + if (Encode::is_utf8($rawstr)) { + $charset ||= 'UTF-8'; + } else { + $charset ||= 'ISO-8859-1'; + } + ### Encode any "words" with unsafe characters. - ### We limit such words to 18 characters, to guarantee that the - ### worst-case encoding give us no more than 54 + ~10 < 75 characters - my $word; - $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]{1,18})}{ ### get next "word" - $word = $1; - (($word !~ /[$NONPRINT]/o) - ? $word ### no unsafe chars - : encode_mimeword($word, $encoding, $charset)); ### has unsafe chars - }xeg; - $rawstr; + ### FIXME: We limit such words to 18 characters (which is still + ### not enough to guarantee that the worst-case encoding + ### to < 75 characters (for example with multi-byte UTF-8 characters + ### and Quoted-Printable) + ### Also beware that RFC1522 specifies that if two adjacent encoded + ### strings are found, the whitespace between them is to be ignored. + ### So we add the whitespace to the second encoded word. + my $rv = ''; + my $stored_spaces = ''; + my $last_was_encoded; +WORD: + for my $word (split(/\b/, $rawstr)) { + # print STDERR "Split: got word <$word>\n"; + if ($last_was_encoded) { + if ($word =~ /^\s+$/) { # Whitespaces-after-nonprintable + $stored_spaces .= $word; + next WORD; + } elsif ($word =~ /^[^$NONPRINT]+$/o) { # Printable-after-nonprintable + $rv .= $stored_spaces . $word; + $stored_spaces = ''; + $last_was_encoded = 0; + next WORD; + } else { # Nonprintable-after-nonprintable + $word = $stored_spaces . $word; + $stored_spaces = ''; + $rv .= ' '; # separate two encoded words by a single space. + # Fall through + } + } else { + if ($word =~ /^[^$NONPRINT]+$/o) { # Spaces or printable + $rv .= $word; + next WORD; + } + # Nonprintable: fall through + } + + # Now we have a word with non-printable chars to encode. + # According to RFC1522, when spliting a long word to multiple + # encoded words, it should be joined by newline-space pair. + $rv .= join "\n ", map + { encode_mimeword($_, $encoding, $charset) } + ($word =~ /.{1,18}/g); + $last_was_encoded = 1; + } + + $rv .= $stored_spaces; # Trailing space after an encoded word + + return $rv; } 1;