--- Words.pm.dist	2006-05-29 14:51:47.457609961 +0200
+++ Words.pm	2006-05-29 18:11:07.382179539 +0200
@@ -22,6 +22,11 @@
           'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
           );
     
+    ### Decode the string into a wide-character (utf-8) string:
+    $decoded = decode_mimewords_utf8(
+          'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
+          );
+    
     ### Split string into array of decoded [DATA,CHARSET] pairs:
     @decoded = decode_mimewords(
           'To: =?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>',
@@ -64,26 +69,25 @@
 
 =cut
 
-require 5.001;
+require 5.006;
 
 ### Pragmas:
 use strict;
-use vars qw($VERSION @EXPORT_OK %EXPORT_TAGS @ISA);
+use warnings;
 
 ### Exporting:
-use Exporter;
-%EXPORT_TAGS = (all => [qw(decode_mimewords
-			   encode_mimeword
-			   encode_mimewords
+use base qw(Exporter);
+our %EXPORT_TAGS = (all => [qw(decode_mimewords
+			       decode_mimewords_utf8
+			       encode_mimeword
+			       encode_mimewords
 			   )]);
 Exporter::export_ok_tags('all');
 
-### Inheritance:
-@ISA = qw(Exporter);
-
 ### Other modules:
 use MIME::Base64;
 use MIME::QuotedPrint;
+use Encode;
 
 
 
@@ -94,10 +98,10 @@
 #------------------------------
 
 ### The package version, both in 1.23 style *and* usable by MakeMaker:
-$VERSION = "5.417";
+our $VERSION = "5.417";
 
 ### Nonprintables (controls + x7F + 8bit):
-my $NONPRINT = "\\x00-\\x1F\\x7F-\\xFF"; 
+my $NONPRINT = "\\x00-\\x1F\\x7F-\\x{FFFF}"; 
 
 
 #------------------------------
@@ -117,7 +121,7 @@
 #     almost, but not exactly, quoted-printable.  :-P
 sub _encode_Q {
     my $str = shift;
-    $str =~ s{([_\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
+    $str =~ s{([ _\?\=$NONPRINT])}{sprintf("=%02X", ord($1))}eog;
     $str;
 }
 
@@ -239,6 +243,47 @@
 
 #------------------------------
 
+=item decode_mimewords_utf8 ENCODED, [OPTS...]
+
+I<Function.>
+Go through the string looking for RFC-1522-style "Q"
+(quoted-printable, sort of) or "B" (base64) encoding, and decode them.
+
+Returns an UTF-8 (wide-character) scalar (or, a single-element array in
+an array context) with the decoded data. 
+
+    $enc = '=?ISO-8859-1?Q?Keld_J=F8rn_Simonsen?= <keld@dkuug.dk>';
+    binmode STDOUT, ':utf8';
+    print decode_mimewords_utf8($enc);
+
+Any arguments past the ENCODED string are taken to define a hash of options:
+
+=over 4
+
+=item Field
+
+Name of the mail field this string came from.  I<Currently ignored.>
+
+=back
+
+=cut
+
+sub decode_mimewords_utf8 {
+    my $encstr = shift;
+    my %params = @_;
+
+    return undef if !defined $encstr;
+
+    my $rv = '';
+
+    foreach (decode_mimewords($encstr, %params)) {
+	$rv .= Encode::decode($_[1] || 'us-ascii', $_[0]);
+    }
+    return $rv;
+}
+
+#------------------------------
+
 =item encode_mimeword RAW, [ENCODING], [CHARSET]
 
 I<Function.>
@@ -249,14 +294,22 @@
     $encoded = encode_mimeword("\xABFran\xE7ois\xBB");
 
 You may specify the ENCODING (C<"Q"> or C<"B">), which defaults to C<"Q">.
-You may specify the CHARSET, which defaults to C<iso-8859-1>.
+You may specify the CHARSET, which defaults to C<utf-8>, if the
+word is a wide-character (utf-8) string, or C<iso-8859-1> otherwise.
 
 =cut
 
 sub encode_mimeword {
     my $word = shift;
     my $encoding = uc(shift || 'Q');
-    my $charset  = uc(shift || 'ISO-8859-1');
+    my $charset = uc(shift);
+
+    if (Encode::is_utf8($word)) {
+	$charset ||= 'UTF-8';
+	$word = Encode::encode($charset, $word);
+    } else {
+        $charset ||= 'ISO-8859-1';
+    }
     my $encfunc  = (($encoding eq 'Q') ? \&_encode_Q : \&_encode_B);
     "=?$charset?$encoding?" . &$encfunc($word) . "?=";
 }
@@ -279,8 +332,9 @@
 
 =item Charset
 
-Encode all unsafe stuff with this charset.  Default is 'ISO-8859-1',
-a.k.a. "Latin-1".
+Encode all unsafe stuff with this charset.  Default is 'UTF-8' if
+the input is a wide-character (utf-8) string, or 'ISO-8859-1',
+a.k.a. "Latin-1" otherwise.
 
 =item Encoding
 
@@ -303,20 +357,66 @@
 
 sub encode_mimewords {
     my ($rawstr, %params) = @_;
-    my $charset  = $params{Charset} || 'ISO-8859-1';
+    my $charset  = $params{Charset};
     my $encoding = lc($params{Encoding} || 'q');
 
+    return undef if !defined $rawstr; # Just to be orthogonal
+
+    if (Encode::is_utf8($rawstr)) {
+	$charset ||= 'UTF-8';
+    } else {
+	$charset ||= 'ISO-8859-1';
+    }
+
     ### Encode any "words" with unsafe characters.
-    ###    We limit such words to 18 characters, to guarantee that the 
-    ###    worst-case encoding give us no more than 54 + ~10 < 75 characters
-    my $word;
-    $rawstr =~ s{([a-zA-Z0-9\x7F-\xFF]{1,18})}{     ### get next "word"
-	$word = $1;
-	(($word !~ /[$NONPRINT]/o) 
-	 ? $word                                          ### no unsafe chars
-	 : encode_mimeword($word, $encoding, $charset));  ### has unsafe chars
-    }xeg;
-    $rawstr;
+    ###    FIXME: We limit such words to 18 characters (which is still
+    ###    not enough to guarantee that the worst-case encoding
+    ###    to < 75 characters (for example with multi-byte UTF-8 characters
+    ###    and Quoted-Printable)
+    ### Also beware that RFC1522 specifies that if two adjacent encoded
+    ###    strings are found, the whitespace between them is to be ignored.
+    ###    So we add the whitespace to the second encoded word.
+    my $rv = '';
+    my $stored_spaces = '';
+    my $last_was_encoded;
+WORD:
+    for my $word (split(/\b/, $rawstr)) {
+	# print STDERR "Split: got word <$word>\n";
+        if ($last_was_encoded) {
+            if ($word =~ /^\s+$/) {  # Whitespaces-after-nonprintable
+                $stored_spaces .= $word;
+                next WORD;
+            } elsif ($word =~ /^[^$NONPRINT]+$/o) { # Printable-after-nonprintable
+                $rv .= $stored_spaces . $word;
+                $stored_spaces = '';
+                $last_was_encoded = 0;
+                next WORD;
+            } else { # Nonprintable-after-nonprintable
+                $word = $stored_spaces . $word;
+                $stored_spaces = '';
+		$rv .= ' '; # separate two encoded words by a single space.
+                # Fall through
+            }
+        } else {
+            if ($word =~ /^[^$NONPRINT]+$/o) { # Spaces or printable
+                $rv .= $word;
+                next WORD;
+            }
+            # Nonprintable: fall through
+        }
+
+        # Now we have a word with non-printable chars to encode.
+        # According to RFC1522, when spliting a long word to multiple
+        # encoded words, it should be joined by newline-space pair.
+        $rv .= join "\n ", map
+            { encode_mimeword($_, $encoding, $charset) }
+            ($word =~ /.{1,18}/g);
+        $last_was_encoded = 1;
+    }
+
+    $rv .= $stored_spaces; # Trailing space after an encoded word
+
+    return $rv;
 }
 
 1;