#!/usr/bin/perl
require 5;
use strict;    # Time-stamp: "2000-11-24 23:36:13 MST"
# no utf;  # Uncomment under Perl 5.6 or later

=head1 NAME

rtf2xml -- dump an RTF document's structure as XML

=head1 SYNOPSIS

  % cat foo.rtf
  {\stylesheet{\fs20 \snext0 Normal;}}
  % rtf2xml < foo.rtf
  <_><stylesheet/><_><fs _='20'/><snext _='0'/>Normal;</_></_>

=head1 DESCRIPTION

This program is a filter that takes an RTF stream on STDIN and emits
its document tree structure as XML on STDOUT.  It assumes that the
input is well-formed RTF.

=head1 COPYRIGHT

Copyright (c) 2000 Sean M. Burke.  All rights reserved.

This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=head1 AUTHOR

Sean M. Burke C<sburke@cpan.org>

=cut

binmode(STDIN);
die "$0 takes no command-line parameters.  Provide input on STDIN.\n" if @ARGV;

# For a modular approach to RTF parsing, see RTF::Parser in CPAN,
#  http://search.cpan.org/search?dist=RTF-Parser

my $open_count = 0;
while(<STDIN>) {
  while( # Iterate over tokens on each line
   m<
    \G(?:
      ([{}])  # \1
      |
      (?: \\
          ([a-z]+)       # \2: keyword
          (-?\d+)?       # \3: number
          \x20?
      )
      |
      (?: \\ 
        (?:  (?: '([0-9a-fA-F]{2})) # \4: hex escape
              | ([-_|~:*\cm\cj{}])) # \5: magic character
        )
      |
      ([\cm\cj]+) # \6
      |
      ([<>&])     # \7
      |
      ([^\\{}\cm\cj]+)   # \8: unescaped character data (not [<>&])
    )
  >sgx) {
    if( defined $1 ) {
      if($1 eq '{') {
	++$open_count;  print '<_>';
      } else {
        exit if --$open_count < 0;   print '</_>';
      }
    } elsif( defined $2 ) {
      #print "\n<!-- Left: [", substr($_, pos($_)), "] -->";
      if(defined($3)) {
        if($2 ne 'bin') {
          print "<$2 _='$3'/>"; # always -+\d+, so needs no escaping
        } else { # special case: the 'bin' word.
          if($3 < 1) { # sanity
            print "<bin _=''/>";
          } elsif($3 < (length $_ - pos($_))) {  # all here
            my $b = substr($_,pos($_), $3);
            pos($_) += $3; # skip over that stuff;
            $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg;
            print "<bin _='$b'/>";
          } else {
            my $b = substr($_,pos($_), $3);
            my $l = $3 - length $b;
            $_ = ''; # clear buffer
            $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg;
            print "<bin _='$b";
            while($l > 0 and !eof) {
              $l -= read(STDIN, $b, ($l > 512) ? 512 : $l);
                # Decent block size, I guess.
              $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg;
              print $b;
            }
            print "'/>";
          }
        }
      } else {
        print "<$2/>"; # always a-z, so needs no escaping
      }
    } elsif( defined $4 ) {
      print "&#x$4;";
    } elsif( defined $5 ) {
      # ([-_|~:*\cm\cj{}]) )    # \5: magic character of some sort
      if($5 eq '*')      { print '<S/>'        } # S for Star.
      elsif($5 eq "\cm"
         or $5 eq "\cj") { print '<par/>'      }
      elsif($5 eq '-')   { print '&shy;'       }
      elsif($5 eq '|')   { print '<Formula/>'  }
      elsif($5 eq '_')   { print '&#x2011;'    } # nonbreaking hyphen
      elsif($5 eq ':')   { print '<Subentry/>' }
      elsif($5 eq '~')   { print '&nbsp;'      }
      else               { print $5            }   # { or }
    } elsif( defined $6 ) {
      # no-op
    } elsif(defined $7) {
      if($7 eq '&')    { print '&amp;' }
      elsif($7 eq '<') { print '&lt;'  }
      else             { print '&gt;'  } 
    } else {
      print $8;
    }
  }
}