#!/usr/bin/perl require 5; use strict; # Time-stamp: "2000-11-24 23:36:13 MST" # no utf; # Uncomment under Perl 5.6 or later =head1 NAME rtf2xml -- dump an RTF document's structure as XML =head1 SYNOPSIS % cat foo.rtf {\stylesheet{\fs20 \snext0 Normal;}} % rtf2xml < foo.rtf <_><stylesheet/><_><fs _='20'/><snext _='0'/>Normal;</_></_> =head1 DESCRIPTION This program is a filter that takes an RTF stream on STDIN and emits its document tree structure as XML on STDOUT. It assumes that the input is well-formed RTF. =head1 COPYRIGHT Copyright (c) 2000 Sean M. Burke. All rights reserved. This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. =head1 AUTHOR Sean M. Burke C<sburke@cpan.org> =cut binmode(STDIN); die "$0 takes no command-line parameters. Provide input on STDIN.\n" if @ARGV; # For a modular approach to RTF parsing, see RTF::Parser in CPAN, # http://search.cpan.org/search?dist=RTF-Parser my $open_count = 0; while(<STDIN>) { while( # Iterate over tokens on each line m< \G(?: ([{}]) # \1 | (?: \\ ([a-z]+) # \2: keyword (-?\d+)? # \3: number \x20? ) | (?: \\ (?: (?: '([0-9a-fA-F]{2})) # \4: hex escape | ([-_|~:*\cm\cj{}])) # \5: magic character ) | ([\cm\cj]+) # \6 | ([<>&]) # \7 | ([^\\{}\cm\cj]+) # \8: unescaped character data (not [<>&]) ) >sgx) { if( defined $1 ) { if($1 eq '{') { ++$open_count; print '<_>'; } else { exit if --$open_count < 0; print '</_>'; } } elsif( defined $2 ) { #print "\n<!-- Left: [", substr($_, pos($_)), "] -->"; if(defined($3)) { if($2 ne 'bin') { print "<$2 _='$3'/>"; # always -+\d+, so needs no escaping } else { # special case: the 'bin' word. if($3 < 1) { # sanity print "<bin _=''/>"; } elsif($3 < (length $_ - pos($_))) { # all here my $b = substr($_,pos($_), $3); pos($_) += $3; # skip over that stuff; $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg; print "<bin _='$b'/>"; } else { my $b = substr($_,pos($_), $3); my $l = $3 - length $b; $_ = ''; # clear buffer $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg; print "<bin _='$b"; while($l > 0 and !eof) { $l -= read(STDIN, $b, ($l > 512) ? 512 : $l); # Decent block size, I guess. $b =~ s/([^\x21-\x25\x27-\x3B\x3D\x3F-\x7E])/'&#'.(ord($1)).';'/eg; print $b; } print "'/>"; } } } else { print "<$2/>"; # always a-z, so needs no escaping } } elsif( defined $4 ) { print "&#x$4;"; } elsif( defined $5 ) { # ([-_|~:*\cm\cj{}]) ) # \5: magic character of some sort if($5 eq '*') { print '<S/>' } # S for Star. elsif($5 eq "\cm" or $5 eq "\cj") { print '<par/>' } elsif($5 eq '-') { print '­' } elsif($5 eq '|') { print '<Formula/>' } elsif($5 eq '_') { print '‑' } # nonbreaking hyphen elsif($5 eq ':') { print '<Subentry/>' } elsif($5 eq '~') { print ' ' } else { print $5 } # { or } } elsif( defined $6 ) { # no-op } elsif(defined $7) { if($7 eq '&') { print '&' } elsif($7 eq '<') { print '<' } else { print '>' } } else { print $8; } } }