#!/usr/local/bin/perl -w
## rtfcitations - extracts RefDB citations from an RTF document
## and outputs them as a citationlistx document
## citations must be written as [[cit-Z]], multi-head citations as
## [[cit1-Z][cit2-Z]...], where cit, cit1, cit2 are citation keys from
## a RefDB database, and -Z is one of -X, -Q, -A, -S, and -Y, denoting
## the type of the citation
## usage: rtfcitations < source.rtf > source.id.xml

## markus@mhoenicka.de 2007-12-26

##   This program is free software; you can redistribute it and/or modify
##   it under the terms of the GNU General Public License as published by
##   the Free Software Foundation; either version 2 of the License, or
##   (at your option) any later version.
##   
##   This program is distributed in the hope that it will be useful,
##   but WITHOUT ANY WARRANTY; without even the implied warranty of
##   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
##   GNU General Public License for more details.
   
##   You should have received a copy of the GNU General Public License
##   along with this program; if not, see <http://www.gnu.org/licenses/>

use strict;
use warnings;

## Running number used to label multi-head citations (IM1, IM2, ...).
## It is incremented further down, once per multi-head citation.
my $multi = 1;

## Command-line handling. The only switch is:
## -h: prints help
## Getopt::Std is not loaded; with a single switch the first argument is
## simply tested directly. Should more switches be added,
## getopts('h', \%opts) would be the natural replacement for this test.
##
## @ARGV is checked for emptiness first: the documented invocation
## (rtfcitations < source.rtf) passes no arguments at all, and comparing
## the then-undefined $ARGV[0] with eq raises an "uninitialized value"
## warning when warnings are enabled.
if (@ARGV && $ARGV[0] eq "-h") {
    print "rtfcitations reads a plain text document from stdin and scans it for RefDB citations (such as \"[[Miller1999-X]]\" or \"[[Miller1999-X][Doe2001-X]]\"). These citations are written to stdout in XML format using the citationlistx.dtd.\nCurrently there are no command-line options.\n";
    exit(0);
}

## start output: XML declaration, document type declaration, and the
## opening tag of the root element (closed again at the end of the run).
## The heredoc is non-interpolating, so the text is emitted verbatim.
## NOTE(review): the DOCTYPE name is upper-case while the root element is
## lower-case, and XML names are case-sensitive - confirm with the
## consumers of this output before changing either one.
print <<'XML_PROLOG';
<?xml version="1.0"?>
<!DOCTYPE CITATIONLIST PUBLIC "-//Markus Hoenicka//DTD CitationList V1.0//EN" "http://refdb.sourceforge.net/dtd/citationlistx-1.0/citationlistx.dtd">
<citationlist>
XML_PROLOG

## Escape the characters that are markup-significant in XML character
## data, so that a citation key can never break the well-formedness of
## the generated document.
sub _xml_escape {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;    # must run first, or it would re-escape the others
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}

## Extract the citation key from a single citation head.
## A head is what is left of one "[key-Z]" group after split() consumed
## the opening bracket, i.e. "key-Z]"; Z may be any single character.
## Returns the XML-escaped key, or nothing (undef in scalar context) if
## the head does not have that form.
sub _citation_key {
    my ($head) = @_;
    return unless $head =~ /^(.+)-.\]$/;
    my $key = $1;
    return _xml_escape($key);
}

## Turn one "[[...]]" token into the output lines of a <citation> element.
##   $token     - the citation as found in the text, e.g. "[[a-X][b-X]]"
##   $multi_ref - reference to the running multi-head counter; it is
##                incremented once for every token with more than one head
## Returns the lines of the element without trailing newlines. Heads that
## are not of the form "key-Z]" are left out and reported on STDERR
## instead of being copied into the XML as stray text.
sub _citation_lines {
    my ($token, $multi_ref) = @_;

    ## remove outer pair of square brackets
    (my $inner = $token) =~ s/^\[(.+)\]$/$1/;

    ## split into individual heads; the text in front of the first "[" is
    ## an empty leading item, which is dropped
    my @heads = split(/\[/, $inner);
    shift(@heads);

    my @lines = ("  <citation>");

    if (scalar @heads > 1) {
        ## multi-head citations require an additional xref element that
        ## carries the running IM number and the key of the first head
        my $first_key = _citation_key($heads[0]);
        if (defined $first_key) {
            push @lines,
                "      <xref endterm=\"IM${$multi_ref}\">ID$first_key</xref>";
        }
        ## counted even when the first head is malformed, so the numbering
        ## of the following multi-head citations does not shift
        ${$multi_ref}++;
    }

    foreach my $head (@heads) {
        my $key = _citation_key($head);
        if (defined $key) {
            push @lines, "      <xref>ID$key</xref>";
        }
        else {
            ## no trailing newline: Perl then appends the input line number
            warn "rtfcitations: ignoring malformed citation head \"$head\" in \"$token\"";
        }
    }

    push @lines, "  </citation>";
    return @lines;
}

## read data from stdin
while (my $line = <>) {
    chomp $line;

    ## split each line into tokens separated by citations. As the regular
    ## expression used in split() contains parentheses, the citations
    ## are returned too as members of the result array

    ## the +? sequence obtains the shortest possible match to make sure
    ## consecutive citations in the same line are treated separately
    foreach my $token (split(/(\[\[.+?\]\])/, $line)) {
        ## use only the citations for further processing
        next unless $token =~ /\[\[.+\]\]/;

        print "$_\n" for _citation_lines($token, \$multi);
    }
}

## end output: close the root element that was opened before the input
## was read
print '</citationlist>' . "\n";

## the end: leave with a success status
exit(0);


