#!/usr/bin/perl
# This Perl script extracts the genus, species, and page information 
# from an xml document marked up in the biolit.dtd.

use strict;
use warnings;
use XML::Twig;

print STDOUT "Enter input file:  ";
my $fin = <STDIN>;
chomp $fin;

print STDOUT "Enter filename for output:  ";
my $fout = <STDIN>;
chomp $fout;

# Open XML file and read into string
open(FIN, $fin) or die "Couldn't open file";
my $xml;

my $bytes = read(FIN, $xml, 524288);
if ($bytes > 524288) {
  print "Buffer overflow";
  exit 1;
}

close FIN;

# Add DTD
my $dtd = "<!DOCTYPE BIOLIT [ <!ENTITY % HTMLlat1 PUBLIC \"-//W3C//ENTITIES Latin1//EN//HTML\" \"HTMLlat1.ent\"> %HTMLlat1; ]>\n";

$xml = $dtd . $xml;


# Twigify it
my $t= XML::Twig->new();
$t->parse($xml);

my $root = $t->root;

my @entries = $root->children('ENTRY');

# Get organisms and page info
my @results = ();
foreach my $entry (@entries) {
	my $org = $entry->first_child('ORGANISM');

	my $genus = $org->{'att'}->{'GENUS'};

	my $species;
        if ($org->{'att'}->{'SPECIES'}) {
		$species = $org->{'att'}->{'SPECIES'};
	} else {
		$species = "";
	}

	my $page;
	if ($entry->first_child('IMAGE')) {
		$page = $entry->first_child('IMAGE')->{'att'}->{'PAGE'};
	} else {
		$page = "";
	}

#	print $genus . $species . $page;
	if ($species ne "") {
	  push @results, $genus . " " . $species . "-p." . $page;
	} else {
	  push @results, $genus . "-p." . $page;
	}
}


# Remove duplicates
my %seen = ();
foreach my $item (@results) {
    $seen{$item}++;
}
my @uniq = keys %seen;

# Print sorted list joined by comma
open(OUTFILE, ">$fout") or die;
print OUTFILE join ", ", sort(@uniq);
close OUTFILE;
