#!/usr/bin/perl
#
# Copyright (c) Michel Klein 2006 and Rinke Hoekstra 2007
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA
#
# This is a revised version of the bib2rdf script by Michel Klein.
#
# Any questions relating to this software should be directed to:
# hoekstra@uva.nl
#

use Text::BibTeX qw(:macrosubs :metatypes);
use Digest::SHA1  qw(sha1 sha1_hex);
use LWP::Simple;
use CGI;

# Landing page the user is redirected to when no BibTeX URL is supplied.
my $home = "http://www.leibnizcenter.org/~hoekstra/bib2rss/";

# Register the conventional three-letter BibTeX month macros so that
# "month = jan" expands to the full English month name.
#
# The macro names must be quoted strings: as a bareword, `oct` is parsed as
# Perl's builtin oct() (applied to $_), so the "oct" macro was never
# registered and October went missing from parsed entries.
my @month_macros = (
  [ 'jan', "January" ],
  [ 'feb', "February" ],
  [ 'mar', "March" ],
  [ 'apr', "April" ],
  [ 'may', "May" ],
  [ 'jun', "June" ],
  [ 'jul', "July" ],
  [ 'aug', "August" ],
  [ 'sep', "September" ],
  [ 'oct', "October" ],
  [ 'nov', "November" ],
  [ 'dec', "December" ],
);
for my $macro (@month_macros) {
  add_macro_text($macro->[0], $macro->[1]);
}

# BibTeX entry type => class name used in the "ow:" namespace. Types that
# are not listed here are handled by field2tag, which capitalises the first
# letter of the type.
%tags = (
  "inbook"        => "Inbook",
  "incollection"  => "InCollection",
  "conference"    => "InProceedings",
  "inproceedings" => "InProceedings",
  "mastersthesis" => "MasterThesis",
  "phdthesis"     => "PhDThesis",
  "techreport"    => "TechnicalReport",
  "unpublished"   => "Unpublished",
);

# Read the request parameters.
#   url         - location of the BibTeX file to convert (required)
#   description - text for the channel <description>
#   title       - text for the channel <title>
#   relative    - "on" to emit rdf:ID instead of absolute rdf:about URIs
#   nolabel     - "on" to suppress dc:description / dc:creator per item
$query = CGI->new;
$url   = $query->param('url');
$desc  = $query->param('description');
$title = $query->param('title');

# Both flags are true unless the corresponding checkbox was sent as "on".
$abs   = (!($query->param('relative') eq 'on'));
$label = (!($query->param('nolabel') eq 'on'));

# Trim stray whitespace that tends to come along with pasted URLs.
$url =~ s/^\s+|\s+$//g if defined $url;

# The URL is handed to LWP::Simple::get further down, which will happily
# follow file:// and other local schemes. Only accept remote schemes, and
# send the visitor back to the form for anything else (including a missing
# URL, as before).
if (!defined $url || $url !~ m{^(?:https?|ftp)://}i) {
  print $query->redirect($home);
  exit;
}

# Fetch the BibTeX source. LWP::Simple::get returns the whole document as a
# single string, or undef when the request fails, so the result has to be
# tested with defined(): assigning it to an array always yields one element
# and would hide the failure.
my $bibtex = get($url);

if (!defined $bibtex) {
  # The URL comes straight from the request; escape it before echoing it
  # back inside an HTML page.
  my $shown_url = $url;
  $shown_url =~ s/&/&amp;/g;
  $shown_url =~ s/</&lt;/g;
  $shown_url =~ s/>/&gt;/g;
  $shown_url =~ s/"/&quot;/g;

  print "Content-type: text/html\n\n";
  print "<html><body><h2>Error</h2>Cannot find <tt>$shown_url</tt>.</body></html>";
  exit;
}

# Strip carriage returns and make sure an "and" that ends a line is followed
# by a space (presumably so the name separator is still recognised when an
# author list is wrapped over several lines). The /m flag is required
# because the document is one string, not a list of lines.
$bibtex =~ s/\r//g;
$bibtex =~ s/\band$/and /mg;

# Spool the cleaned text to a temporary file for Text::BibTeX. The name is
# based on this process's own PID: the parent PID is shared by every request
# served by the same web server process and would let concurrent requests
# overwrite each other's file.
$fn = "tmp/" . $$ . ".bib";

my $bib_out;
if (!open($bib_out, '>', $fn)) {
  warn "bib2rss: cannot write $fn: $!\n";
  print "Status: 500 Internal Server Error\n";
  print "Content-type: text/html\n\n";
  print "<html><body><h2>Error</h2>Cannot store the BibTeX file.</body></html>";
  exit;
}
print $bib_out $bibtex;
close($bib_out) or warn "bib2rss: error closing $fn: $!\n";

print "Content-type: text/xml; charset=utf-8\n\n";

print <<EOH;
<?xml version="1.0" encoding="utf-8"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/"
		 xmlns:dc="http://purl.org/dc/elements/1.1/"
		 xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"
         xmlns:my="http://www.cs.vu.nl/~mcaklein/onto/swrc-ext/2005/05#"
         xmlns:foaf="http://xmlns.com/foaf/0.1/"
     	 xmlns:ow="http://swrc.ontoware.org/ontology#"
		 xmlns:xhtml="http://www.w3.org/1999/xhtml">
EOH

# Escape the characters that are special in XML text and attribute values.
# The channel metadata comes from the request, so it must not be able to
# break out of the element or attribute it is printed in. Bytes outside
# ASCII are passed through untouched.
my $escape_channel_xml = sub {
  my ($text) = @_;
  $text = '' unless defined $text;
  $text =~ s/&/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/"/&quot;/g;
  return $text;
};

my $channel_url = $escape_channel_xml->($url);

print "  <channel rdf:about=\"$channel_url\">\n";
print "    <title>" . $escape_channel_xml->($title) . "</title>\n";
print "    <link>$channel_url</link>\n";
print "    <description>" . $escape_channel_xml->($desc) . "</description>\n";


print "  <items>\n";
print "    <rdf:Seq>\n";

# First pass over the BibTeX file: list the URI of every regular entry in
# the channel's table of contents. Text::BibTeX::File->new returns false
# when the file cannot be opened, in which case the list stays empty.
$bibfile = Text::BibTeX::File->new($fn);

while ($bibfile && ($entry = Text::BibTeX::Entry->new($bibfile))) {
  next unless $entry->parse_ok;
  next unless $entry->metatype == BTE_REGULAR;

  # Colons in the citation key are replaced so that the key can be used as
  # a fragment identifier.
  $key = $entry->key();
  $key =~ s/:/_/g;

  my $item_uri = $escape_channel_xml->("$url#$key");
  print "      <rdf:li rdf:resource=\"$item_uri\"/>\n";

}
print "    </rdf:Seq>\n";
print "  </items>\n";

# Second pass over the BibTeX file: print one RSS <item> per regular entry,
# then close the document and remove the temporary file.
$bibfile = Text::BibTeX::File->new($fn);

# printLabel and printAuthors use the global $label as scratch space, so
# keep a private copy of the "emit labels" flag for the whole loop.
my $emit_labels = $label;

# Escape the characters that are special in XML text and attribute values;
# used for the item URIs, which are built from the request URL and the
# citation key.
my $escape_item_xml = sub {
  my ($text) = @_;
  $text = '' unless defined $text;
  $text =~ s/&/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/"/&quot;/g;
  return $text;
};

# First three letters of a month name => month number used in dc:date.
my %month_number = (
  'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
  'may' => '05', 'jun' => '06', 'jul' => '07', 'aug' => '08',
  'sep' => '09', 'oct' => '10', 'nov' => '11', 'dec' => '12',
);

while ($bibfile && ($entry = Text::BibTeX::Entry->new($bibfile)))
{
  next unless $entry->parse_ok;
  next unless $entry->metatype == BTE_REGULAR;

  @fields = $entry->fieldlist();
  # $type is not used by the RSS output below; it is still computed because
  # it is a package global.
  $type = field2tag($entry->type());
  $key = $entry->key();
  $key =~ s/:/_/g;

  my $item_uri = $escape_item_xml->("$url#$key");
  if ($abs)  {
    print "<item rdf:about=\"$item_uri\">\n";
  } else {
    print "<item rdf:ID=\"" . $escape_item_xml->($key) . "\">\n";
  }
  print "  <link>$item_uri</link>\n";

  # Reset everything printLabel / printAuthors read, so that nothing from
  # the previous entry (the abstract in particular) leaks into this one.
  undef @authors;
  undef @ids;
  undef $title;
  undef $journal;
  undef $volume;
  undef $number;
  undef $booktitle;
  undef $publisher;
  undef $month;
  undef $year;
  undef $abstract;

  # Collected while walking the fields and turned into dc:date afterwards,
  # so that the result does not depend on the order of the fields.
  my $date_year;
  my $date_month;

  for my $field (@fields) {

    my $value = $entry->get($field);
    next if !defined $value || $value eq "";

    if (!(($field eq 'url') ||
          ($field eq 'howpublished') ||
          ($field eq 'note'))) {
      $value = deTex($value);
    }

    # Clean-up tex in url-field. Literal braces are escaped in the
    # patterns because newer Perl releases no longer accept an unescaped
    # "{" after a literal character.
    if ($field eq 'url') {
      $value =~ s/\\~\{\}/~/g;
      $value =~ s/\"//g;
      if ($value =~ /\\url/) {
        $value =~ s/\\url\{//g;
        $value =~ s/\}$//g;
      }
    }

    # Cleanup month-field
    if ($field eq 'month') {
      # A month without any letters is assumed to be the day part of an
      # "oct" month whose macro expanded to nothing.
      if ($value !~ /[a-zA-Z]/) {
        $value = "October $value";
      } else {
        # Add space between month and day
        $value =~ s/(\D+)(\d.*)/$1 $2/g;
      }
    }

    # Encode XML specific characters
    $value = encodeXML($value);

    # Turn \url{} into HTML url's for abstracts and notes
    if (($field eq 'note') || ($field eq 'abstract')) {
      $value =~ s/\\url\{(.*)\}/<a href=\"$1\">$1<\/a>/g;
    }

    # The date is printed whether or not labels were requested.
    if ($field eq 'month') { $date_month = $value; }
    if ($field eq 'year')  { $date_year  = $value; }

    # Read values for the readable summary
    if ($emit_labels) {
      if ($field eq 'title') { $title  = $value; }
      if ($field eq 'journal') { $journal  = $value; }
      if ($field eq 'volume') { $volume  = $value; }
      if ($field eq 'number') { $number  = $value; }
      if ($field eq 'booktitle') { $booktitle  = $value; }
      if ($field eq 'publisher') { $publisher  = $value; }
      if ($field eq 'month') { $month  = $value; }
      if ($field eq 'year') { $year  = $value; }
      if ($field eq 'abstract') { $abstract = $value; }
    }

    # Parse individual names in author and editor fields
    if (($field eq 'author') || ($field eq 'editor')) {
      @names = $entry->split($field);
      if ($field eq 'author') { @authors = @names; }

      # Register every author/editor and remember the generated ID
      for my $name (@names) {
        $name =~ s/(^\s+|\s+$)//g;
        $name = deTex($name);
        my $id = "#" . storeName('person', $name);
        push @ids, $id;
      }
    } elsif ($field eq 'title') {
      $value = deTex($value);
      print "  <title>$value</title>\n";
    }
  }

  # dc:date is always the first day of the month; the month falls back to
  # January when it is missing or not recognised.
  if (defined $date_year) {
    my $month_digits = '01';
    if (defined $date_month && $date_month =~ /^\s*([A-Za-z]{3})/) {
      my $prefix = lc $1;
      $month_digits = $month_number{$prefix} if exists $month_number{$prefix};
    }
    print "  <dc:date>$date_year-$month_digits-01</dc:date>\n";
  }

  if ($emit_labels) {
    printLabel();
    printAuthors();
  }

  print "</item>\n\n";
}

# NOTE(review): RSS 1.0 expects the <item> elements to be siblings of
# <channel>, whereas this script closes <channel> only after the last item.
# Left as is because the opening tag is printed earlier in the script.
print "</channel>\n";
print "</rdf:RDF>\n";
unlink($fn);

# Subroutine to make all data PCDATA.
#
# Takes one string and returns a copy in which
#   - control characters that XML 1.0 does not allow (everything below
#     0x20 except tab, line feed and carriage return) are removed,
#   - "&", "<" and ">" are replaced by their entities,
#   - every remaining character outside 0x20-0x7F is written as a decimal
#     character reference ("&#NNN;").
# An undefined argument yields the empty string. The argument itself and
# the caller's variables are left untouched.
sub encodeXML {
  my ($text) = @_;
  return '' unless defined $text;

  # Remove characters that are illegal in XML even as a reference. This is
  # done on the raw text: the earlier pattern "&#[0-31];" was a character
  # class for the single digits 0-3 and therefore missed most of them.
  $text =~ s/[\x00-\x08\x0B\x0C\x0E-\x1F]//g;

  # XML encodings ("&" first, so the entities added below stay intact)
  $text =~ s/&/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;

  # Everything outside printable ASCII becomes a numeric reference
  $text =~ s/([^\x20-\x7F])/'&#' . ord($1) . ';'/ge;

  return $text;
}

# Subroutine to remove TeX specific characters.
#
# Takes one string and returns a cleaned copy:
#   - "~" (non-breaking space) becomes a plain space,
#   - accent commands \' and \", any other backslash, braces and a
#     trailing comma are removed,
#   - "<" and ">" are removed,
#   - "--" becomes "-",
#   - TeX quotes `` and '' become a double quote.
# An undefined argument yields the empty string. Works on a private copy,
# so neither the argument nor the caller's variables are modified.
sub deTex {
  my ($text) = @_;
  return '' unless defined $text;

  $text =~ s/~/ /g;
  $text =~ s/(?:\\'|\\"|\\|\{|\}|,$)//g;
  $text =~ s/[<>]//g;
  $text =~ s/--/-/g;
  $text =~ s/(?:``|'')/"/g;

  return $text;
}

# Subroutine to create stack of persons / individuals;
# returns a string that can be used as identifier in RDF.
#
# Arguments:
#   $_[0] - 'person' or 'organization'; selects the global hash
#           (%persons or %organizations) the name is recorded in. Any
#           other value records nothing.
#   $_[1] - the full name.
# The identifier is the name in lower case with whitespace, ":" and "/"
# turned into "_", the characters ; & , . # removed and everything outside
# 0x20-0x7F dropped. The hash maps that identifier to the XML-encoded name.
sub storeName {
  my ($kind, $full_name) = @_;

  # Local copies only: the caller's loop variable is aliased to its name
  # list, and assigning to a global of the same name would rewrite it.
  my $encoded_name = encodeXML($full_name);

  my $mkey = $full_name;
  $mkey =~ s/&amp;/and/g;
  $mkey =~ s/\s+/_/g;
  $mkey =~ s/(?::|\/)/_/g;
  $mkey =~ s/(?:;|&|,|\.|\#)//g;
  $mkey =~ tr/A-Z/a-z/;
  $mkey =~ s/[^\x20-\x7F]//g;

  # Insert a single key instead of rebuilding the whole hash each time.
  if ($kind eq 'person') {
    $persons{$mkey} = $encoded_name;
  } elsif ($kind eq 'organization') {
    $organizations{$mkey} = $encoded_name;
  }

  return $mkey;
}

# Subroutine to translate a BibTeX entry type into a class name in the
# "ow:" namespace.
#
# Takes the entry type (e.g. "phdthesis") and returns "ow:" followed by the
# name listed for it in %tags. Types without an entry in %tags get their
# first letter capitalised instead (e.g. "article" => "ow:Article").
sub field2tag {
  # this should be generalized to allow arbitrary translations
  my ($entry_type) = @_;
  $entry_type = '' unless defined $entry_type;

  # A lexical is used here on purpose: the main script has its own $field.
  my $tag = $tags{$entry_type};
  if (!defined $tag || $tag eq "") {
    $tag = ucfirst $entry_type;
  }

  return "ow:$tag";
}

# Subroutine to create a dc:description with a readable summary of
# the bibtex entry.
#
# Takes no arguments. Reads the globals filled in for the current entry
# (@authors, $title, $journal, $volume, $number, $booktitle, $publisher,
# $month, $year, $abstract) and prints one <dc:description> element.
#
# The summary is built in a lexical variable: the global $label is the
# "emit labels" flag of the main script and must survive this call.
# @authors is not modified either, so that printAuthors still sees the
# raw names and does not encode them a second time.
sub printLabel {
  my $summary = '';

  if (@authors) {
    my @shown_names = map { encodeXML(deTex($_)) } @authors;
    $summary = join(', ', @shown_names) . '. ';
  }

  $summary .= $title if ($title);
  $summary .= ". $journal" if ($journal);
  if ($volume) {
    $summary .= " $volume";
    $summary .= "($number)" if ($number);
  }
  $summary .= ". In: $booktitle" if ($booktitle);
  # Without a volume the issue number is spelled out instead.
  $summary .= ", number $number" if ($number && !($volume));
  $summary .= ", $publisher" if ($publisher);
  $summary .= ", $month" if ($month);
  $summary .= ", $year" if ($year);
  $summary .= "<xhtml:br/><xhtml:br/>\n$abstract" if ($abstract);

  print "  <dc:description>$summary</dc:description>\n";
}

# Subroutine to create a dc:creator with the author names.
#
# Takes no arguments. Reads the global @authors of the current entry and
# prints one <dc:creator> element with the names separated by ", " and
# terminated by ". " (the element is empty when there are no authors).
#
# The text is built in a lexical variable: the global $label is the
# "emit labels" flag of the main script, and leaving it undefined after an
# entry without authors would switch labels off for all following entries.
# The names are cleaned on copies, so @authors itself is left unchanged.
sub printAuthors {
  my $creators = '';

  if (@authors) {
    my @shown_names = map { encodeXML(deTex($_)) } @authors;
    $creators = join(', ', @shown_names) . '. ';
  }

  print "  <dc:creator>$creators</dc:creator>\n";
}
