#
# bibliography package for Perl
#
# Medline
#
# Dana Jacobsen (dana@acm.org)
# 22 January 1995 (last modified 17 January 1996)
#
# Note that there are many, many variations of the format called "medline".
# This currently reads the MEDLARS format used by Entrez (available at
# <http://atlas.nlm.nih.gov:5700/Entrez/index.html>).  I believe it also
# handles BRS MedLine.
#
# It does NOT understand MELVYL MEDLINE D TAG format, since that format
# is incompatable with MEDLARS.  It looks very similar, but uses different
# tag names for each field.  It should not be hard to modify this to read
# that format.
#

package bp_ISI;

$version = "ISI (ml 24 Apr 2000)";

######

&bib'reg_format( #'
  'ISI',    # name
  'ISI',        # short name
  'bp_ISI', # package name
  'none',       # default character set
  'suffix is med',
# our functions
  'options',
  'open is standard',
  'close is standard',
  'read',
  'write is standard',
  'clear is standard',
  'explode',
  'implode is unsupported',
  'tocanon',
  'fromcanon is unsupported',
);

######

$opt_html = 0;

######

sub options {
  local($opt) = @_;

  &bib'panic("ISI options called with no arguments!") unless defined $opt; #'
  &bib'debugs("parsing ISI option '$opt'", 64); #'
  return undef unless $opt =~ /=/;
  local($_, $val) = split(/\s*=\s*/, $opt, 2);
  &bib'debugs("option split: $_ = $val", 8); #'
  /^html$/       && do { $opt_html = &bib'parse_num_option($val); #'
                         return 1; };
  undef;
}

######

# We have our own read routine because we would like to handle the case
# of HTML output from Entrez.  For example, turn on the HTML option, then
# it can parse the output of:
#   <http://atlas.nlm.nih.gov:5700/htbin-post/Entrez/query?
#    db=m&form=4&term=ras&field=word&dispmax=10&dopt=l&title=no>
# directly.  Unfortunately, we have to do this specially since they don't
# put blank lines between entries.

sub read {
  local($file) = @_;
  local($record);

  &bib'debugs( #'
      "reading $file<$bib'glb_current_fmt>", 32); #'

  if ($opt_html) {
    local($/) = '</pre>';
    $record = scalar(<$bib'glb_current_fh>); #'
    if( !($record =~ /<pre>/i) ) {
      $record = scalar(<$bib'glb_current_fh>); #'
    }
    $record =~ s/^<HR>\s*//;
    $record =~ s/.*<pre>\s*//;
    # Check for the last part of the file.  If we think we found it,
    # read again.  This should yield an eof.
  } else {
    # read a paragraph
#    local($/) = 'ER ';
    while( <$bib'glb_current_fh> ) #'
    {
       if( /^ER/ ) {
	  last ;
       }
       next if ( /^\s*$/ ) ;
       $record .= $_ ;
    }
  }
  $record;
}

######

####
# find matching closing >. returns two strings, the part inside the brakets,
# and the part after the closing braket.
sub matching
{
   my ($op,$cl,$str) = @_ ;
   my $rstr = "" ;
   my ($s1, $s2, $s3, $rest) ;
   if( $str =~ /^([^$op$cl]*)$op([.\n])*/ )  {
     $s1 = $1 ;
     ($s2, $rest) = matching($op,$cl,$2.$' #'
                            ) ; 
     ($s3, $rest) = matching($op,$cl,$rest) ;
     if( defined $s2 && $s2 ne "" && defined $s3 && $s3 ne "" ) {
     return ($s1.$op.$s2.$cl.$s3,$rest) ;}
   } elsif ( $str =~ /^([^$op$cl]*)$cl([.\n]*)/ ) {
     return ($1,
             $2.$'  #'
	    ) ; 
   }
   return  ;
}


sub explode {
  local($_) = @_;
  local(%entry) = ();
  local($val);

  local($field) = undef;
  local(@lines) = split(/\n/);
  my $prev ;

  foreach (@lines) {
    if( defined $prev ) {
       $cur = $prev . " " . $_ ;
    } else {
       $cur = $_ ;
       
    }
    doAgain:
    undef $prev ;
    if ( $cur =~/^<title>.*<\/title>$/) {
      next if $opt_html;
      # We could guess that it's html and change options here.
    }
    if ($opt_html) {
#      $cur =~ s/^<pre>\s*//i;
      if( $cur =~ /^\s*</ ) {
         my $s2,$s3 ;
	 $s2 = $' ; #'
	 ($s3,$prev) = matching("<",">",$s2) ;
	 if( !defined $s3) {
	   $prev = $cur ;
	 } else {
	   $cur=$prev;
	   goto doAgain;
	 }
	 next ;
      }
      next if $cur =~ /^\s*$/;
    }
    if (!($cur =~ /^[A-Z]{2}\s+/)) {
      if( !defined($field) ) {
          &bib'gotwarn("ISI explode--Problems parsing entry: $cur||") ; #'
	  next;
      }
      $cur =~ s/^\s+//;
      if( $field eq "AU" ) {
        $entry{$field} .= ";" . $cur;
      } else {
        $entry{$field} .= " " . $cur;
      }
      next;
    }
    if ($cur =~ /^[A-Z]{2}\s+/) {
      ($field, $val) = ($cur =~ /^([A-Z]+)\s+(.*)/);
      if (defined $entry{$field}) {
        $entry{$field} .= $bib'cs_sep . $val; #'
      } else {
        $entry{$field} = $val;
      }
      next;
    }
    next if $cur =~ /^\d+$/;   # RefMan puts numbers here
    &bib'gotwarn( #'
      "ISI explode--can't parse: $cur"); #'
  }
  %entry;
}

######


sub implode {
  local(%entry) = @_;
  return &bib'goterror("ISI implode isn't supported."); #'"
}

######

# We want to check for any fields we don't recognize, because we don't
# have documentation on the format, so there may be something important
# being missed.

%med_to_can_fields = (
  'ID', 'Keywords',
  'C1', 'AuthorAddress',
  'TI', 'Title',
  'AB', 'Abstract',
  'SE', 'SuperTitle',
  'SO', 'Journal',
  'PU', 'Publisher',
  'PY', 'Year',
  'PD', 'Month',
#  'PG', 'Pages',
  'VL', 'Volume',
  'IS', 'Number',
  'WP', 'url',
#  'URLS', 'urls',
   'LA', 0,
   'DT', 0,
   'NR', 0,
   'SN', 0,
   'CR', 0,
   'TC', 0,
   'PG', 0,
   'JI', 0,
   'PN', 0,
   'SI', 0,
   'GA', 0,
   'PI', 0,
   'RP', 0,
   'CP', 0,
   'J9', 0,
   'PA', 0,
   'ER', 0,
   'UT', 0
);


sub tocanon {
  local(%entry) = @_;
  local(%can);
  local($type, $field);

  # AU
  if (defined $entry{'AU'}) {
    local($n);
    $can{'Authors'} = '';
    foreach $n (split(";", #'
                     $entry{'AU'})) 
    { 
        $can{'Authors'} .= $bib'cs_sep . &ISIname_to_canon($n); #'
    }
    $can{'Authors'} =~ s/^$bib'cs_sep//; #'
  }

  
  if( $entry{PT} =~ /^[Jj]/ ) {
    $type = 'article' ;
  } elsif( $entry{PT} =~ /^[Bb]/ ) {
    $type = 'inbook' ;
  } else {
    $type = 'article' ;
  }

  delete $entry{PT} ;

  $can{'CiteType'} = $type;

  delete $entry{AU};
  delete $entry{ED};

  if( defined $entry{'BP'} ) {
    $can{'Pages'} = $entry{'BP'} ;
    if( defined $entry{'EP'} ) {
      $can{'Pages'} .= "-$entry{'EP'}" ;
    }
  }
  delete $entry{BP} ;
  delete $entry{EP} ;

  if( defined $entry{DE} ) {
     $entry{ID} .= " ".$entry{DE} ;
  }
  delete $entry{DE} ;


  if( defined $entry{BS} ) {
     $entry{SE} .= " " . $entry{BS} ;
  }
  delete $entry{BS} ;

  if( defined $entry{FN} ) { #'
     if( !($entry{FN} =~ /ISI\s*Export\s*Format/) ) {
        &bib'gotwarn("ISI: format is: $entry{'FN'}") ; #'
     }
     if( defined $entry{VR} && $entry{VR} ne "1.0" ) {
	&bib'gotwarn("ISI: version is $entry{'VR'}") ; #'
     }
#     $can{'OrigFromat'} = $entry{FN}." ".$entry{VR} ;
   }
   delete $entry{FN} ;
   delete $entry{VR} ;

  foreach $field (keys %entry) {
    if (!defined $med_to_can_fields{$field}) {
      &bib'gotwarn("ISI: Unknown field: $field"); #'
    } elsif ($med_to_can_fields{$field}) {
      $can{$med_to_can_fields{$field}} = $entry{$field};
    }
  }

  %can;
}

# takes a SO entry and splits it into seperate fields
sub parseSO {
  local($journal, $year, $month, $volume, $pages);

  if (! (($journal, $year, $month, $volume, $pages)
          = $entry{SO} =~ /(.*)\s+(\d\d\d\d)\s*(.*);(.*):(.*)$/) ) {
    return &bib'gotwarn("Couldn't parse SO field: $entry{SO}");
  }
  $entry{TA} = $journal unless $journal =~ /^\s*$/;
  $entry{DP} = $year;
  $entry{DP} .= " $month" unless $month =~ /^\s*$/;
  if (defined $volume) {
    if ( !(($entry{VI}, $entry{IP}) = $volume =~ /(\d*)\s*\((\d*)\)/) ) {
      $entry{VI} = $volume;
    }
  }
  $entry{PG} = $pages unless $pages =~ /^\s*$/;
}

sub ISIname_to_canon {
  local($name) = @_;
  local($last, $von, $first, $cname);

  ($last, $first) = $name =~ /(.*)\s*,\s*([A-Z]*)$/;
  $last = '' unless defined $last;
  $first = '' unless defined $first;
  $first =~ s/([A-Z])/$1. /g;
  $first =~ s/\s+$//;
  $von = '';
  # (the von processing is from name_to_canon in bp-p-utils.pl)
  while ($last =~ /^([a-z]+)\s+/) {
    $von .= " $1";
    substr($last, 0, length($1)+1) = '';
  }
  $von =~ s/^ //;
  
  $cname = join( $bib'cs_sep2, $last, $von, $first, '');
  $cname;
}

######


#######################
# end of package
#######################

1;
