#!/usr/bin/env python
#---
# $Id: dict2db,v 1.7 2003/12/09 05:11:00 elzubeir Exp $
#
# ------------
# Description:
# ------------
#
# Convert the Buckwalter data sets (stems, prefixes, suffixes) to dbm
# database files, one "<name>db" file per data set.
#
# (C) Copyright 2003, Arabeyes, Mohammed Elzubeir
# -----------------
# Revision Details:    (Updated by Revision Control System)
# -----------------
#  $Date: 2003/12/09 05:11:00 $
#  $Author: elzubeir $
#  $Revision: 1.7 $
#  $Source: /home/arabeyes/cvs/projects/duali/pyduali/dict2db,v $
#
#  This program is written under the BSD License.
#---

import sys, os, string, getopt, anydbm 

# Name this script was invoked as: directory and file extension removed.
# Shown in the banner and in the usage message.
scriptname = os.path.basename(os.path.splitext(sys.argv[0])[0])
# Version string; the $Revision$ keyword is expanded by the revision control
# system.
scriptversion = '$Revision: 1.7 $'

def chomp(s):
  """
  Return s without its final newline character, if it has one.
  Only a single trailing '\\n' is removed; any other trailing character
  (including '\\r') is left in place.
  """
  if (not s.endswith('\n')):
    return s
  return s[:-1]
def getGlossPOS(s):
  """
  Break up the glosspos into a tuple, first the pos then the gloss.

  The pos is the text enclosed in <pos>...</pos>; the gloss is everything
  that precedes the opening tag. Both are stripped of surrounding whitespace.
  If there is no <pos> tag, return an empty string and the gloss unchanged.
  If the closing </pos> tag is missing, everything after <pos> is the pos.
  """
  open_tag = '<pos>'
  start = s.find(open_tag)
  # find() returns -1 when the tag is absent and 0 when it is the very first
  # thing in the string, so the result must be compared against -1 rather
  # than tested for truthiness.
  if (start == -1):
    return ('', s)
  end = s.find('</pos>', start)
  if (end == -1):
    end = len(s)
  # 'end' already points at the first character of '</pos>', so the pos text
  # is exactly the slice between the end of the opening tag and 'end'.
  return (s[start+len(open_tag):end].strip(), s[:start].strip())

def createDictDB(filename):
  "Create bsddb hash dictionary"

  w_stem = ''

  try:
    lines = open(filename, 'r').readlines()
  except IOError:
    print "Unable to read '%s'" % filename
    sys.exit(0)

  dbfile = "%sdb" % filename
  dict = anydbm.open(dbfile, 'n')

  print "\nWriting %s..." % (filename),
  for line in lines:
    line = chomp(line)
    if (line.startswith(';;')):
      dict[line[3:]] = "\t\t\t\t"
      w_stem = line[3:]
    if (line.startswith(';')):
      pass
    else:
      nsplit = len(line.split('\t'))
      if (nsplit < 4 or nsplit > 4):
        print "\nFatal error in file: %s\nline: %s" % (filename, line)
        sys.exit(1)
      (w_vanilla, w_full, w_cat, w_glossPOS) = line.split('\t')
      w_pos, w_gloss = getGlossPOS(w_glossPOS)
      if (len(w_pos)==0):
        if (w_cat.startswith('Pref-0') 
          or w_cat.startswith('Suff-0')):
          w_pos = ""
        elif (w_cat.startswith('F')):
          w_pos = "%s/FUNC_WORD" % w_full
        elif (w_cat.startswith('IV')):
          w_pos = "%s/VERB_IMPERFECT" % w_full
        elif (w_cat.startswith('PV')):
          w_pos = "%s/VERB_PERFECT" % w_full
        elif (w_cat.startswith('CV')):
          w_pos = "%s/VERB_IMPERATIVE" % w_full
        elif (w_cat.startswith('N')):
          w_pos = "%s/NOUN" % w_full # needs review here
        else:
          print "Fatal error has occurred parsing %s" \
              % filename
          print "line: %s" % line
          sys.exit(1)
      try:
        dict[w_vanilla] = "%s\t%s\t%s\t%s\t%s" % (w_full, w_cat, w_gloss, w_pos,
                                                  w_stem)
      except:
        print "\nWarning: Failed to index \'%s\' - blame MS-Windows ;)" \
              % w_gloss
  if os.uname()[0] == 'FreeBSD':
    pass
  else:
    dict.sync()

def usage():
  "Display usage options"

  print "(C) Copyright 2003, Arabeyes, Mohammed Elzubeir\n"
  print "Usage: %s [OPTIONS]" % scriptname
  print "\t[-h | --help           ]\toutputs this usage message"
  print "\t[-v | --version        ]\tprogram version"
  print "\t[-p | --path           ]\tpath to dictionary database"
  print "\r\nThis program is licensed under the BSD License\n"

def grabargs():
  """
  Grab command-line arguments from sys.argv.

  Recognised options:
    -h, --help       print the usage message and exit with status 0
    -v, --version    print the script version and exit with status 0
    -p, --path PATH  directory holding the dictionary files

  Returns the value given to -p/--path, or '' if the option was not used.
  An unrecognised option (or a missing option argument) is reported on
  stderr, followed by the usage message, and the program exits with
  status 2.
  """
  path = ''

  try:
    opts, args = getopt.getopt(sys.argv[1:], "hvp:",
                               ["help", "version", "path="])
  except getopt.GetoptError:
    # Say what was wrong with the command line and signal failure to the
    # caller; a usage error must not exit with a success status.
    sys.stderr.write("%s\n" % sys.exc_info()[1])
    usage()
    sys.exit(2)
  for o, val in opts:
    if o in ("-h", "--help"):
      usage()
      sys.exit(0)
    if o in ("-v", "--version"):
      print(scriptversion)
      sys.exit(0)
    if o in ("-p", "--path"):
      path = val
  return path


def main():
  "Main function"

  print "%s - %s" % (scriptname, scriptversion)
  path = grabargs()

  stems = "stems"
  prefixes = "prefixes"
  suffixes = "suffixes"

  if not path:
    if not os.path.exists(path):
      print "Path does not exist!"
      sys.exit(0)
  else:
    stems = os.path.join(path, stems)
    prefixes = os.path.join(path, prefixes)
    suffixes = os.path.join(path, suffixes)
  
  createDictDB(stems)
  createDictDB(prefixes)
  createDictDB(suffixes)

  sys.exit(0)

# Run the conversion only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
  main()

