/* ,file-id archive://[lord]/437/rx/posix.c/1998-05-18
 */
/*	Copyright (C) 1997 Tom Lord
 * 
 * This program is provided to you under the terms of the Liberty Software
 * License.  You are NOT permitted to redistribute, modify, or use it
 * except in very specific ways described by that license.
 *
 * This software comes with NO WARRANTY.
 * 
 * You should have received a copy of the Liberty Software License
 * along with this software; see the file =LICENSE.  If not, write to
 * the Tom Lord, 1810 Francisco St. #2, Berkeley CA, 94703, USA.  
 */





#include <ctype.h>
#include "vu/bitset.h"
#include "vu/dstr.h"
#include "vu/hashtab.h"
#include "rexp.h"
#include "errnorx.h"
#include "comp.h"
#include "nfa.h"
#include "dfa.h"
#include "super.h"
#include "unfa.h"
#include "match-regexp.h"
#include "posix.h"


/* Length threshold used by rx_regexec: when the span being searched
 * (end - start) is longer than this, rx_regexec builds a DFA for the
 * pattern and consults it at each candidate start position before
 * calling rx_regmatch.
 */
#define RX_MANY_CASES 30


/* regcomp takes a regular expression as a string and compiles it.
 *
 * PATTERN is the address of the pattern string.
 *
 * CFLAGS is a series of bits which affect compilation.
 *
 *   If REG_EXTENDED is set, we use POSIX extended syntax; otherwise, we
 *   use POSIX basic syntax.
 *
 *   If REG_NEWLINE is set, then . and [^...] don't match newline.
 *   Also, regexec will try a match beginning after every newline.
 *
 *   If REG_ICASE is set, then we consider upper- and lowercase
 *   versions of letters to be equivalent when matching.
 *
 *   If REG_NOSUB is set, then when PREG is passed to regexec, that
 *   routine will report only success or failure, and nothing about the
 *   registers.
 *
 * It returns 0 if it succeeds, nonzero if it doesn't.  (See regex.h for
 * the return codes and their meanings.)  
 */


/* regncomp compiles the LEN-byte regular expression at PATTERN into
 * PREG, under the control of CFLAGS.
 *
 * PREG is zeroed before anything else, so any previous contents are
 * discarded without being freed.
 *
 * Returns 0 on success.  Returns REG_ESPACE if the case-folding table
 * cannot be allocated, or the (nonzero) code reported by rx_parse,
 * with REG_ERPAREN folded into REG_EPAREN.
 *
 * On a parse failure the case-folding table allocated for REG_ICASE
 * is released here: callers are not expected to call regfree after a
 * failed compilation, so nobody else would free it.
 */
int
regncomp (regex_t * preg, const char * pattern, int len, int cflags)
{
  int ret;
  int extended_p;

  memset0 ((char *)preg, sizeof (*preg));
  extended_p = !!(cflags & REG_EXTENDED);

  if (!(cflags & REG_ICASE))
    preg->translate = 0;
  else
    {
      unsigned i;

      preg->translate = (unsigned char *) malloc (256);
      if (!preg->translate)
        return (int) REG_ESPACE;

      /* Map uppercase characters to corresponding lowercase ones.  */
      for (i = 0; i < 256; i++)
        preg->translate[i] = isupper (i) ? tolower (i) : i;
    }


  /* If REG_NEWLINE is set, newlines are treated differently.  */
  preg->newline_anchor = (cflags & REG_NEWLINE) ? 1 : 0;

  preg->no_sub = !!(cflags & REG_NOSUB);

  ret = rx_parse (&preg->pattern,
		  &preg->re_nsub,
		  pattern, len,
		  extended_p,
		  (cflags & REG_NEWLINE),
		  (cflags & REG_DFA_ONLY),
		  256,
		  preg->translate);

  /* POSIX doesn't distinguish between an unmatched open-group and an
   * unmatched close-group: both are REG_EPAREN.
   */
  if (ret == REG_ERPAREN)
    ret = REG_EPAREN;

  if (ret)
    {
      /* Don't leak the translation table when compilation fails.
       * The pointer is cleared so that a caller who does call regfree
       * anyway will not free it a second time.
       */
      if (preg->translate)
	{
	  free (preg->translate);
	  preg->translate = 0;
	}
      return (int) ret;
    }

  /* The subexpression count is restarted at 1 before the analysis
   * pass fills in the subexpression table.
   */
  preg->re_nsub = 1;
  preg->subexps = 0;
  rx_analyze_rexp (&preg->subexps, &preg->re_nsub, preg->pattern);
  preg->is_nullable = rx_fill_in_fastmap (256,
					  preg->fastmap,
					  preg->pattern);

  preg->is_anchored = rx_is_anchored_p (preg->pattern);

  return 0;
}


/* regtreecomp initializes PREG from an already-parsed expression tree
 * PATTERN instead of from pattern text.
 *
 * PREG is zeroed first.  Of CFLAGS, only REG_ICASE, REG_NEWLINE and
 * REG_NOSUB are examined here.  PATTERN is stored in PREG by
 * reference (it is not copied), so a later regfree (PREG) passes it
 * to rx_free_rexp.
 *
 * Returns 0 on success, or REG_ESPACE if the case-folding table
 * cannot be allocated (in which case PATTERN has not been stored).
 */
int
regtreecomp (regex_t * preg, struct rx_exp_node * pattern, int cflags)
{
  memset0 ((char *)preg, sizeof (*preg));

  if (!(cflags & REG_ICASE))
    preg->translate = 0;
  else
    {
      unsigned i;

      preg->translate = (unsigned char *) malloc (256);
      if (!preg->translate)
        return (int) REG_ESPACE;

      /* Map uppercase characters to corresponding lowercase ones.  */
      for (i = 0; i < 256; i++)
        preg->translate[i] = isupper (i) ? tolower (i) : i;
    }

  /* If REG_NEWLINE is set, newlines are treated differently.  */
  preg->newline_anchor = (cflags & REG_NEWLINE) ? 1 : 0;

  preg->no_sub = !!(cflags & REG_NOSUB);
  preg->pattern = pattern;

  /* Same post-parse analysis that regncomp performs.  */
  preg->re_nsub = 1;
  preg->subexps = 0;
  rx_analyze_rexp (&preg->subexps, &preg->re_nsub, preg->pattern);
  preg->is_nullable = rx_fill_in_fastmap (256,
					  preg->fastmap,
					  preg->pattern);
  preg->is_anchored = rx_is_anchored_p (preg->pattern);
  return 0;
}


/* regcomp compiles the 0-terminated regular expression PATTERN into
 * PREG by handing it, together with its length, to regncomp.
 */
int
regcomp (regex_t * preg, const char * pattern, int cflags)
{
  int pattern_len;

  /* POSIX patterns end at the first null character, so strlen gives
   * exactly the number of bytes to compile.
   */
  pattern_len = strlen (pattern);

  return regncomp (preg, pattern, pattern_len, cflags);
}




/* regerror produces the message corresponding to an error code,
 * ERRCODE, returned from either regcomp or regexec.
 *
 * The message is looked up in rx_error_msg; a null table entry is
 * reported as "Success".  PREG is not consulted.
 *
 * If ERRBUF is non-null and ERRBUF_SIZE is positive, the message is
 * copied into ERRBUF, truncated if necessary to fit, and always
 * 0-terminated.  Otherwise nothing is written.
 *
 * Returns the number of bytes needed to hold the complete message,
 * including the terminating 0, whether or not it was truncated.
 *
 * NOTE(review): ERRCODE indexes rx_error_msg directly and is not
 * range-checked here; the table's size is not visible in this file.
 */

int
regerror (int errcode, const regex_t *preg,
	  char *errbuf, int errbuf_size)
{
  const char *msg;
  int msg_size;

  msg = rx_error_msg[errcode] == 0 ? "Success" : rx_error_msg[errcode];
  msg_size = strlen (msg) + 1; /* Includes the 0.  */

  /* A negative size must be rejected as well as zero: it would turn
   * into an enormous count once converted for strncpy.
   */
  if (errbuf && (errbuf_size > 0))
    {
      if (msg_size > errbuf_size)
        {
          /* strncpy does not terminate a truncated copy itself.  */
          strncpy (errbuf, msg, errbuf_size - 1);
          errbuf[errbuf_size - 1] = 0;
        }
      else
        strcpy (errbuf, msg);
    }
  return msg_size;
}


/* rx_regmatch looks for a match of PREG that begins exactly at offset
 * START of STRING and ends no later than offset END.
 *
 * Candidate end positions are tried from the largest to the smallest,
 * and the first one for which rx_next_solution reports 1 is accepted.
 * If the pattern is absent only the empty span at START is tried; if
 * the pattern's `len' field is non-negative only START + len is
 * tried; otherwise every end from END down to START is tried.
 *
 * RULES is copied; only the copy's not_eol field is adjusted for each
 * candidate end position.
 *
 * On a match, if PMATCH is non-null, pmatch[0] receives START, the
 * matching end offset and the final tag of the solution.
 *
 * Returns 0 on a match, REG_NOMATCH if the last attempt reported 0
 * (or no attempt was possible), and REG_ESPACE if a solutions object
 * could not be created or the last attempt reported anything else.
 */
int
rx_regmatch (regmatch_t pmatch[],
	      const regex_t *preg,
	      struct rx_context_rules * rules,
	      int start,
	      int end,
	      const char *string)
{
  struct rx_solutions * solutions;
  int answer;
  struct rx_context_rules local_rules;
  int orig_end;
  int end_lower_bound;
  int end_upper_bound;
  
  local_rules = *rules;
  orig_end = end;

  /* If the loop below never runs, report "no match" rather than
   * switching on an indeterminate value.
   */
  answer = 0;

  if (!preg->pattern)
    {
      end_lower_bound = start;
      end_upper_bound = start;
    }
  else if (preg->pattern->len >= 0)
    {
      /* A pattern of known length can't match if fewer than that many
       * characters remain.  Without this check the single candidate
       * end would lie beyond END and the matcher (and the string[end]
       * tests below) would read past the caller's bound.
       */
      if ((end - start) < preg->pattern->len)
	return REG_NOMATCH;

      end_lower_bound = start + preg->pattern->len;
      end_upper_bound = start + preg->pattern->len;
    }
  else
    {
      end_lower_bound = start;
      end_upper_bound = end;
    }
  end = end_upper_bound;
  while (end >= end_lower_bound)
    {
      /* Recompute not_eol for this candidate end.  string[end] is
       * only examined when END is not the caller's original end (the
       * `||' and `&&' short-circuit otherwise).
       */
      local_rules.not_eol = (rules->not_eol
			     ? (   (end == orig_end)
				|| !local_rules.newline_anchor
				|| (string[end] != '\n'))
			     : (   (end != orig_end)
				&& (!local_rules.newline_anchor
				    || (string[end] != '\n'))));
      solutions = rx_basic_make_solutions (pmatch, preg->pattern, preg->subexps,
					   start, end, &local_rules, string);
      if (!solutions)
	return REG_ESPACE;
      
      answer = rx_next_solution (solutions);

      if (answer == 1)
	{
	  if (pmatch)
	    {
	      pmatch[0].rm_so = start;
	      pmatch[0].rm_eo = end;
	      pmatch[0].final_tag = rx_solutions_final_tag (solutions);
	    }
	  rx_basic_free_solutions (solutions);
	  return 0;
	}
      else
	rx_basic_free_solutions (solutions);

      --end;
    }

  /* Only the answer from the last candidate tried is examined here.  */
  switch (answer)
    {
    default:
      return REG_ESPACE;

    case 0:
      return REG_NOMATCH;
    }
}


/* rx_regexec searches STRING between offsets START and END for the
 * leftmost position at which PREG matches.
 *
 * Each offset X from START through END is considered as a starting
 * point if the pattern is nullable or the character at X is in the
 * pattern's fastmap.  When the span is longer than RX_MANY_CASES, a
 * DFA built from a simplified copy of the pattern is consulted first
 * and rx_regmatch is skipped for positions the DFA rules out.
 *
 * If the pattern is anchored, the search stops after the first
 * position unless newline anchoring is on, in which case it resumes
 * just after the next newline.
 *
 * Returns whatever rx_regmatch returns for the first position where
 * that is not REG_NOMATCH (0 means a match, with PMATCH filled in by
 * rx_regmatch); REG_ESPACE if the DFA cannot be built; REG_NOMATCH if
 * no position matches.
 *
 * NOTE(review): nothing in this function releases `machine' after
 * rx_init_dfa_from_rx; confirm whether the DFA needs explicit
 * cleanup.  rx_free_unfa is also called with a null pointer when no
 * DFA was built, as in the original code.
 */
int
rx_regexec (regmatch_t pmatch[],
	    const regex_t *preg,
	    struct rx_context_rules * rules,
	    int start,
	    int end,
	    const char *string)
{
  int x;
  int stat;
  int anchored;
  int use_dfa;
  struct rx_exp_node * simplified;
  struct rx_unfa * unfa;
  struct rx_dfa machine;

  anchored = preg->is_anchored;
  use_dfa = ((end - start) > RX_MANY_CASES);

  unfa = 0;
  if (use_dfa)
    {
      rx_simplify_rexp (&simplified, 256, preg->pattern, preg->subexps);
      unfa = rx_unfa (simplified, 256);
      if (!unfa)
	{
	  rx_free_rexp (simplified);
	  return REG_ESPACE;
	}
      rx_init_dfa_from_rx (&machine, unfa->nfa);
      rx_free_rexp (simplified);
    }

  for (x = start; x <= end; ++x)
    {
      if (preg->is_nullable
	  || ((x < end)
	      && (preg->fastmap[((unsigned char *)string)[x]])))
	{
	  if (use_dfa)
	    {
	      int amt;
	      int remaining;

	      /* The DFA scans from string + x, so the number of
	       * characters left is measured from X, not from START.
	       * (The two agree only when START is 0.)
	       */
	      remaining = end - x;

	      rx_dfa_goto_start_superstate (&machine);
	      amt = rx_dfa_advance_to_final (&machine, string + x, remaining);

	      /* Stopped early without a final tag: skip the full
	       * matcher for this starting position.
	       */
	      if (!machine.final_tag && (amt < remaining))
		goto nomatch;
	    }
	  stat = rx_regmatch (pmatch, preg, rules, x, end, string);
	  if (stat != REG_NOMATCH)
	    {
	      /* Either a match (0) or a hard error: report it.  */
	      rx_free_unfa (unfa);
	      return stat;
	    }
	}
    nomatch:
      if (anchored)
	{
	  if (!preg->newline_anchor)
	    {
	      rx_free_unfa (unfa);
	      return REG_NOMATCH;
	    }

	  /* Skip ahead to the next newline; the loop increment then
	   * steps past it.
	   */
	  while ((x < end) && (string[x] != '\n'))
	    ++x;
	}
    }
  rx_free_unfa (unfa);
  return REG_NOMATCH;
}



/* regexec searches for a given pattern, specified by PREG, in the
 * string STRING.
 *
 * If NMATCH is zero or REG_NOSUB was set in the cflags argument to
 * `regcomp', we ignore PMATCH.  Otherwise, we assume PMATCH has at
 * least NMATCH elements, and we set them to the offsets of the
 * corresponding matched substrings.
 *
 * EFLAGS specifies `execution flags' which affect matching: if
 * REG_NOTBOL is set, then ^ does not match at the beginning of the
 * string; if REG_NOTEOL is set, then $ does not match at the end.
 *
 * We return 0 if we find a match and REG_NOMATCH if not.  
 */

/* regnexec is regexec for a subject given as STRING plus an explicit
 * length LEN.
 *
 * PMATCH is the address of the caller's register array pointer.  If
 * PMATCH and *PMATCH are non-null and NMATCH is at least
 * preg->re_nsub, the caller's array is used directly.  Otherwise a
 * scratch array of preg->re_nsub entries is allocated, and on success
 * its first NMATCH entries are copied to the caller's array (when
 * there is one, NMATCH is positive and REG_NOSUB was not used).
 *
 * All registers in the working array are set to -1 before matching.
 *
 * If EFLAGS contains REG_ALLOC_REGS and the match succeeds, *PMATCH
 * is set to the working array, which may be a freshly malloc'd one.
 * In every other case a scratch array is freed before returning.
 *
 * Returns the status from rx_regexec (0 for a match), or REG_ESPACE
 * if the scratch array cannot be allocated.
 *
 * PMATCH itself, or *PMATCH, may be null; neither is dereferenced in
 * that case.
 */
int
regnexec (const regex_t *preg,
	  const char *string,
	  int len,
	  int nmatch,
	  regmatch_t **pmatch,
	  int eflags)
{
  int want_reg_info;
  struct rx_context_rules rules;
  regmatch_t * caller_regs;
  regmatch_t * regs;
  int regs_allocated;
  int nregs;
  int stat;
  int x;

  want_reg_info = (!preg->no_sub && (nmatch > 0));

  rules.newline_anchor = preg->newline_anchor;
  rules.not_bol = !!(eflags & REG_NOTBOL);
  rules.not_eol = !!(eflags & REG_NOTEOL);
  rules.case_indep = !!(eflags & REG_ICASE);

  caller_regs = pmatch ? *pmatch : 0;

  if (caller_regs && (nmatch >= preg->re_nsub))
    {
      regs = caller_regs;
      nregs = nmatch;
      regs_allocated = 0;
    }
  else
    {
      /* The caller's array is missing or too small to hold every
       * register, so work in a scratch array.
       */
      regs = (regmatch_t *)malloc (preg->re_nsub * sizeof (*regs));
      if (!regs)
	return REG_ESPACE;
      nregs = preg->re_nsub;
      regs_allocated = 1;
    }

  for (x = 0; x < nregs; ++x)
    regs[x].rm_so = regs[x].rm_eo = -1;

  stat = rx_regexec (regs, preg, &rules, 0, len, string);

  /* Copy back only what the caller's array has room for.  */
  if (!stat && want_reg_info && caller_regs && regs_allocated)
    {
      for (x = 0; x < nmatch; ++x)
	caller_regs[x] = regs[x];
    }

  if (!stat && pmatch && (eflags & REG_ALLOC_REGS))
    *pmatch = regs;
  else if (regs_allocated)
    free (regs);
  
  return stat;
}

/* regexec matches PREG against the 0-terminated STRING by handing it,
 * together with its length, to regnexec.
 *
 * REG_ALLOC_REGS is removed from EFLAGS before the call, and the
 * address of this function's own PMATCH parameter is what regnexec
 * receives.
 */
int
regexec (const regex_t *preg,
	 const char *string,
	 int nmatch,
	 regmatch_t pmatch[],
	 int eflags)
{
  int string_len;
  int exec_flags;

  string_len = strlen (string);
  exec_flags = (eflags & ~REG_ALLOC_REGS);

  return regnexec (preg, string, string_len, nmatch, &pmatch, exec_flags);
}


/* Free dynamically allocated space used by PREG.  */

/* regfree releases the expression tree, the subexpression table and
 * the translation table held by PREG, and clears each of those
 * fields.  PREG itself is not freed.
 */
void
regfree (regex_t *preg)
{
  /* The tree is released with rx_free_rexp; the two tables with free.
   * A field that is already null is left alone.
   */
  if (preg->pattern != 0)
    rx_free_rexp (preg->pattern);
  preg->pattern = 0;

  if (preg->subexps != 0)
    free (preg->subexps);
  preg->subexps = 0;

  if (preg->translate != 0)
    free (preg->translate);
  preg->translate = 0;
}
