#!/usr/bin/awk -f
#****************************************************************************
#  ##   ##         #####   #####  ##     **       NoSQL RDBMS - env2sed     *
#  ###  ##        ####### ####### ##     **        $Revision: 1.1.1.1 $       *
#  #### ##        ###     ##   ## ##     ************************************
#  #######  ####  #####   ##   ## ##     **   Carlo Strozzi (c) 1998-2000   *
#  ####### ######   ##### ## # ## ##     ************************************
#  ## #### ##  ##     ### ##  ### ##     **           Written by            *
#  ##  ### ###### ####### ######  ###### **          Carlo Strozzi          *
#  ##   ##  ####   #####   #### # ###### **     e-mail: carlos@linux.it     *
#****************************************************************************
#   NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.                          *
#   This program comes with ABSOLUTELY NO WARRANTY; for details             *
#   refer to the GNU General Public License.                                *
#****************************************************************************
# NOTE: to edit, set ts=8 in 'vi' (or equivalent)
#       to print, pipe through 'pr -t -e8'
#****************************************************************************
#
# Converts environment variables into sed(1) replacement statements.
#
# Usage:  env2sed
# 
# Note: options must be passed through the environment variable _awk_args,
#       i.e.: _awk_args='[options]'
#
# Options:
#     -m|--match R
#           Only do those variables that match the given regular expression.
#           R must be a valid awk(1) pattern, without surrounding slashes.
#
#     -d|--delete R
#           Delete anything that matches the regular expression R in variable
#           values. R must be a valid awk(1) pattern, without surrounding
#           slashes. If both '-d' and '-b' are specified, they can affect
#           each other, in that '-d' is done before '-b'.
#
#     -b|--blank R
#           Anything that matches the regular expression R in variable
#           values is replaced with one single blank. R must be a valid
#           awk(1) pattern, without surrounding slashes. If both '-d' and
#           '-b' are specified, they can affect each other, in that '-d'
#           is done before '-b'.
#
#     -c|--cgi
#           Reverse the encoding done by the 'cgi2*' operator on Common
#           Gateway Interface (CGI) variables. This is done on what is
#           left after any '-b' and '-d' processing.
#
#     -x|--debug
#           Print command-line regular expressions to STDERR.
#
#     -u|--unescape
#           Turn NoSQL escapes \t and \n into horizontal tabs and newlines
#           respectively. This is not done by default, as the environment
#           is already supposed to contain those characters in their
#           unescaped form.
#
#     -s|--strip-names R
#           Strip anything that matches the regular expression R from
#           variable _names_. This can be useful, for instance, to remove
#           the leading 'WWW_' from CGI variables set by the 'uncgi' program.
#
#     -p|--prefix P
#           Prefix the left-hand side of each output assignment
#           with the string 'P'.
#
#     -h|--html
#           The output data must be suitable for inclusion in an HTML
#           page. This causes TABs and newlines to be converted into
#           the ASCII escapes &#9; and &#10; respectively.
#
#     -C|--strip-comments
#          Print the statements necessary to remove comment-lines,
#          i.e. lines that start with a '#' character.
#
# Environment variable names that do not match the regular expression
# /^[A-Za-z0-9_]+$/ are skipped, and a warning message is printed
# to STDERR.
#
# The program operates on environment variables rather than on STDIN,
# and this makes it possible to operate on variables that contain physical
# newlines and tabs rather than their NoSQL escapes \n and \t.
#
# Hint: to pass extra commands to sed(1) in one single invocation you
# can use the form:
#
#	     export _awk_args='[options]'
#            sed -e "$(env2sed)" -e "..." ...
#
# Warning: not all shells are robust enough to allow for large amounts
# of data in one single shell variable or string. If that's the case,
# then it is preferable to change the above usage example as follows:
#
#	     env2sed > temp_file
#	     echo another_sed_command >> temp_file
#	     echo ... and_so_on ... >> temp_file
#
#	     sed -f temp_file
#
#****************************************************************************
# NOTE: to edit, set ts=8 in 'vi' (or equivalent)
#       to print, pipe through 'pr -t -e8'
#****************************************************************************

BEGIN {
  NULL = ""; OFS = "\t"

  # Options are read from the environment variable _awk_args rather than
  # from the command line, as otherwise escapes must be doubled,
  # i.e. \n -> \\n, \t -> \\t, etc.

  split( ENVIRON["_awk_args"], args, " " )

  # Walk the whitespace-separated option words. Options that take a value
  # consume the word that follows them. Unknown words are ignored.
  while ( args[++i] != NULL )
  {
    if ( args[i] == "-m" || args[i] == "--match" ) m_pattern = args[++i]
    else if ( args[i] == "-d" || args[i] == "--delete" )
    {
      remove = 1; d_pattern = args[++i]
    }
    else if ( args[i] == "-b" || args[i] == "--blank" )
    {
      blank = 1; b_pattern = args[++i]
    }
    else if ( args[i] == "-x" || args[i] == "--debug" ) debug = 1
    else if ( args[i] == "-c" || args[i] == "--cgi" ) cgi = 1
    else if ( args[i] == "-u" || args[i] == "--unescape" ) unescape = 1
    else if ( args[i] == "-s" || args[i] == "--strip-names" )
    {
      strip_names = 1; s_pattern = args[++i]
    }
    else if ( args[i] == "-p" || args[i] == "--prefix" ) prefix = args[++i]
    else if ( args[i] == "-h" || args[i] == "--html" ) html = 1
    else if ( args[i] == "-C" || args[i] == "--strip-comments" )
    {
      no_comments = 1
    }
  }

  if ( debug )
  {
    # The raw option string lives in the environment, not in an awk
    # variable, so it must be fetched through ENVIRON.
    print "args:       " ENVIRON["_awk_args"] > "/dev/stderr"
    print "-m pattern: " m_pattern    > "/dev/stderr"
    print "-d pattern: " d_pattern    > "/dev/stderr"
    print "-b pattern: " b_pattern    > "/dev/stderr"
    print "-s pattern: " s_pattern    > "/dev/stderr"
  }

  # Without '-m' every variable name is a candidate.
  if ( m_pattern == NULL ) m_pattern = ".*"

  # Handle comments.
  if ( no_comments ) printf("/^#.*$/d\n")

  for ( env in ENVIRON )
  {
    if ( env !~ m_pattern ) continue

    # Always skip rc(1) functions, options and NoSQL-specific stuff.
    if ( env ~ /^fn_/ || env ~ /^_nosql_/ || env == "_awk_args" ) continue

    # Skip invalid variable names. They may occur when we receive
    # them from a WWW Browser.
    if ( env !~ /^[A-Za-z0-9_]+$/ )
    {
      print "env2sed: bad variable name " env > "/dev/stderr"
      continue
    }

    # Honour the '-s' switch.
    dd = env
    if ( strip_names ) gsub( s_pattern, NULL, dd )

    # In case we stripped the whole name.
    if ( dd == NULL ) continue

    var = ENVIRON[ env ]

    # Honour '-d' and '-b' first.
    if ( remove ) gsub( d_pattern, NULL, var)
    if ( blank ) gsub( b_pattern, " ", var)

    if ( cgi )
    {
      # Revert the encoding done by the 'cgi2sh' utility.
      # These codes must appear exactly in the same order as they
      # appear inside the 'rewind' option of that utility.

      gsub( "&#9;", "\t", var )                # tab
      gsub( "&#10;", "\n", var )               # newline
      gsub( "&#39;", "'", var )                # single quote
      gsub( "&#96;", "`", var )                # backtick
      gsub( "&#34;", "\"", var )               # double quote
      gsub( "&#62;", ">", var )                # Close tag
      gsub( "&#60;", "<", var )                # Open tag
      gsub( "&#35;", "#", var )                # Hash mark

      # "\\&" reaches gsub() as \& which stands for a literal ampersand.
      # A single-backslash "\&" is an undefined escape: some awks reduce
      # it to a bare &, which gsub() expands to the matched text.
      gsub( "&amp;", "\\&", var )              # Ampersand
    }

    # Unescape NoSQL special characters if requested.
    if ( unescape ) var = NoSQL_Unescape( var )

    # Escape TABs and newlines in output for HTML if requested.
    # The leading \\& yields a literal '&' in the replacement text.
    if ( html ) {
       gsub( /\t/, "\\&#9;", var )
       gsub( /\n/, "\\&#10;", var )
    }

    # Apply actual sed(1) escaping. Do '\' first!

    var = Mawk_Bug( var )
    gsub( "\n", "\\\n", var )
    gsub( "&", "\\\\&", var )
    gsub( "#", "\\\\#", var )
    gsub( "/", "\\\/", var )

    # Add more sed(1) escapes here if necessary.

    # One substitution statement per variable: s/<prefix><name>/<value>/g
    printf("s/%s%s/%s/g\n", prefix, dd, var)
  }
}

########################################################################
# NoSQL_Unescape(string)
#
# Takes a string and translates any unescaped '\t' and '\n' strings into
# physical tabs and newlines respectively. Returns the converted string.
########################################################################
function NoSQL_Unescape(s,		S,i,s_length,c,escaped) {
  # Walk the string one character at a time with substr(). Splitting on
  # a null separator is not specified by POSIX awk, so it is avoided.
  #
  # The loop deliberately runs one position past the end of the string:
  # the extra pass sees an empty character and flushes a pending lone
  # backslash that ends the input.
  s_length = length(s) + 1
  for ( i = 1; i <= s_length; i++ ) {
    c = substr(s, i, 1)

    # A backslash that is not itself escaped opens an escape sequence.
    if ( c == "\\" && !escaped ) { escaped = 1; continue }

    # \n and \t become a physical newline and tab respectively.
    if ( c == "n" && escaped ) { S = S "\n"; escaped = 0; continue }
    if ( c == "t" && escaped ) { S = S "\t"; escaped = 0; continue }

    # Any other escaped character is emitted unchanged, backslash included.
    if ( escaped ) { S = S "\\" c; escaped = 0; continue }

    S = S c
  }
  return S
}

########################################################################
# Mawk_Bug(string)
#
# Takes a string and turns all '\' characters into their escaped form
# '\\'. Returns the escaped string. This could be done with just a gsub(),
# but mawk(1) has a bug that makes it behave differently from other awk
# implementations:
#
# gsub( /\\/, "\\\\", field )		# This works with both gawk(1)
#					# and the original nawk(1).
#
# gsub( /\\/, "\\\\\\", field )		# This works just with mawk(1),
#					# otherwise it produces more
#					# backslashes than necessary,
#					# which looks rather obvious.
#
########################################################################
function Mawk_Bug( s,		out,pos ) {

   # Scan for each backslash in turn: copy the text that precedes it,
   # append a doubled backslash, then carry on with the remainder.
   # No gsub() is involved, so the result does not depend on how a given
   # awk implementation interprets backslashes in replacement strings.
   out = ""
   while ( ( pos = index( s, "\\" ) ) > 0 ) {
      out = out substr( s, 1, pos - 1 ) "\\\\"
      s = substr( s, pos + 1 )
   }

   # Whatever is left contains no backslash.
   return out s
}

########################################################################
# End of program.
########################################################################

