<?php
/* ******************************************************************** */
/* CATALYST PHP Source Code                                             */
/* -------------------------------------------------------------------- */
/* This program is free software; you can redistribute it and/or modify */
/* it under the terms of the GNU General Public License as published by */
/* the Free Software Foundation; either version 2 of the License, or    */
/* (at your option) any later version.                                  */
/*                                                                      */
/* This program is distributed in the hope that it will be useful,      */
/* but WITHOUT ANY WARRANTY; without even the implied warranty of       */
/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the        */
/* GNU General Public License for more details.                         */
/*                                                                      */
/* You should have received a copy of the GNU General Public License    */
/* along with this program; if not, write to:                           */
/*   The Free Software Foundation, Inc., 59 Temple Place, Suite 330,    */
/*   Boston, MA  02111-1307  USA                                        */
/* -------------------------------------------------------------------- */
/*                                                                      */
/* Filename:    lucene-fileindex-defs.php                               */
/* Author:      Paul Waite                                              */
/* Description: Search Engine Module                                    */
/*              Specialised indexing class for indexing file content.   */
/*              Still tied to the deprecated lucene-defs.php module.    */
/*                                                                      */
/* ******************************************************************** */
/** @package search */

/**
* The file indexer class.
* This class indexes files on disc, either one by one or as a whole
* file hierarchy tree.
* @package search
*/
class fileindexer {
  // Public
  /** Application we are indexing for */
  var $application = "";
  /** Host to connect to */
  var $host = "";
  /** Port to connect to */
  var $port = "";

  // Private
  /** The index ID
      @access private */
  var $ixid;
  /** ID generation source (one of the ID_FROM_* constants)
      @access private */
  var $idsource = ID_FROM_INC;
  /** Scan for meta tags as fields in file content. Recommended.
      @access private */
  var $metascan = true;
  /** Meta fields definitions array. Maps field name => field type
      for the fields we will process if found as meta tags.
      @access private */
  var $meta_fields = array();
  /** Index fields definitions array. Maps field name => a
      "type|stored|indexed" string for the fields we expect to index.
      @access private */
  var $field_definitions = array();
  /** Fields for indexing. This is an array of fieldname/value
      pairs which should be added during the indexing. These
      fields do not have to appear in $field_definitions.
      @access private */
  var $indexfields = array();
  /** ID generation offset
      @access private */
  var $idoffset = 0;
  /** ID generation prefix
      @access private */
  var $idprefix = "";
  /** Timeout for indexing commands in seconds (can usually leave
      as nullstring)
      @access private */
  var $timeoutsecs = "";
  /** Path to a lockfile we should give way to. If this value
      is not nullstring, then no indexing will be done while the
      file exists. If lockfile_wait_secs is > 0, then we only wait
      this many seconds.
      @access private */
  var $lockfile = "";
  /** Number of seconds to wait on a lockfile. If zero, wait forever.
      @access private */
  var $lockfile_wait_secs = 0;
  /** Indexing execution timer
      @access private */
  var $timer;
  // .....................................................................
  /**
  * Constructor
  * Create a new file indexer. Stores the connection details and
  * creates the execution timer used by index_tree().
  * NB: declared as __construct() because PHP 8 no longer treats a method
  * named after the class as the constructor.
  * @param string $application Application name
  * @param string $host Hostname or IP of search engine server
  * @param string $port Port of search engine server
  */
  function __construct($application="?", $host="", $port="") {
    // Store for reference..
    $this->application = $application;
    $this->host = $host;
    $this->port = $port;
    $this->timer = new microtimer();
  } // __construct
  // .....................................................................
  /**
  * Legacy (PHP4-style) constructor. Retained so that existing code which
  * calls this method explicitly keeps working. It simply delegates to
  * __construct().
  * @param string $application Application name
  * @param string $host Hostname or IP of search engine server
  * @param string $port Port of search engine server
  */
  function fileindexer($application="?", $host="", $port="") {
    $this->__construct($application, $host, $port);
  } // fileindexer
  // .....................................................................
  /**
  * Define a field. We supply the name of the field, its type (Text, Date
  * or Id), and whether it should be stored by the search engine for later
  * retrieval in queries. For example you would not store the raw
  * document/content as this is usually stored elsewhere.
  * IMPORTANT NOTE: Fields defined here will automatically be included as
  * meta fields.
  * @see meta_field()
  * @param string  $fieldname Name of the field to index
  * @param string  $type Type of field data: Text, Date or Id.
  * @param boolean $stored If true then search engine will store the content itself
  * @param boolean $indexed If true then search engine will index the field content
  */
  function define_field($fieldname, $type, $stored=STORED, $indexed=INDEXED) {
    // Packed as "type|stored|indexed"; unpacked again in index_file()..
    $this->field_definitions[$fieldname]
          = $type . "|" . (($stored)  ? "true" : "false") . "|" . (($indexed)  ? "true" : "false");
    // Register for meta tags..
    $this->meta_field($fieldname, $type);
  } // define_field
  // .....................................................................
  /**
  * Define a lockfile which we must avoid during indexing. If defined
  * then no indexing will take place while the lockfile exists. The
  * second parameter allows you to specify a limit to the patience of
  * this process, in seconds. Zero means wait forever.
  * @param string $lockfile Path to the lockfile. Nullstring = not defined
  * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  */
  function avoid_lockfile($lockfile, $wait_secs=0) {
    $this->lockfile = $lockfile;
    $this->lockfile_wait_secs = $wait_secs;
  } // avoid_lockfile
  // .....................................................................
  /**
  * Define a field as a meta tag. This ensures that the field will be
  * picked up from the file meta tags, if present. If it is not listed
  * here then it will be ignored.
  * IMPORTANT NOTE: We define the strict rule that ONLY fields which have
  * been defined here can be added to the indexing via the meta tag scanning.
  * Ie. you must define fields here explicitly, or via the define_field()
  * method, or they will be ignored even if they turn up as a meta tag.
  * This is so we can restrict the indexing, and be sure of field types.
  * @see define_field()
  * @param string  $fieldname Name of the field to process as meta tag
  * @param string  $type Type of field data: Text, Date or Id.
  */
  function meta_field($fieldname, $type) {
    $this->meta_fields[$fieldname] = $type;
  } // meta_field
  // .....................................................................
  /**
  * Supply field content for indexing. This causes the search engine to take
  * the given fieldname and index the given value against it.
  * The field name can have the field type included in the form 'Foo:Date',
  * where 'Date' is the type in this instance. In fact, since 'Text' is the
  * default field type, 'Date' is probably the only one you need to use
  * as the current implementation stands.
  * The pair is remembered on this object and is sent with every file
  * subsequently indexed via index_file().
  * @param string $fieldname Name of the field to index.
  * @param string $fieldvalue Content of the field to index
  */
  function index_field($fieldname, $fieldvalue) {
    $this->indexfields[$fieldname] = $fieldvalue;
  } // index_field
  // .....................................................................
  /**
  * Set the source for ID generation. Since we are indexing a bunch of
  * files, the ID's have to be generated on demand inside the loop. So
  * we provide for various ways here, and you can extend this class to
  * provide more if required.
  * Main ways:
  *   ID_FROM_INC      Increment a counter by 1 each time (with offset)
  *   ID_FROM_NAME     Take the filename, strip the extension, add prefix
  *   ID_FROM_FILENAME Take the full filename, add prefix
  *   ID_FROM_PATH     Take the full file path
  *   NB: These are all defined as integer constants.
  * A string value of $pfxofs is stored as the ID prefix; any other
  * non-empty value is cast to integer and stored as the ID offset.
  * @param integer $idsource Source of ID generation
  * @param mixed $pfxofs String prefix, or integer offset
  */
  function id_generate($idsource=ID_FROM_INC, $pfxofs="") {
    $this->idsource = $idsource;
    if ($pfxofs != "") {
      if (is_string($pfxofs)) {
        $this->idprefix = $pfxofs;
      }
      else {
        $this->idoffset = (int)$pfxofs;
      }
    }
  } // id_generate
  // .....................................................................
  /**
  * Flag that we should do a tag scan on the content of the files to try
  * and extract fields to index. Note that any tags thus found will only
  * be used if the field name has been defined with the method define_field()
  * or meta_field().
  * This causes both the <title> tag and <meta> tags to be considered.
  * @see fileindexer::define_field()
  */
  function scantags() {
    $this->metascan = true;
  } // scantags
  // .....................................................................
  /**
  * Flag that we should NOT do a tag scan on the content of the files.
  */
  function noscantags() {
    $this->metascan = false;
  } // noscantags
  // .....................................................................
  /**
  * Index a file located at the given path, using given ID.
  * You can also use the parameter $fields to supply an array of
  * fieldname/value pairs to index with this file, for one-off indexing of
  * files. If the fieldname is a date field, make sure to define the
  * name as 'Foo:Date', to cause the field definition to be correct.
  * NB: the pairs in $fields are registered via index_field(), so they
  * remain set on this object for any later index_file() calls too.
  * @param string $path Path of the file to index
  * @param string $id ID to associate with the indexed file content
  * @param mixed $fields Array of field/values to index with file, or false
  * @return boolean True if the index message was sent successfully
  */
  function index_file($path, $id, $fields=false) {
    $success = false;
    $f = new inputfile($path);
    if ($f->opened) {
      $f->readall();
      $f->closefile();

      // Wait for a lockfile, if we really have to..
      if ($this->lockfile != "" && file_exists($this->lockfile)) {
        $waitforit = true;
        debugbr("waiting for lockfile..", DBG_DEBUG);
        if ($this->lockfile_wait_secs > 0) {
          $locktimer = new microtimer();
          $locktimer->start();
        }
        do {
          // file_exists() results are cached, so flush before re-testing..
          clearstatcache();
          if (!file_exists($this->lockfile)) {
            $waitforit = false;
            debugbr("lockfile has been removed..", DBG_DEBUG);
          }
          elseif ($this->lockfile_wait_secs > 0 && $locktimer->secs() >= $this->lockfile_wait_secs) {
            // Out of patience: carry on with the indexing regardless..
            $waitforit = false;
            debugbr("lockfile wait (" . $this->lockfile_wait_secs ."secs) timed out..", DBG_DEBUG);
          }
          else {
            sleep(1);
          }
        } while ($waitforit === true);
      }

      // Create the index message..
      $ix = new lucene_indexmsg($this->application, $this->host, $this->port);

      // Define the fields for the index message, unpacking the
      // "type|stored|indexed" strings built by define_field()..
      foreach ($this->field_definitions as $fieldname => $attributes) {
        $bits = explode("|", $attributes);
        $type = $bits[0];
        $stored  = (strcasecmp($bits[1], "true") == 0);
        $indexed = (strcasecmp($bits[2], "true") == 0);
        $ix->define_field($fieldname, $type, $stored, $indexed);
      }

      // Strip or blank out certain multi-byte sequences before the
      // content is scanned and indexed..
      $content = preg_replace("/[\xe2][\x80]./", "", $f->content);
      $content = preg_replace("/[\xc2][\xb7]./", "", $content);
      $content = preg_replace("/[\xc2]&/", " ", $content);
      $content = preg_replace("/[\xc3]&/", " ", $content);

      // Scan file content for meta tags for index fields..
      if ($this->metascan) {
        $tagpat = "/<meta name=\"(.*?)\" content=\"(.*?)\">/i";
        $matches = array();
        if (preg_match_all($tagpat, $content, $matches)) {
          $tagcount = count($matches[0]);
          for ($i=0; $i < $tagcount; $i++) {
            $fieldname  = $matches[1][$i];
            $fieldvalue = $matches[2][$i];
            if (isset($this->meta_fields[$fieldname])) {
              // Get type..
              $type = $this->meta_fields[$fieldname];
              if (!strcasecmp($type, "date")) {
                // Newsquest date field format requires stripping off a prefix
                // 'DT' - a temporary hack which should be completely transparent
                // to everyone else using this. NB: originally NewsQuest only
                // stored date in 'DTdd/mm/yyyy' format. This parsing is also
                // compatible with the new 'DTdd/mm/yyyy hh:mm[:ss]' format.
                if (substr($fieldvalue, 0, 2) == "DT") {
                  $fieldvalue = substr($fieldvalue, 2);
                }
                // Need to convert to Unix timestamp..
                $fieldvalue = displaydate_to_timestamp($fieldvalue);
              }
              debugbr("meta tag index field: $fieldname=$fieldvalue");
              $ix->index_field($fieldname, $fieldvalue);
            }
            else {
              debugbr("rejected unlisted tag field: $fieldname");
            }
          }
        }
        // Check for title tag in HTML page if required field. The field
        // name is the matched tag name itself, ie. 'title' as written..
        if (preg_match("/<(title)>(.*?)<\/title>/i", $content, $matches)) {
          $fieldname  = $matches[1];
          $fieldvalue = $matches[2];
          if (isset($this->meta_fields[$fieldname])) {
            debugbr("title tag index field: $fieldname=$fieldvalue");
            $ix->index_field($fieldname, $fieldvalue);
          }
        }
      } // metascan

      // Deal with passed-in field settings. These are meant to cater
      // for indexing of individual files using this method. We just
      // add them to any existing field/values already set up..
      if (is_array($fields)) {
        foreach ($fields as $fieldname => $fieldvalue) {
          $this->index_field($fieldname, $fieldvalue);
        }
      }

      // Process field/value pairs which have been added either by the
      // index_field() method, or passed in via the $fields parameter.
      // A name of form 'Foo:Date' carries its type; default is 'Text'..
      foreach ($this->indexfields as $fieldspec => $fieldvalue) {
        $bits = explode(":", $fieldspec);
        $type = ((isset($bits[1])) ? $bits[1] : "Text");
        $fieldname = $bits[0];
        debugbr("index field: $fieldname=$fieldvalue");
        $ix->define_field($fieldname, $type);
        $ix->index_field($fieldname, $fieldvalue);
      }

      // Index the file content. We get rid of any HTML tags..
      debugbr("indexing file: $path, ID=$id");
      $ix->index_content($id, strip_tags($content));

      // Send the index message to the search engine. We specify a large
      // timeout since we really want this to succeed and search engine
      // may be in an optimization fugue..
      $success = $ix->send(120);
      if(!$success) {
        debugbr("failed: $ix->error_msg");
      }
    }
    else {
      debugbr("open failed on '$path'");
    }
    return $success;
  } // index_file
  // .....................................................................
  /**
  * Index a tree of files starting at the path given. We index these in one
  * of four modes, which determines how we generate the ID for each item:
  * 'ID_FROM_INC' mode uses an incremental counter starting at 1. If an
  * offset has been set via id_generate(), the counter starts after that
  * number instead. Each item has an ID incremented by one from the last one.
  * 'ID_FROM_NAME' mode uses the filename, stripped of any path and extension
  * as the ID. If prefix is not a nullstring, then it is prefixed to every
  * filename ID.
  * 'ID_FROM_FILENAME' mode uses the filename, including any extension
  * as the ID. If prefix is not a nullstring, then it is prefixed to every
  * filename ID.
  * 'ID_FROM_PATH' mode uses the full path to the item being indexed as the
  * ID. If prefix is not a nullstring, then it is prefixed to every
  * filename ID.
  * The file will simply be indexed as a single Text field, with the
  * appropriate ID, and no other index fields unless $metascan is set to TRUE.
  * If this is the case, the system will scan the file for HTML meta tags of
  * form: '<meta name="foo" content="bar">'. In this example a field of name
  * 'foo' would be given value 'bar'.
  * The item list is produced by the 'find' command into a temporary file,
  * which is removed again once the run has finished. A list supplied by the
  * caller in restart mode is never removed.
  * @see id_generate()
  * @param string $path Path to the head of the file tree to index
  * @param string $patt Pattern to match, eg. '*.html'
  * @param string $restart If equal to "restart" then treat $path as file of paths
  * @param string $lockfile Path to the lockfile. Nullstring = not defined
  * @param integer $wait_secs Time to wait for lockfile. Zero means forever.
  */
  function index_tree($path, $patt="", $restart="", $lockfile="", $wait_secs=0) {
    // Set up any lockfile definition..
    $this->avoid_lockfile($lockfile, $wait_secs);

    // True when the item list is our own temporary file to clean up..
    $generated = false;
    if ($restart == "restart") {
      // Restart from existing paths file..
      $tmpfname = $path;
      debugbr("restarting with existing item list $path", DBG_DEBUG);
    }
    else {
      // Use find to generate item list to a temporary file. Every
      // argument is shell-escaped so that paths or patterns containing
      // spaces or shell metacharacters are passed through literally..
      debugbr("generating item list", DBG_DEBUG);
      $tmpfname = tempnam("/tmp", "LU");
      $generated = true;
      $cmd = "find " . escapeshellarg($path);
      if ($patt != "") $cmd .= " -name " . escapeshellarg($patt);
      $cmd .= " >" . escapeshellarg($tmpfname);
      exec($cmd);
    }
    $treelist = new inputfile($tmpfname);
    if ($treelist->opened) {
      // Find the number of items..
      debugbr("counting items", DBG_DEBUG);
      $todo = (int) exec("wc -l <" . escapeshellarg($tmpfname));
      if ($todo > 0) {
        $done = 0; $succeeded = 0; $failed = 0; $last = 0;
        debugbr("$todo items to index", DBG_DEBUG);
        $this->timer->start();
        $idix = 0;
        if ($this->idsource == ID_FROM_INC) {
          $idix += $this->idoffset;
        }

        while ($path = $treelist->readln()) {
          // Generate an ID to use..
          switch ($this->idsource) {
            case ID_FROM_INC:
              // Use incremented index..
              $idix += 1;
              $id = $idix;
              break;

            case ID_FROM_NAME:
              // Use filename, minus extension (text after the last dot)..
              $fname = basename($path);
              if (strstr($fname, ".")) {
                $bits = explode(".", $fname);
                array_pop($bits);
                $fname = implode(".", $bits);
              }
              $id = $this->idprefix . $fname;
              break;

            case ID_FROM_FILENAME:
              // Use full filename..
              $id = $this->idprefix . basename($path);
              break;

            case ID_FROM_PATH:
              // Use full file path..
              $id = $this->idprefix . $path;
              break;
          } // switch

          // Index the file with new ID. A failure is counted but does
          // not stop the run..
          if ($this->index_file($path, $id)) {
            debugbr("$id indexed", DBG_DEBUG);
            $succeeded += 1;
          }
          else {
            debugbr("$path index failed", DBG_DEBUG);
            $failed += 1;
          }

          // Progress check..
          $done += 1;

          // If the verbose output option is enabled, we compile
          // stats and display these via the debugger. A mark is output
          // each time the whole-number percentage reaches a new
          // multiple of five..
          if (debugging()) {
            $pct_int = (int) floor(($done / $todo) * 100);
            if ($pct_int % 5 == 0 && $pct_int > $last) {
              $secperdoc  = $this->timer->secs() / $done;
              $timedone = $this->timer->formatted_time();
              $timeleft = nicetime(($todo - $done) * $secperdoc);
              $ms = $this->timer->millisecs();
              $msper = number_format( ($ms / $done), 0);
              debugbr("Mark: $pct_int% $timedone ($done) Rate:$msper" . "ms/item Left:$timeleft", DBG_DEBUG);
              $last = $pct_int;
            }
          }
        } // while

        // Wrap it up..
        $this->timer->stop();

        // Final stats if verbose mode..
        if (debugging()) {
          $secs = $this->timer->secs();
          $secsperitem = $secs / $todo;
          $msper = number_format( (1000 * $secsperitem), 2);
          debugbr("time taken per item: " . $msper . "msec", DBG_DEBUG);
          // nicetime() gets the raw number: a number_format() string
          // contains thousands separators and is not a usable number..
          debugbr("time per 1000 items: " . nicetime($secsperitem * 1000), DBG_DEBUG);
          debugbr("total time taken: " . $this->timer->formatted_time(), DBG_DEBUG);
          debugbr("successfully indexed: $succeeded", DBG_DEBUG);
          debugbr("indexing failures: $failed", DBG_DEBUG);
        }
      }
      else {
        debugbr("nothing to index", DBG_DEBUG);
      }

      // Close tree list file, whether or not there was anything in it..
      $treelist->closefile();
    }
    else {
      debugbr("failed to open $tmpfname", DBG_DEBUG);
    }

    // Remove the item list if it was our own temporary file..
    if ($generated && is_string($tmpfname) && file_exists($tmpfname)) {
      unlink($tmpfname);
    }
  } // index_tree
} // fileindexer class

// ----------------------------------------------------------------------
?>