/* ============================================================
 * File  : harvester.cpp
 * Author: Eric Giesselbach <ericgies@kabelfoon.nl>
 * Date  : 2004-03-08
 * Description : scans url for links (streams)
 *
 * Copyright 2003 by Eric Giesselbach

 * This program is free software; you can redistribute it
 * and/or modify it under the terms of the GNU General
 * Public License as published by the Free Software Foundation;
 * either version 2, or (at your option)
 * any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * ============================================================ */


#include <qregexp.h>
#include <qdom.h>

#include "harvester.h"

using namespace std;


/*
 * Reads the complete contents of the file `fileName` into `*data`.
 *
 * fileName : path of the file to read
 * data     : receives the file contents; must point to a valid QString
 *
 * Returns true when the file could be opened for reading and its contents
 * were stored in *data. Returns false when `data` is null or the file
 * could not be opened; *data is left untouched in that case.
 */
bool readTextFile(QString fileName, QString *data)
{
    // A null output pointer would be dereferenced below; refuse it up front.
    if ( !data )
        return false;

    QFile f(fileName);
    if ( !f.open( IO_ReadOnly ) )
        return false;

    // Explicit construction instead of a C-style cast of the byte array.
    *data = QString( f.readAll() );
    f.close();
    return true;
}


/*
 * Creates a link record.
 *
 * nUrl   : address of the link
 * nName  : display name of the link
 * nDescr : description of the link
 *
 * The handler field starts out as an empty string.
 */
NewUrl::NewUrl(QString& nUrl, QString& nName, QString& nDescr) : QObject()
{
  this->url     = nUrl;
  this->name    = nName;
  this->descr   = nDescr;
  this->handler = "";
}

/*
 * Creates a parser and switches its result list to auto-delete mode.
 */
Parser::Parser() : QObject()
{
  // NOTE(review): presumably a Qt pointer list, in which case auto-delete
  // makes the list destroy its entries when they are removed - confirm
  // against the declaration of UrlList.
  this->list.setAutoDelete( true );
}

/*
 * Empties the result list when the parser is destroyed.
 */
Parser::~Parser()
{
  this->list.clear();
}

/*
 * Scans `data` for links and returns the parser's result list.
 *
 * The address of `data` is kept in `source` for run() to read, so `data`
 * must stay alive for the duration of the call. The returned pointer
 * refers to the parser's own list member, not to a copy.
 */
UrlList* Parser::parse(QString& data)
{
   this->source = &data;
   this->run();

   UrlList* found = &list;
   return found;
}


void Parser::run()
{
    // data to parse in *source.
    // result, state
    QString baseUrl, newUrl;
    int     thisStart, prevEnd, linkCount;
    QString copydata1, copydata2, resultdata;

    // general
    int i;
    QString name;
    QRegExp expr;

    expr.setCaseSensitive(false);

    copydata1  = "";
    copydata2  = "";
    resultdata = "";
    linkCount  = 0;

    // get <a href=url>title</a>
    expr.setPattern("<a\\s+(?:.(?!ref))?href\\s*=\\s*\"?\'?([\\w\\.\\-\\/\\:\\?\\&\\=\\_\\~]+)\"?\'?[^>]*>((?:.(?!\\</a))*.)");

    i = 0;
    prevEnd = 0;
    while ( i > -1 )
    {
      i = expr.search( *source, i );
      thisStart = i;
      i+= expr.matchedLength();
      if ( i > -1 )
      {
        newUrl = expr.cap(1);
        name   = expr.cap(2);
        name   = name.replace( QRegExp("<[^><]*>"), "" );  // tags contain url's
        copydata1  += source->mid(prevEnd, thisStart - prevEnd);
        resultdata += "\nHYPER***" + newUrl + "***HYPER***" + name + "***HYPER";
        prevEnd = i;
        linkCount++;
      }
    }
    copydata1 += source->mid(prevEnd, source->length() - prevEnd);

    // get protocol://url
    expr.setPattern("(?:^|[\\s<>\\[\\]\\:\\=\\(\\)\"\\'])(\\w+://[\\w\\.\\-\\/:\\?\\&\\=]+)");

    i = 0;
    prevEnd = 0;
    while ( i > -1 )
    {
      i = expr.search( copydata1, i );
      thisStart = i;
      i+= expr.matchedLength();
      if ( i > -1 )
      {
        newUrl = expr.cap(1);
        copydata2  += copydata1.mid(prevEnd, thisStart - prevEnd);
        resultdata += "\nHYPER***" + newUrl + "***HYPER***" + newUrl + "***HYPER";
        prevEnd = i;
        linkCount++;
      }
    }
    copydata2 += copydata1.mid(prevEnd, copydata1.length() - prevEnd);

    // new searches use copydata2 here...
    // get FRAME src=url
    expr.setPattern("FRAME\\s+(?:.(?!rc))?src\\s*=\\s*\"?\'?([\\w\\.\\-\\/\\:\\?\\&\\=\\_\\~]+)\"?\'?");

    i = 0;
    prevEnd = 0;
    while ( i > -1 )
    {
      i = expr.search( copydata2, i );
      thisStart = i;
      i+= expr.matchedLength();
      if ( i > -1 )
      {
        newUrl = expr.cap(1);
        //copydata1  += copydata2.mid(prevEnd, thisStart - prevEnd);
        resultdata += "\nHYPER***" + newUrl + "***HYPER***" + newUrl + "***HYPER";
        prevEnd = i;
        linkCount++;
      }
    }
    //copydata1 += copydata2.mid(prevEnd, copydata1.length() - prevEnd);


    // grep found url's
    expr.setPattern("HYPER\\*\\*\\*(.*)\\*\\*\\*HYPER\\*\\*\\*(.*)\\*\\*\\*HYPER");
    expr.setMinimal(true);


    i = 0;
    int dataMatches = 0;
    list.clear();
    
    while ( i > -1 )
    {
      i = expr.search( resultdata, i );
      i+= expr.matchedLength();
      if ( i > -1 )
      {
        newUrl = expr.cap(1);
        name = expr.cap(2);
        
	name = name.replace( QRegExp("[\r\n]"), "" );
        if ( name == "")
          name = newUrl;

          NewUrl *url = 0; 
	  url = new NewUrl(newUrl, name, newUrl);
          list.append( url );
          dataMatches++;
          //cout << "found: " << newUrl << "   name: " << name << endl;
      }
    }

}


