/*
    BFilter - a smart ad-filtering web proxy
    Copyright (C) 2002-2007  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#ifndef HTMLPROCESSOR_H_
#define HTMLPROCESSOR_H_

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include "HtmlLexer.h"
#include "SplittableBuffer.h"
#include "SBOutStream.h"
#include "BString.h"
#include "IntrusivePtr.h"
#include "RequestPtr.h"
#include "HtmlNode.h"
#include "HtmlFragmentNode.h"
#include "JsEnvProxy.h"
#include "URI.h"
#include "Conf.h"
#include <stddef.h>
#include <string>
#include <memory>
#include <map>
#include <iosfwd>

// Forward declarations. Within this header these types appear only as
// reference parameters / reference members (or are not used at all),
// so their full definitions are not needed here.
class ServiceContext;
class HttpRequestMetadata;
class HttpResponseMetadata;
class RequestTag;
class ErrorDescriptor;
class AdSuspect;
class AdSuspectList;
class HeuristicScore;

class HtmlProcessor : private HtmlLexer
{
public:
	HtmlProcessor(ServiceContext& service_context);
	
	~HtmlProcessor();
public:
	void setRelatedInfo(
		ConstRequestPtr const& request, RequestTag const& request_tag,
		bool xhtml_content, bool bf_analyze);
	
	void consume(SplittableBuffer& data, bool eof);
	
	SBOutStream& processedData() { return m_processedData; }
	
	void reset();
private:
	enum ContentType { CONTENT_HTML, CONTENT_XHTML };
	enum { MAX_DESCENDANT_SCRIPT_FETCHES = 20 };
	enum { MAX_SCRIPT_FETCH_REDIRECTS = 3 };
	enum NodeHandled { NODE_NOT_HANDLED, NODE_HANDLED };
	enum Action { PASS_THROUGH, SUBSTITUTE, ANALYZE };
	
	enum TagType {
		TAG_OTHER = 0,
		TAG_A,
		TAG_AREA,
		TAG_BASE,
		TAG_EMBED,
		TAG_IFRAME,
		TAG_IMG,
		TAG_MAP,
		TAG_NOSCRIPT,
		TAG_OBJECT,
		TAG_PARAM,
		TAG_SCRIPT,
		TAG_STYLE,
		TAG_TEXTAREA,
		NUM_TAG_TYPES
	};
	
	struct TagTypePair;
	struct IPair;
	class IPairIcaseComparator;
	class ScriptContext;
	class SubProcessor;
	class AbstractScriptOperation;
	class LeaveAsIsScriptOperation;
	class RemoveScriptOperation;
	class ReplaceBodyScriptOperation;
	class TempStreamScope;
	struct ScriptFetchResult;
	class PageOpenListener;
	class ListenerSuspender;
	
	friend class HtmlProcessor::ScriptContext;
	friend class HtmlProcessor::LeaveAsIsScriptOperation;
	friend class HtmlProcessor::RemoveScriptOperation;
	friend class HtmlProcessor::ReplaceBodyScriptOperation;
	
	HtmlProcessor(HtmlProcessor const& parent, ScriptContext& script_context);
	
	virtual void processDocType(Iterator const& begin, Iterator const& end);
	
	virtual void processText(Iterator const& begin, Iterator const& end);
	
	virtual void processComment(Iterator const& begin, Iterator const& end);
	
	virtual void processCDATA(Iterator const& begin, Iterator const& end);
	
	void processAsText(Iterator const& begin, Iterator const& end);
	
	void processAsText(SplittableBuffer const& buf) { processAsText(buf.begin(), buf.end()); }
	
	void processAsText(BString const& str);
	
	virtual void processOpeningTagName(Iterator const& begin, Iterator const& end);
	
	virtual void processOpeningTag(Iterator const& begin, Iterator const& end, bool explicit_empty);
	
	virtual void processClosingTagName(Iterator const& begin, Iterator const& end);
	
	virtual void processClosingTag(Iterator const& begin, Iterator const& end, bool noscript_follows = false);
	
	virtual bool processAttrName(Iterator const& begin, Iterator const& end);
	
	virtual void processAttrValue(Iterator const& begin, Iterator const& end);
	
	virtual void processAttrNullValue();
	
	virtual bool isCDATAStarting() const;
	
	virtual bool isCDATAEnding(Iterator const& begin, Iterator const& end) const;
	
	virtual bool isNoscriptToBeExpected() const;
	
	bool isLexerInTheMiddleOfSomething() const;
	
	Action adviceAction(HeuristicScore const& score) const;
	
	NodeHandled processOpeningOther(HtmlNode* node, bool explicit_empty);
	
	NodeHandled processOpeningA(HtmlNode* anchor, bool explicit_empty);
	
	NodeHandled processOpeningArea(HtmlNode* area, bool explicit_empty);
	
	NodeHandled processOpeningBase(HtmlNode* base, bool explicit_empty);
	
	NodeHandled processOpeningEmbed(HtmlNode* embed, bool explicit_empty);
	
	NodeHandled processOpeningIframe(HtmlNode* iframe, bool explicit_empty);
	
	NodeHandled processOpeningImg(HtmlNode* img, bool explicit_empty);
	
	NodeHandled processOpeningMap(HtmlNode* map, bool explicit_empty);
	
	NodeHandled processOpeningNoscript(HtmlNode* noscript, bool explicit_empty);
	
	NodeHandled processOpeningObject(HtmlNode* object, bool explicit_empty);
	
	NodeHandled processOpeningParam(HtmlNode* param, bool explicit_empty);
	
	NodeHandled processOpeningScript(HtmlNode* script, bool explicit_empty);
	
	void processClosingOther(Iterator const& begin, Iterator const& end);
	
	void processClosingA(Iterator const& begin, Iterator const& end);
	
	void processClosingHead(Iterator const& begin, Iterator const& end);
	
	void processClosingIframe(Iterator const& begin, Iterator const& end);
	
	void processClosingMap(Iterator const& begin, Iterator const& end);
	
	void processClosingNoscript(Iterator const& begin, Iterator const& end);
	
	void processClosingObject(Iterator const& begin, Iterator const& end);
	
	bool processFlashObject(
		HtmlNode* node, BString const& attr_name,
		BString const& width_str, BString const& height_str,
		bool loop, bool menu);
	
	void processClosingScript(Iterator const& begin, Iterator const& end);
	
	void processScript(HtmlNode* script, bool& nuke_adjacent_noscript);
	
	void processJavaScript(HtmlNode* script, bool& nuke_adjacent_noscript);
	
	void executeJavaScript(ScriptContext& context, HtmlNode* script,
		SplittableBuffer const& script_code, std::string const& script_src);
	
	std::auto_ptr<AbstractScriptOperation> processJavaScriptOutput(
		ScriptContext& context, HtmlNode* script,
		SplittableBuffer const& script_code, std::string const& script_src,
		bool dynamic_content, bool& nuke_adjacent_noscript);
	
	std::auto_ptr<AbstractScriptOperation> tryMarkScriptForAnalyzing(
		ScriptContext& context, AdSuspect const& suspect, HtmlNode* script,
		SplittableBuffer const& script_code, std::string const& script_src);
	
	std::auto_ptr<AbstractScriptOperation> substituteAdScript(
		ScriptContext& context, AdSuspect const& suspect, HtmlNode* script,
		SplittableBuffer const& script_code, std::string const& script_src,
		bool& nuke_adjacent_noscript);
	
	NodeHandled processImg(HtmlNode* img, bool explicit_empty, HtmlNode const* anchor);
	
	void processUnclosedTags(HtmlNode* to);
	
	void processOpeningTagAsText(HtmlNode* node, bool explicit_empty);
	
	void processIndependentNode(HtmlNode* node);
	
	void processDocFragmentNode(HtmlNode* node);
	
	void processDocFragmentNodeReplacement(HtmlNode* original, HtmlNode* replacement);
	
	void processDocFragmentNodeReplacement(HtmlNode* node, BString const& replacement);
	
	void processDocFragmentNodeReplacement(HtmlNode* node, SplittableBuffer const& replacement);
	
	bool processNode(HtmlNode* node, bool skip_stoppers,
		bool can_output_directly, bool toplevel=true);
	
	void docFragmentPutContainer(HtmlNode* node);
	
	void docFragmentPutElement(HtmlNode* node);
	
	void inlineExternalScript(HtmlNode* node, SplittableBuffer const& code);
	
	static bool isJavaScript(HtmlNode const* script);
	
	static bool isAncestorOrSelf(HtmlNode const* ancestor, const HtmlNode* descendant);
	
	static HtmlNode* findAncestorOrSelf(HtmlNode* node, BString const& name);
	
	HtmlNode* findMostEvilMapLink(HtmlNode* map);
	
	void outputData(std::string const& data);
	
	void outputData(SplittableBuffer const& data);
	
	void outputData(Iterator const& begin, Iterator const& end);
	
	void outputDocFragmentLeadingText();
	
	void outputPendingData();
	
	static void tagTextAppend(HtmlNode* el, Iterator const& begin, Iterator const& end);
	
	static void openingTagToStream(
		std::ostream& strm, HtmlNode const* node,
		ContentType ctype, bool explicit_empty);
	
	static TagType getTagType(Iterator const& begin, Iterator const& end);
	
	static bool isStopperTag(int type);
	
	static bool isClosingTagForbidden(BString const& tname);
	
	static bool tagTerminatesAnchor(Iterator const& begin, Iterator const& end);
	
	bool isXHTML() const { return m_contentType == CONTENT_XHTML; }
	
	int getScriptNestLevel() const;
	
	void fetchScript(
		RequestPtr const& request, ScriptFetchResult& result,
		size_t max_body_size, size_t max_fetch_size);
	
	RequestPtr prepareScriptFetchRequest(URI const& url);
	
	static bool hasPersistentCookies(HttpResponseMetadata const& metadata);
	
	static bool isPersistentCookie(std::string const& str);
	
	static bool isDynamicResource(HttpResponseMetadata const& metadata);
	
	static void stripBOM(SplittableBuffer& data);
	
	std::auto_ptr<AbstractScriptOperation>
	createScriptSubstitution(HtmlNode* node, AdSuspect const& ad);
	
	SBOutStream& tempStream() { return m_tempStream; }
	
	ServiceContext& m_rContext;
	ScriptContext* m_pScriptContext;
	Config::PageCleanupLevel m_pageCleanupLevel;
	ConstRequestPtr m_ptrRequestMetadata;
	URI m_baseURL;
	int m_requestGroupId;
	JsEnvProxy m_jsEnv;
	ContentType m_contentType;
	int m_pageHint;
	HtmlNodePtr m_ptrCurTag;
	HtmlFragmentNode m_docFragment;
	HtmlNode* m_pContainingNode; // points to either m_docFragment or one of its descendants
	HtmlNodePtr m_ptrContainingAnchor;
	HtmlNodePtr m_ptrContainingMap;
	typedef std::map<BString, HtmlNodePtr> TagsByName;
	typedef std::multimap<BString, HtmlNodePtr> MultiTagsByName;
	TagsByName m_mapTags;
	MultiTagsByName m_imgTagsUsingMap;
	int m_openingTagType;
	int m_closingTagType;
	BString m_curAttrName;
	bool m_isInsideNoscript;
	bool m_noscriptFollowsScript; // updated every time when a tag closes
	SBOutStream m_processedData;
	SBOutStream m_tempStream; // used instead of multiple local streams
	static SplittableBuffer const m_sEmptyBuffer;
};

#endif
