/*
    BFilter - a web proxy which removes banner ads
    Copyright (C) 2002-2005  Joseph Artsimovich <joseph_a@mail.ru>

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
*/

#include <iostream>
#include <fstream>
#include <sstream>
#include "lexgen.h"
#include "NfaChar.h"
#include "NfaCharClass.h"
#include "NfaNegatedCharClass.h"
#include "NfaConcatenation.h"
#include "NfaClosure.h"
#include "NfaPositiveClosure.h"
#include "NfaOption.h"
#include "NfaString.h"
#include "NfaInsensitiveString.h"
#include "NfaUnion.h"
#include "NfaEpsilon.h"

#include "HtmlLexerDefinitions.h"

/**
 * Entry point of the lexer generator.
 *
 * Builds the set of lexer rules for the HTML lexer (scanner states are taken
 * from HtmlLexerDefinitions) and asks LexGen to write the generated class.
 *
 * Command line arguments (all required, in this order):
 *   argv[1]  name of the generated class
 *   argv[2]  path of the generated header file
 *   argv[3]  path of the generated implementation file
 *   argv[4]  name of the definitions class
 *   argv[5]  header of the definitions class
 *   argv[6]  name of the subclass
 *   argv[7]  header of the subclass
 *
 * \return 0 on success, 1 if too few arguments were given, an output file
 *         could not be opened, or writing the output failed.
 */
int main(int argc, char** argv)
{
	if (argc < 8) {
		std::cerr << "Usage: makelexer <OutputClass> <out.h> <out.cpp> "
			"<Definitions> <definitions.h> <Subclass> <subclass.h>" << std::endl;
		return 1;
	}
	const char* out_class = argv[1];
	const char* out_header = argv[2];
	const char* out_impl = argv[3];
	const char* def_class = argv[4];
	const char* def_header = argv[5];
	const char* subclass = argv[6];
	const char* subclass_header = argv[7];
	
	std::ofstream header_strm(out_header);
	if (header_strm.fail()) {
		std::cerr << "Failed opening " << out_header << " for writing" << std::endl;
		// Do not go on: everything written to a dead stream would be lost
		// and the caller (usually a build script) must see the failure.
		return 1;
	}
	
	std::ofstream impl_strm(out_impl);
	if (impl_strm.fail()) {
		std::cerr << "Failed opening " << out_impl << " for writing" << std::endl;
		return 1;
	}
	
	LexGen lexgen(out_class, subclass);
	
	// Building blocks shared by the rules below.
	NfaCharClass space(" \t\r\n");
	NfaCharClass alpha('A', 'Z');
	alpha.addChars('a', 'z');
	NfaCharClass ident(alpha);
	ident.addChars('0', '9');
	ident.addChars("-:");
	NfaNegatedCharClass not_ident('0', '9');
	not_ident.addChars('a', 'z');
	not_ident.addChars('A', 'Z');
	not_ident.addChars("-:");
	NfaNegatedCharClass any; // negation of an empty set of characters
	NfaEpsilon epsilon;
	NfaChar dquote('"');
	NfaChar squote('\'');
	NfaChar lbracket('<');
	NfaChar rbracket('>');
	NfaNegatedCharClass not_lbracket("<");
	NfaChar dash('-');
	NfaNegatedCharClass not_dash("-");
	
	NfaString comment_begin("<!--");
	NfaString comment_end("-->");
	
	// "</" followed by a letter, any number of identifier characters and ">"
	NfaConcatenation cdata_end(NfaString("</"), alpha);
	cdata_end.addComponent(NfaClosure(ident));
	cdata_end.addComponent(rbracket);
	
	// "<noscript" (case insensitive) followed by ">" or a whitespace character
	NfaConcatenation noscript_start(NfaInsensitiveString("<noscript"), NfaCharClass("> \t\r\n"));
	
	typedef HtmlLexerDefinitions Defs;
	
	/*
	When we enter the INITIAL state, the current token is always empty.
	Actions that do transition to the INITIAL state can't use MORE() or
	START_GROWING() or CONTINUE_GROWING().
	*/
	lexgen.addRule(Defs::INITIAL, NfaString("<!DOCTYPE"), space,
		"BEGIN(INSIDE_DOCTYPE); MORE();"
	).setLazy();
	lexgen.addRule(Defs::INITIAL, lbracket, alpha,
		"BEGIN(TAG_NAME); MORE();"
	).setLazy();
	lexgen.addRule(Defs::INITIAL, NfaString("</"), alpha,
		"BEGIN(CLOSING_TAG_NAME); MORE();"
	).setLazy();
	lexgen.addRule(Defs::INITIAL, comment_begin,
		"BEGIN(INSIDE_COMMENT); MORE();"
	).setLazy();
	lexgen.addRule(Defs::INITIAL, any, "BEGIN(TEXT); MORE();");
	
	// When we enter the TEXT state, the current token already contains the first character
	lexgen.addRule(Defs::TEXT, NfaClosure(not_lbracket),
		"BEGIN(INITIAL); obj->processText(tok_begin, tok_end);"
	).setLazyOnBlockEnd();
	
	// When we enter the INSIDE_COMMENT state, the current token may contain some part of the comment
	lexgen.addRule(Defs::INSIDE_COMMENT, NfaConcatenation(NfaClosure(dash), comment_end),
		"BEGIN(INITIAL); obj->processComment(tok_begin, tok_end);"
	).setLazy();
	lexgen.addRule(Defs::INSIDE_COMMENT, any,
		"BEGIN(INSIDE_COMMENT_NOEND); MORE(); IGNORE_MATCH();"
	);
	lexgen.addRule(Defs::INSIDE_COMMENT, epsilon,
		// an EOF while inside a comment
		"obj->processComment(tok_begin, tok_end);"
	);
	
	lexgen.addRule(Defs::INSIDE_COMMENT_NOEND, NfaConcatenation(NfaClosure(dash), NfaClosure(not_dash)),
		"BEGIN(INSIDE_COMMENT);\n\t"
		"if (tok_end.isAtRightBorder())\n\t"
		"obj->processComment(tok_begin, tok_end);\n\t"
		"else MORE();"
	).setLazyOnBlockEnd();
	
	// When we enter the CDATA state, the current token may already contain some part of it
	lexgen.addRule(Defs::CDATA, cdata_end,
		"Iterator begin(match_begin); begin += 2;\n\t"
		"Iterator end(match_end); --end;\n\t"
		"if (obj->isCDATAEnding(begin, end))\n\t"
		"{ BEGIN(INITIAL); IGNORE_MATCH(); obj->processCDATA(tok_begin, tok_end); }\n\t"
		"else MORE();"
	).setLazy();
	lexgen.addRule(Defs::CDATA, comment_begin,
		"BEGIN(CDATA_COMM); obj->processCDATA(tok_begin, tok_end);"
		// We commit the data accumulated so far, as we may return to this point
		// in case this comment section is unclosed.
	).setLazy();
	lexgen.addRule(Defs::CDATA, any, "BEGIN(CDATA_NOEND); MORE(); IGNORE_MATCH();");
	lexgen.addRule(Defs::CDATA, epsilon, "obj->processCDATA(tok_begin, tok_end);");
	
	lexgen.addRule(Defs::CDATA_NOEND, NfaConcatenation(NfaClosure(lbracket), NfaClosure(not_lbracket)),
		"BEGIN(CDATA);\n\t"
		"if (tok_end.isAtRightBorder())\n\t"
		"obj->processCDATA(tok_begin, tok_end);\n\t"
		"else MORE();"
	).setLazyOnBlockEnd();
	
	lexgen.addRule(Defs::CDATA_COMM, NfaConcatenation(NfaClosure(any), comment_end),
		"BEGIN(CDATA); MORE();"
	).setLazy();
	lexgen.addRule(Defs::CDATA_COMM, NfaConcatenation(NfaClosure(any), comment_begin),
		"BEGIN(CDATA); CONSUME_NOTHING();"
		// Nested comment inside CDATA.
		// We consider the outer comment unclosed and ignore it.
	).setLazy();
	lexgen.addRule(Defs::CDATA_COMM, epsilon,
		"BEGIN(CDATA); CONSUME_NOTHING();"
	);
	
	lexgen.addRule(Defs::INSIDE_DOCTYPE, NfaConcatenation(NfaClosure(any), rbracket),
		"BEGIN(INITIAL); obj->processDocType(tok_begin, tok_end);"
	).setLazy();
	lexgen.addRule(Defs::INSIDE_DOCTYPE, NfaConcatenation(NfaClosure(any), NfaChar('[')),
		"BEGIN(INSIDE_DOCTYPE_INTERNAL_SUBSET); MORE();"
	).setLazy();
	
	lexgen.addRule(Defs::INSIDE_DOCTYPE_INTERNAL_SUBSET, NfaConcatenation(NfaClosure(any), NfaChar(']')),
		"BEGIN(INSIDE_DOCTYPE); MORE();"
	).setLazy();
	
	// token already contains "<"
	lexgen.addRule(Defs::TAG_NAME, NfaClosure(ident),
		"BEGIN(INSIDE_TAG); MORE(); obj->processOpeningTagName(match_begin, tok_end);"
	);
	
	// token already contains "</"
	lexgen.addRule(Defs::CLOSING_TAG_NAME, NfaClosure(ident),
		"BEGIN(INSIDE_CLOSING_TAG); MORE(); obj->processClosingTagName(match_begin, tok_end);"
	);
	
	lexgen.addRule(Defs::INSIDE_TAG, NfaConcatenation(alpha, NfaClosure(ident)),
		"BEGIN(ATTR_EQUAL);\n\t"
		"if (obj->processAttrName(match_begin, tok_end))\n\t"
		"MORE();\n\t"
		"else { BEGIN(INITIAL); CONSUME_NOTHING(); }"
	);
	lexgen.addRule(Defs::INSIDE_TAG, NfaString("/>"),
		"BEGIN(INITIAL); obj->processOpeningTag(tok_begin, tok_end, true);"
	);
	lexgen.addRule(Defs::INSIDE_TAG, rbracket,
		"BEGIN(obj->isCDATAStarting() ? CDATA : INITIAL);\n\t"
		"obj->processOpeningTag(tok_begin, tok_end, false);"
	);
	lexgen.addRule(Defs::INSIDE_TAG, any, "MORE();");
	lexgen.addRule(Defs::INSIDE_TAG, epsilon,
		// EOF inside a tag. Treat the tag as text.
		"obj->processText(tok_begin, tok_end);"
	);
	
	// optional whitespace, "=", optional whitespace
	lexgen.addRule(Defs::ATTR_EQUAL, NfaConcatenation(NfaClosure(space), NfaChar('=')).addComponent(NfaClosure(space)),
		"BEGIN(ATTR_VALUE); MORE();"
	);
	lexgen.addRule(Defs::ATTR_EQUAL, epsilon,
		"BEGIN(INSIDE_TAG); MORE();\n\t"
		"obj->processAttrNullValue();"
	);
	
	lexgen.addRule(Defs::ATTR_VALUE, dquote, "BEGIN(ATTR_VALUE_DQ); MORE();");
	lexgen.addRule(Defs::ATTR_VALUE, squote, "BEGIN(ATTR_VALUE_SQ); MORE();");
	lexgen.addRule(Defs::ATTR_VALUE, any, "BEGIN(ATTR_VALUE_NQ); MORE(); IGNORE_MATCH();");
	lexgen.addRule(Defs::ATTR_VALUE, epsilon, "BEGIN(INSIDE_TAG); MORE();");
	
	// this one will match in any case
	lexgen.addRule(Defs::ATTR_VALUE_NQ, NfaClosure(NfaNegatedCharClass(" \t\n\r>")),
		"BEGIN(INSIDE_TAG); MORE(); obj->processAttrValue(match_begin, tok_end);"
	);
	
	lexgen.addRule(Defs::ATTR_VALUE_DQ, NfaClosure(NfaNegatedCharClass("\"")), dquote,
		"BEGIN(INSIDE_TAG); MORE(); obj->processAttrValue(match_begin, tok_end);"
	);
	// no closing double quote found
	lexgen.addRule(Defs::ATTR_VALUE_DQ, epsilon, "BEGIN(ATTR_VALUE_NQ); MORE();");
	
	lexgen.addRule(Defs::ATTR_VALUE_SQ, NfaClosure(NfaNegatedCharClass("'")), squote,
		"BEGIN(INSIDE_TAG); MORE(); obj->processAttrValue(match_begin, tok_end);"
	);
	// no closing single quote found
	lexgen.addRule(Defs::ATTR_VALUE_SQ, epsilon, "BEGIN(ATTR_VALUE_NQ); MORE();");
	
	lexgen.addRule(Defs::INSIDE_CLOSING_TAG, rbracket,
		"if (obj->isNoscriptToBeExpected())\n\t"
		"{ BEGIN(AFTER_SCRIPT_CLOSE); MORE(); }\n\t"
		"else { BEGIN(INITIAL); obj->processClosingTag(tok_begin, tok_end); }"
	);
	lexgen.addRule(Defs::INSIDE_CLOSING_TAG, any, "MORE();");
	lexgen.addRule(Defs::INSIDE_CLOSING_TAG, epsilon,
		"obj->processText(tok_begin, tok_end);"
	);
	
	lexgen.addRule(Defs::AFTER_SCRIPT_CLOSE, NfaConcatenation(NfaClosure(space), noscript_start),
		"BEGIN(INITIAL); IGNORE_MATCH(); obj->processClosingTag(tok_begin, tok_end, true);"
	);
	lexgen.addRule(Defs::AFTER_SCRIPT_CLOSE, epsilon,
		"BEGIN(INITIAL); obj->processClosingTag(tok_begin, tok_end);"
	);
	
	lexgen.writeLexer(header_strm, impl_strm, def_class, def_header, subclass_header);
	
	// Close explicitly so that buffered data is flushed now and any write
	// error (e.g. disk full) shows up in the stream state instead of being
	// silently dropped by the destructors.
	header_strm.close();
	impl_strm.close();
	
	bool write_failed = false;
	if (header_strm.fail()) {
		std::cerr << "Failed writing " << out_header << std::endl;
		write_failed = true;
	}
	if (impl_strm.fail()) {
		std::cerr << "Failed writing " << out_impl << std::endl;
		write_failed = true;
	}
	return write_failed ? 1 : 0;
}

