
// Modified XmlTokenMaker.java from RText - http://rtext.sourceforge.net/

package sdoc.lexers;

import sdoc.lexers.tokens.Token;
import java.util.ArrayList;
import java.util.List;
import javax.swing.text.Segment;
import java.io.CharArrayReader;
import java.io.IOException;
import sdoc.lexers.tokens.TokenFactory;

%%
/* Generated scanner class: public, named XmlLexer, implementing Lexer. */
%public
%class XmlLexer
%implements Lexer
%unicode
%pack
%buffer 128
/* The scanning method (yylex) returns a List; getTokens hands that list to callers. */
%type List



%{
	/** Tokens produced by the current scan; cleared at the start of every getTokens call and returned by the rule actions. */
	private List tokens = new ArrayList();
	
	/**
	* Token type specific to this lexer; it signals that a line ended
	* inside an unclosed XML tag, so the next line begins still inside
	* of the tag. getTokens maps it back to the INTAG state.
	*/
	public static final int INTERNAL_INTAG					= -1;
	/** Set to the segment offset by getTokens when resuming in a non-initial state; no code in this specification reads it. */
	private int start;
	
	/** No-argument constructor; the text to scan is supplied on each getTokens call. */
	public XmlLexer(){}
	
	/**
	 * Appends the factory's null token to the token list. The YYINITIAL
	 * rules use it to mark the end of a line that finished in the default state.
	 */
	private void addNullToken()
	{
		final Object endMarker = TokenFactory.createNullToken();
		tokens.add(endMarker);
	}
	
	
	/**
	 * Scans the given text and reports the type of the final token produced.
	 * That type can be passed as {@code initialTokenType} when scanning the
	 * following line, so multi-line constructs continue in the right state.
	 *
	 * @param text the characters to scan
	 * @param initialTokenType token type the previous line ended with
	 * @return the type of the last token in the token list after scanning
	 */
	public int getLastTokenTypeOnLine(Segment text , int initialTokenType)
	{
		// Scanning refills the shared token list as a side effect.
		getTokens(text, initialTokenType, 0);

		final int lastIndex = tokens.size() - 1;
		final Token lastToken = (Token) tokens.get(lastIndex);
		return lastToken.type;
	}
	
	
	/**
	 * Appends a token of the given type whose text is the current match.
	 * If the list holds nothing but a single null token, that placeholder is
	 * dropped first so it never precedes a real token.
	 *
	 * @param type the token type to assign to the matched text
	 */
	private void addToken(int type)
	{
		// Build the token before touching the list, as the match text comes from the scanner.
		final Token created = TokenFactory.createToken(type, yytext());

		boolean onlyPlaceholder = false;
		if (tokens.size() == 1) {
			final Token first = (Token) tokens.get(0);
			onlyPlaceholder = (first.type == Token.NULL);
		}

		if (onlyPlaceholder) {
			tokens.remove(0);
		}
		tokens.add(created);
	}
	
	
	/**
	 * Tokenizes one segment of text and returns the resulting tokens.
	 * <p>
	 * The list handed back by the scanner is this lexer's internal token list.
	 * It is cleared and refilled on every call, so a previously returned
	 * result is overwritten by the next call.
	 *
	 * @param text the characters to scan; its array, offset and count are used
	 * @param initialTokenType token type the previous line ended with; selects
	 *        the lexer state in which scanning resumes
	 * @param startOffset not used by this implementation
	 * @return the tokens produced by the scanner, or a new empty list if an
	 *         IOException occurs while scanning
	 */
	public List getTokens(Segment text, int initialTokenType, int startOffset) 
	{
		tokens.clear();

		// Map the token type that ended the previous line onto the lexer state
		// to resume in. The fallback is the lexer state YYINITIAL; the token
		// type constant Token.NULL is not a lexer state and must not be passed
		// to yybegin.
		int state;
		switch (initialTokenType) {
			case Token.COMMENT_MULTILINE:
				state = COMMENT;
				break;
			case Token.FUNCTION:
				state = DTD;
				break;
			case Token.PREPROCESSOR:
				state = PI;
				break;
			case INTERNAL_INTAG:
				state = INTAG;
				break;
			case Token.VARIABLE:
				state = CDATA;
				break;
			default:
				state = YYINITIAL;
		}

		// Only a resumed (non-initial) state records the segment offset.
		if (state != YYINITIAL) {
			start = text.offset;
		}

		try 
		{
			// yyreset must come first: yybegin afterwards selects the resumed state.
			yyreset(new CharArrayReader(text.array , text.offset , text.count));
			yybegin(state);
			return yylex();			
		} 
		catch (IOException ioe) 
		{
			ioe.printStackTrace();
			return new ArrayList();
		}
	}
	
	
%}

/* Horizontal whitespace; the newline is handled separately. */
Whitespace			= ([ \t\f])
LineTerminator			= ([\n])
/* Plain text outside of markup: a run of anything except space, tab, newline, '<' and '&'. */
Identifier			= ([^ \t\n<&]+)
/*
 * Entity or character reference, possibly unfinished: '&' optionally followed
 * by a name (letters and digits, starting with a letter) or by '#' plus digits
 * or hex digits, then an optional ';'. A lone '&' must still match, because no
 * other YYINITIAL rule accepts that character.
 */
AmperItem				= ([&]([#][0-9A-Za-z]+|[A-Za-z][0-9A-Za-z]*)?[;]?)
/* Tag or attribute name: a run of anything except space, tab, newline, quotes, '=' and '>'. */
InTagIdentifier		= ([^ \t\n\"\'=>]+)
/* Quoted attribute values; the "Unclosed" forms lack the terminating quote. */
UnclosedStringLiteral	= ([\"][^\"]*)
StringLiteral			= ({UnclosedStringLiteral}[\"])
UnclosedCharLiteral		= ([\'][^\']*)
CharLiteral			= ({UnclosedCharLiteral}[\'])
/* CDATA section delimiters. */
CDataBegin			= ("<![CDATA[")
CDataEnd				= ("]]>")

/*
 * Lexer states for constructs that can continue past the end of a line.
 * getTokens selects one of them from the token type the previous line ended with.
 */
%state COMMENT
%state PI
%state DTD
%state INTAG
%state CDATA

%%

/*
 * Default state: text outside of any markup. Each markup opener emits a token
 * and switches to the state that scans the rest of that construct. The longer
 * openers ("<!--", "<![CDATA[") win over "<!" and "<" by longest match.
 */
<YYINITIAL> 
{
	"<!--"							{ addToken(Token.COMMENT_MULTILINE); yybegin(COMMENT); }
	{CDataBegin}				{ addToken(Token.DATA_TYPE); yybegin(CDATA); }
	"<!"								{ addToken(Token.FUNCTION); yybegin(DTD); }
	"<?"								{ addToken(Token.PREPROCESSOR); yybegin(PI); }
	"<"									{ addToken(Token.SEPARATOR); yybegin(INTAG); }
	/* A newline or the end of input finishes the line: append the null token and return. */
	{LineTerminator}		{ addNullToken(); return tokens; }
	{Identifier}				{ addToken(Token.IDENTIFIER); }
	{AmperItem}					{ addToken(Token.DATA_TYPE); }
	{Whitespace}+				{ addToken(Token.WHITESPACE); }
	<<EOF>>							{ addNullToken(); return tokens; }
}

/*
 * Inside a comment opened by "<!--". Every piece is emitted as
 * COMMENT_MULTILINE; only "-->" returns to YYINITIAL.
 */
<COMMENT> 
{
	[^\n\-]+						{addToken(Token.COMMENT_MULTILINE);}
	/* Scanning stops at the first newline; anything after it is not tokenized by this call. */
	{LineTerminator}		{ addToken(Token.COMMENT_MULTILINE); return tokens; }
	"-->"								{ yybegin(YYINITIAL); addToken(Token.COMMENT_MULTILINE); }
	/* A '-' that does not start "-->" is ordinary comment text. */
	"-"									{addToken(Token.COMMENT_MULTILINE);}
	/* The token added at end of input makes COMMENT_MULTILINE the last token type, which getTokens maps back to this state. */
	<<EOF>>							{ addToken(Token.COMMENT_MULTILINE); return tokens; }
}

/*
 * Inside a processing instruction opened by "<?". Every piece is emitted as
 * PREPROCESSOR; only "?>" returns to YYINITIAL.
 */
<PI> 
{
	[^\n\?]+						{addToken(Token.PREPROCESSOR);}
	/* Scanning stops at the first newline; anything after it is not tokenized by this call. */
	{LineTerminator}		{ addToken(Token.PREPROCESSOR); return tokens; }
	"?>"								{ yybegin(YYINITIAL); addToken(Token.PREPROCESSOR); }
	/* A '?' that does not start "?>" is ordinary instruction text. */
	"?"									{addToken(Token.PREPROCESSOR);}
	/* The token added at end of input makes PREPROCESSOR the last token type, which getTokens maps back to this state. */
	<<EOF>>							{ addToken(Token.PREPROCESSOR); return tokens; }
}

/*
 * Inside a declaration opened by "<!". Every piece is emitted as FUNCTION;
 * the first '>' returns to YYINITIAL.
 */
<DTD> 
{
	[^\n>]+						{ addToken(Token.FUNCTION);}
	/* Scanning stops at the first newline; anything after it is not tokenized by this call. */
	{LineTerminator}	{ addToken(Token.FUNCTION); return tokens; }
	">"								{ yybegin(YYINITIAL); addToken(Token.FUNCTION); }
	/* The token added at end of input makes FUNCTION the last token type, which getTokens maps back to this state. */
	<<EOF>>						{ addToken(Token.FUNCTION); return tokens; }
}

/*
 * Inside a tag opened by "<": names, whitespace, '=' and quoted attribute
 * values, until '>' returns to YYINITIAL. Quoted values may be unterminated;
 * the closed forms win over the unclosed ones by longest match.
 */
<INTAG> 
{
	{InTagIdentifier}				{ addToken(Token.RESERVED_WORD); }
	{Whitespace}+						{ addToken(Token.WHITESPACE); }
	/*
	 * A bare newline matches none of the other rules in this state, so without
	 * this rule the scanner has no match for it. Stop at it as the other
	 * line-oriented states do, leaving INTERNAL_INTAG as the last token type
	 * so that the next line resumes inside the tag.
	 */
	{LineTerminator}				{ addToken(INTERNAL_INTAG); return tokens; }
	"="											{ addToken(Token.OPERATOR); }
	">"											{ yybegin(YYINITIAL); addToken(Token.SEPARATOR); }
	{UnclosedStringLiteral}	{ addToken(Token.ERROR_STRING_DOUBLE); }
	{StringLiteral}					{ addToken(Token.LITERAL_STRING_DOUBLE_QUOTE); }
	{UnclosedCharLiteral}		{ addToken(Token.ERROR_CHAR); }
	{CharLiteral}						{ addToken(Token.LITERAL_CHAR); }
	/* The token added at end of input makes INTERNAL_INTAG the last token type, which getTokens maps back to this state. */
	<<EOF>>									{ addToken(INTERNAL_INTAG); return tokens; }
}

/*
 * Inside a CDATA section opened by "<![CDATA[". Every piece is emitted as
 * VARIABLE; only "]]>" returns to YYINITIAL. Unlike the other states there is
 * no newline rule here: the first pattern also consumes newlines.
 */
<CDATA> 
{
	[^\]]+			{addToken(Token.VARIABLE);}
	{CDataEnd}	{ yybegin(YYINITIAL); addToken(Token.VARIABLE); }
	/* A ']' that does not start "]]>" is ordinary section text. */
	"]"					{addToken(Token.VARIABLE);}
	/* The token added at end of input makes VARIABLE the last token type, which getTokens maps back to this state. */
	<<EOF>>			{ addToken(Token.VARIABLE); return tokens; }
}