// Jericho HTML Parser - Java based library for analysing and manipulating HTML
// Version 2.1
// Copyright (C) 2005 Martin Jericho
// http://sourceforge.net/projects/jerichohtml/
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// http://www.gnu.org/copyleft/lesser.html
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
// Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this library; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package au.id.jericho.lib.html;

import java.util.*;

/**
 * Represents either a {@link StartTag} or {@link EndTag} in a specific {@linkplain Source source} document.
 *
 * <h3><a name="ParsingProcess">Tag Parsing Process</a></h3>
 * The following process describes how each tag is identified by the parser:
 * <ol class="Separated">
 *  <li>
 *   Every '<code>&lt;</code>' character found in the source document is considered to be the start of a tag.
 *   The characters following it are compared with the {@linkplain TagType#getStartDelimiter() start delimiters}
 *   of all the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, and a list of matching tag types
 *   is determined.
 *  <li>
 *   A more detailed analysis of the source is performed according to the features of each matching tag type from the first step,
 *   in order of <a href="TagType.html#Precedence">precedence</a>, until a valid tag is able to be constructed.
 *   <p>
 *   The analysis performed in relation to each candidate tag type is a two-stage process:
 *   <ol>
 *    <li>
 *     The position of the tag is checked to determine whether it is {@linkplain TagType#isValidPosition(Source,int) valid}
 *     for the candidate tag type.
 *     <p>
 *     By default, this check ensures that a tag found inside an HTML {@linkplain StartTagType#COMMENT comment}
 *     or {@linkplain StartTagType#CDATA_SECTION CDATA section} is ignored unless it is a {@linkplain TagType#isServerTag() server tag}.
 *     The check is performed by the {@link TagType#isValidPosition(Source, int pos)} method, which has a common default implementation
 *     for all tag types, but can be overridden in <a href="TagType.html#custom">custom</a> tag types.
 *     <p>
 *     The default implementation uses the static {@link TagType#getTagTypesIgnoringEnclosedMarkup() TagTypesIgnoringEnclosedMarkup}
 *     property to determine which tag types can not contain other tags.  The documentation of this property contains
 *     a more detailed analysis of the subject and explains why only the {@linkplain StartTagType#COMMENT comment} and 
 *     {@linkplain StartTagType#CDATA_SECTION CDATA section} tag types are included by default.
 *     <p>
 *     Note that in versions prior to 2.0, server tags located inside HTML comments were returned in
 *     <a href="#NamedSearch">named searches</a> but ignored in <a href="#OpenSearch">open searches</a>.
 *     CDATA sections were not recognised at all, so tags appearing inside them were always recognised.
 *    <li>
 *     A final analysis is performed by the {@link TagType#constructTagAt(Source, int pos)} method of the candidate tag type.
 *     This method returns a valid {@link Tag} object if all conditions of the candidate tag type are met, otherwise it returns
 *     <code>null</code> and the process continues with the next candidate tag type.
 *   </ol>
 *  <li>
 *   If the source does not match the start delimiter or syntax of any registered tag type, the segment spanning it and the next
 *   '<code>&gt;</code>' character is taken to be an {@linkplain #isUnregistered() unregistered} tag.
 *   Some tag search methods ignore unregistered tags.  See the {@link #isUnregistered()} method for more information.
 * </ol>
 * <p>
 * See the documentation of the {@link TagType} class for more details on how tags are recognised.
 *
 * <h3><a name="TagSearchMethods">Tag Search Methods</a></h3>
 * <p>
 * Methods that find tags in a source document are collectively referred to as <i>Tag Search Methods</i>.
 * They are found mostly in the {@link Source} and {@link Segment} classes, and can be generally categorised as follows:
 * <dl class="Separated">
 *  <dt><a name="OpenSearch">Open Search:</a>
 *   <dd>These methods search for tags of any {@linkplain #getName() name} and {@linkplain #getTagType() type}.
 *  <dt><a name="NamedSearch">Named Search:</a>
 *   <dd>These methods usually include a parameter called <code>name</code> which is used to specify the {@linkplain #getName() name} of the
 *    tag to search for.  In some cases named search methods do not require this parameter because the context or name of the method implies
 *    the name to search for.
 *    In tag search methods specifically looking for start tags, specifying a name that ends in a colon (<code>:</code>)
 *    searches for all start tags in the specified XML namespace.
 *  <dt><a name="TagTypeSearch">Tag Type Search:</a>
 *   <dd>These methods usually include a parameter called <code>tagType</code> which is used to specify the {@linkplain #getTagType() type} of the
 *    tag to search for.  In some methods the search parameter is restricted to the {@link StartTagType} subclass of <code>TagType</code>.
 *  <dt><a name="OtherSearch">Other Search:</a>
 *   <dd>A small number of methods do not fall into any of the above categories, such as the methods that search on
 *    {@linkplain Source#findNextStartTag(int pos, String attributeName, String value, boolean valueCaseSensitive) attribute values}.
 * </dl>
 */
public abstract class Tag extends Segment implements HTMLElementName {
	// The tag name returned by getName().  It is assigned in the constructor, which lower-cases the supplied
	// name and passes it through HTMLElements.getConstantElementName before storing it here.
	String name=null; // always lower case, can always use == operator to compare with constants in HTMLElementName interface

	/**
	 * Tag name constant for the {@linkplain StartTagType#XML_PROCESSING_INSTRUCTION XML processing instruction} tag type,
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link StartTagType#XML_PROCESSING_INSTRUCTION} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String PROCESSING_INSTRUCTION=StartTagType.XML_PROCESSING_INSTRUCTION.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the {@linkplain StartTagType#XML_DECLARATION XML declaration} tag type,
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link StartTagType#XML_DECLARATION} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String XML_DECLARATION=StartTagType.XML_DECLARATION.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the {@linkplain StartTagType#DOCTYPE_DECLARATION document type declaration} tag type,
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link StartTagType#DOCTYPE_DECLARATION} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String DOCTYPE_DECLARATION=StartTagType.DOCTYPE_DECLARATION.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the {@linkplain PHPTagTypes#PHP_STANDARD standard PHP} tag (<code>&lt;&#63;php &#46;&#46;&#46; &#63;&gt;</code>),
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link PHPTagTypes#PHP_STANDARD} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String SERVER_PHP=PHPTagTypes.PHP_STANDARD.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the common {@linkplain StartTagType#SERVER_COMMON server} tag (<code>&lt;% &#46;&#46;&#46; %&gt;</code>),
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link StartTagType#SERVER_COMMON} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String SERVER_COMMON=StartTagType.SERVER_COMMON.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the {@linkplain MasonTagTypes#MASON_NAMED_BLOCK Mason named block} (<code>&lt;%<i>name</i> &#46;&#46;&#46; &gt; &#46;&#46;&#46; &lt;/%<i>name</i>&gt;</code>),
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link MasonTagTypes#MASON_NAMED_BLOCK} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String SERVER_MASON_NAMED_BLOCK=MasonTagTypes.MASON_NAMED_BLOCK.getNamePrefixForTagConstant(); // NOTE: this value is the same value as SERVER_COMMON

	/**
	 * Tag name constant for the {@linkplain MasonTagTypes#MASON_COMPONENT_CALL Mason component call} (<code>&lt;&amp; &#46;&#46;&#46; &amp;&gt;</code>),
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link MasonTagTypes#MASON_COMPONENT_CALL} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String SERVER_MASON_COMPONENT_CALL=MasonTagTypes.MASON_COMPONENT_CALL.getNamePrefixForTagConstant();

	/**
	 * Tag name constant for the {@linkplain MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT Mason component called with content} (<code>&lt;&amp;| &#46;&#46;&#46; &amp;&gt; &#46;&#46;&#46; &lt;/&amp;&gt;</code>),
	 * holding the name prefix supplied by that tag type.
	 * @deprecated  Use {@link MasonTagTypes#MASON_COMPONENT_CALLED_WITH_CONTENT} in combination with <a href="#TagTypeSearch">tag type search</a> methods instead.
	 */
	public static final String SERVER_MASON_COMPONENT_CALLED_WITH_CONTENT=MasonTagTypes.MASON_COMPONENT_CALLED_WITH_CONTENT.getNamePrefixForTagConstant();

	// Consulted by includeInSearch(): while this is false, a tag whose isUnregistered() returns true
	// reports that it should not be included in a search.
	private static final boolean INCLUDE_UNREGISTERED_IN_SEARCH=false; // determines whether unregistered tags are included in searches

	/**
	 * Constructs a tag covering the specified span of the source document.
	 * <p>
	 * The supplied name is converted to lower case and passed through
	 * <code>HTMLElements.getConstantElementName</code> before being stored, which is what allows
	 * {@link #getName()} to return the {@link HTMLElementName} constant itself for known element names.
	 * <p>
	 * The case conversion uses {@link Locale#ENGLISH} rather than the platform default locale.
	 * With a default locale such as Turkish, <code>String.toLowerCase()</code> maps an upper case
	 * '<code>I</code>' to a dotless '<code>&#305;</code>', so a name such as <code>TITLE</code> would no longer
	 * match its lower case constant.
	 *
	 * @param source  the source document, passed to the {@link Segment} constructor.
	 * @param begin  the begin position, passed to the {@link Segment} constructor.
	 * @param end  the end position, passed to the {@link Segment} constructor.
	 * @param name  the name of the tag in any letter case, must not be <code>null</code>.
	 */
	Tag(final Source source, final int begin, final int end, final String name) {
		super(source, begin, end);
		// locale-independent lower-casing keeps the == comparison guarantee valid on every platform
		this.name=HTMLElements.getConstantElementName(name.toLowerCase(Locale.ENGLISH));
	}

	/**
	 * Returns the lower case name of this tag.
	 * <p>
	 * Every name begins with the {@linkplain TagType#getNamePrefix() name prefix} defined by the tag's {@linkplain TagType type}.
	 * Depending on the tag type, the prefix may be the entire name, or it may have to be followed by a valid
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML name</a>
	 * (see {@link StartTagType#isNameAfterPrefixRequired()}).
	 * <p>
	 * Whenever the name equals one of the constants declared in the {@link HTMLElementName} interface, the returned
	 * object is guaranteed to be that constant.
	 * Callers can therefore compare names with the <code>==</code> operator rather than the slower
	 * <code>String.equals(Object)</code> method.
	 * <p>
	 * As an example, this expression tests whether a {@link StartTag} belongs to a
	 * <code><a target="_blank" href="http://www.w3.org/TR/html401/interact/forms.html#edef-SELECT">SELECT</a></code> element:
	 * <br /><code>startTag.getName()==HTMLElementName.SELECT</code>
	 *
	 * @return the name of the tag, always in lower case.
	 */
	public String getName() {
		return this.name;
	}

	/**
	 * Indicates whether the tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}.
	 * <p>
	 * This method is abstract; each concrete subclass provides the implementation.
	 * <p>
	 * The only requirement of an unregistered tag type is that it {@linkplain TagType#getStartDelimiter() starts} with
	 * '<code>&lt;</code>' and there is a {@linkplain TagType#getClosingDelimiter() closing} '<code>&gt;</code>' character
	 * at some position after it in the source document.
	 * <p>
	 * The absence or presence of a '<code>/</code>' character after the initial '<code>&lt;</code>' determines whether an
	 * unregistered tag is respectively a
	 * {@link StartTag} with a {@linkplain #getTagType() type} of {@link StartTagType#UNREGISTERED} or an
	 * {@link EndTag} with a {@linkplain #getTagType() type} of {@link EndTagType#UNREGISTERED}.
	 * <p>
	 * There are no restrictions on the characters that might appear between these delimiters, including other '<code>&lt;</code>'
	 * characters.  This may result in a '<code>&gt;</code>' character that is identified as the closing delimiter of two
	 * separate tags, one an unregistered tag, and the other a tag of any type that {@linkplain #getBegin() begins} in the middle
	 * of the unregistered tag.  As explained below, unregistered tags are usually only found when specifically looking for them,
	 * so it is up to the user to detect and deal with any such nonsensical results.
	 * <p>
	 * Unregistered tags are only returned by <a href="Tag.html#NamedSearch">named search</a> methods, where the specified <code>name</code>
	 * matches the first characters inside the tag, or by <a href="Tag.html#TagTypeSearch">tag type search</a> methods, where the
	 * specified <code>tagType</code> is either {@link StartTagType#UNREGISTERED} or {@link EndTagType#UNREGISTERED}.
	 * <p>
	 * <a href="Tag.html#OpenSearch">Open</a> tag searches and <a href="Tag.html#OtherSearch">other</a> searches always ignore
	 * unregistered tags, although every discovery of an unregistered tag is {@linkplain Source#setLogWriter(Writer) logged} by the parser.
	 * <p>
	 * The logic behind this design is that unregistered tag types are usually the result of a '<code>&lt;</code>' character
	 * in the text that was mistakenly left {@linkplain CharacterReference#encode(CharSequence) unencoded}, or a less-than
	 * operator inside a script, or some other occurrence which is of no interest to the user.
	 * By returning unregistered tags in <a href="Tag.html#NamedSearch">named</a> and <a href="Tag.html#TagTypeSearch">tag type</a>
	 * search methods, the library allows the user to specifically search for tags with a certain syntax that does not match any
	 * existing {@link TagType}.  This expediency feature avoids the need for the user to create a
	 * <a href="TagType.html#Custom">custom tag type</a> to define the syntax before searching for these tags.
	 * By not returning unregistered tags in the less specific search methods, it is providing only the information that
	 * most users are interested in.
	 *
	 * @return <code>true</code> if the tag has a syntax that does not match any of the {@linkplain TagType#register() registered} {@linkplain TagType tag types}, otherwise <code>false</code>.
	 */
	public abstract boolean isUnregistered();

	/**
	 * Returns the {@linkplain TagType type} of this tag.
	 * <p>
	 * This method is abstract; each concrete subclass provides the implementation.
	 *
	 * @return the {@linkplain TagType type} of this tag.
	 */
	public abstract TagType getTagType();

	/**
	 * Regenerates the HTML text of this tag.
	 * <p>
	 * This is an abstract method which is implemented in the {@link StartTag} and {@link EndTag} subclasses.
	 * See the documentation of the {@link StartTag#regenerateHTML()} and {@link EndTag#regenerateHTML()} methods for details
	 * of what each implementation produces.
	 *
	 * @return the regenerated HTML text of this tag.
	 */
	public abstract String regenerateHTML();

	/**
	 * Tests whether a character may appear as the first character of an
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
	 * <p>
	 * According to the <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a>, a
	 * <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">Name</a></code> begins with one of
	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a> | '_' | ':')</code>.
	 * <p>
	 * The result of this method is equivalent to the expression
	 * <br /><code>Character.isLetter(ch) || ch=='_' || ch==':'</code>.
	 * <p>
	 * The <code>Character.isLetter()</code> notion of a letter differs in many respects from the
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">XML definition of a Letter</a>,
	 * but the differences are unlikely to matter in real-world XML or HTML documents.
	 *
	 * @param ch  the character to test.
	 * @return <code>true</code> if the specified character is valid at the start of an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
	 * @see Source#findNameEnd(int pos)
	 */
	public static final boolean isXMLNameStartChar(final char ch) {
		// the two permitted punctuation characters are checked first, anything else must be a letter
		if (ch=='_' || ch==':') return true;
		return Character.isLetter(ch);
	}

	/**
	 * Tests whether a character may appear at any position within an
	 * <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>.
	 * <p>
	 * The <a target="_blank" href="http://www.w3.org/TR/REC-xml/#sec-common-syn">XML 1.0 specification section 2.3</a> names this
	 * character set <code><a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-NameChar">NameChar</a></code> and defines it as
	 * <br /><code>(<a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Letter">Letter</a>
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Digit">Digit</a> | '.' | '-' | '_' | ':'
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-CombiningChar">CombiningChar</a>
	 * | <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Extender">Extender</a>)</code>.
	 * <p>
	 * The result of this method is equivalent to the expression
	 * <br /><code>Character.isLetterOrDigit(ch) || ch=='.' || ch=='-' || ch=='_' || ch==':'</code>.
	 * <p>
	 * The two definitions differ in many respects,
	 * but the differences are unlikely to matter in real-world XML or HTML documents.
	 *
	 * @param ch  the character to test.
	 * @return <code>true</code> if the specified character is valid anywhere in an <a target="_blank" href="http://www.w3.org/TR/REC-xml/#NT-Name">XML Name</a>, otherwise <code>false</code>.
	 * @see Source#findNameEnd(int pos)
	 */
	public static final boolean isXMLNameChar(final char ch) {
		switch (ch) {
			// the four punctuation characters permitted inside a name
			case '.':
			case '-':
			case '_':
			case ':':
				return true;
			default:
				return Character.isLetterOrDigit(ch);
		}
	}

	/**
	 * Indicates whether this tag should be included in a search.
	 * A tag is included if the INCLUDE_UNREGISTERED_IN_SEARCH constant is set, or if it is not {@linkplain #isUnregistered() unregistered}.
	 *
	 * @return <code>true</code> if this tag should be included in a search, otherwise <code>false</code>.
	 */
	final boolean includeInSearch() {
		if (INCLUDE_UNREGISTERED_IN_SEARCH) return true;
		return !isUnregistered();
	}

	/**
	 * Finds a tag relative to the specified position by delegating to the cache of the specified source.
	 *
	 * @param source  the source document whose cache is queried.
	 * @param pos  the position in the source document from which to search.
	 * @param previous  search direction flag, handed unchanged to the cache (presumably <code>true</code> searches backwards, as in the uncached search).
	 * @return the tag returned by the cache, which may be <code>null</code>.
	 */
	static final Tag findPreviousOrNextTag(final Source source, final int pos, final boolean previous) {
		final Tag found=source.cache.findPreviousOrNextTag(pos,previous);
		return found;
	}
		
	/**
	 * Searches the parse text of the source for the nearest tag of any type that is
	 * {@linkplain #includeInSearch() included in searches}, without consulting the cache for the search itself
	 * (individual candidate positions are still resolved through {@link #getTagAt(Source,int)}).
	 * <p>
	 * Starting at <code>pos</code>, each '<code>&lt;</code>' character found is examined in turn, moving backwards when
	 * <code>previous</code> is <code>true</code> and forwards otherwise, until a qualifying tag is found,
	 * no further '<code>&lt;</code>' character is found, or the position moves out of {@linkplain #inRange(Source,int) range}.
	 *
	 * @param source  the source document to search.
	 * @param pos  the position at which to start searching.
	 * @param previous  <code>true</code> to search backwards from <code>pos</code>, <code>false</code> to search forwards.
	 * @param breakAtPos  limit handed to the <code>ParseText</code> index search (presumably the position at which the search is abandoned - confirm against <code>ParseText</code>).
	 * @return the tag found, or <code>null</code> if there is none.
	 */
	static final Tag findPreviousOrNextTagUncached(final Source source, final int pos, final boolean previous, final int breakAtPos) {
		try {
			final ParseText parseText=source.getParseText();
			int begin=pos;
			do {
				// this assumes that all tags start with '<'
				begin=previous?parseText.lastIndexOf('<',begin,breakAtPos):parseText.indexOf('<',begin,breakAtPos);
				if (begin==-1) return null;
				final Tag tag=getTagAt(source,begin);
				if (tag!=null && tag.includeInSearch()) return tag;
				// no qualifying tag at this '<', so step past it in the search direction and look again
			} while (inRange(source,begin+=(previous?-1:1)));
		} catch (IndexOutOfBoundsException ignored) {
			// this should only happen when the end of file is reached in the middle of a tag.
			// we don't have to do anything to handle it as there are no more tags anyway.
		}
		return null;
	}
	
	/**
	 * Finds a tag of the specified type relative to the specified position by delegating to the cache of the specified source.
	 *
	 * @param source  the source document whose cache is queried.
	 * @param pos  the position in the source document from which to search.
	 * @param tagType  the tag type to search for, handed unchanged to the cache.
	 * @param previous  search direction flag, handed unchanged to the cache (presumably <code>true</code> searches backwards, as in the uncached search).
	 * @return the tag returned by the cache, which may be <code>null</code>.
	 */
	static final Tag findPreviousOrNextTag(final Source source, final int pos, final TagType tagType, final boolean previous) {
		final Tag found=source.cache.findPreviousOrNextTag(pos,tagType,previous);
		return found;
	}

	/**
	 * Searches the parse text of the source for the nearest tag of the specified type, without consulting the cache
	 * for the search itself (individual candidate positions are still resolved through {@link #getTagAt(Source,int)}).
	 * <p>
	 * A <code>null</code> tag type delegates to the overload that searches for tags of any type.
	 * Otherwise each occurrence of the tag type's start delimiter is examined in turn, moving backwards when
	 * <code>previous</code> is <code>true</code> and forwards otherwise, until a tag of exactly the requested type is found,
	 * the delimiter is not found again, or the position moves out of {@linkplain #inRange(Source,int) range}.
	 *
	 * @param source  the source document to search.
	 * @param pos  the position at which to start searching.
	 * @param tagType  the tag type to search for, or <code>null</code> to search for any tag.
	 * @param previous  <code>true</code> to search backwards from <code>pos</code>, <code>false</code> to search forwards.
	 * @param breakAtPos  limit handed to the <code>ParseText</code> index search (presumably the position at which the search is abandoned - confirm against <code>ParseText</code>).
	 * @return the tag found, or <code>null</code> if there is none.
	 */
	static final Tag findPreviousOrNextTagUncached(final Source source, final int pos, final TagType tagType, final boolean previous, final int breakAtPos) {
		if (tagType==null) return findPreviousOrNextTagUncached(source,pos,previous,breakAtPos);
		final char[] delimiter=tagType.getStartDelimiterCharArray();
		final int step=previous?-1:1;
		try {
			final ParseText text=source.getParseText();
			int candidate=pos;
			while (true) {
				if (previous)
					candidate=text.lastIndexOf(delimiter,candidate,breakAtPos);
				else
					candidate=text.indexOf(delimiter,candidate,breakAtPos);
				if (candidate==-1) break;
				final Tag tag=getTagAt(source,candidate);
				// tag types are compared by identity
				if (tag!=null && tag.getTagType()==tagType) return tag;
				candidate+=step;
				if (!inRange(source,candidate)) break;
			}
		} catch (IndexOutOfBoundsException ex) {
			// this should only happen when the end of file is reached in the middle of a tag.
			// we don't have to do anything to handle it as there are no more tags anyway.
		}
		return null;
	}

	/**
	 * Returns the tag at the specified position by delegating to the cache of the specified source.
	 *
	 * @param source  the source document whose cache is queried.
	 * @param pos  the position in the source document.
	 * @return the tag returned by the cache, which callers in this class treat as possibly <code>null</code>.
	 */
	static final Tag getTagAt(final Source source, final int pos) {
		final Tag tagAtPos=source.cache.getTagAt(pos);
		return tagAtPos;
	}

	/**
	 * Returns the tag at the specified position by delegating directly to <code>TagType.getTagAt</code>,
	 * without going through the cache of the source.
	 *
	 * @param source  the source document.
	 * @param pos  the position in the source document.
	 * @return the result of <code>TagType.getTagAt(source,pos)</code>.
	 */
	static final Tag getTagAtUncached(final Source source, final int pos) {
		final Tag tagAtPos=TagType.getTagAt(source,pos);
		return tagAtPos;
	}

	/**
	 * Indicates whether the specified position lies within the bounds of the source document.
	 * The upper bound is inclusive: a position equal to the length of the source is considered in range.
	 *
	 * @param source  the source document.
	 * @param pos  the position to test.
	 * @return <code>true</code> if <code>pos</code> is between zero and <code>source.length()</code> inclusive, otherwise <code>false</code>.
	 */
	static final boolean inRange(final Source source, final int pos) {
		if (pos<0) return false;
		return pos<=source.length();
	}

	/**
	 * Creates an iterator over the tags found by searching forwards from the specified position.
	 *
	 * @param source  the source document to search.
	 * @param pos  the position at which the first search starts.
	 * @return a new iterator whose elements are {@link Tag} objects.
	 */
	static Iterator getNextTagIterator(final Source source, final int pos) {
		final Iterator tagIterator=new NextTagIterator(source,pos);
		return tagIterator;
	}

	/**
	 * Iterator that returns successive tags found by repeated forward searches.
	 * <p>
	 * The tag to be returned next is always looked up one step ahead, so {@link #hasNext()} is a simple null test.
	 * Element removal is not supported.
	 */
	private static final class NextTagIterator implements Iterator {
		// the tag that the following call to next() will return, or null once the search is exhausted
		private Tag nextTag=null;

		/**
		 * Positions the iterator on the first tag found by searching forwards from <code>pos</code>.
		 *
		 * @param source  the source document to search.
		 * @param pos  the position at which the first search starts.
		 */
		public NextTagIterator(final Source source, final int pos) {
			nextTag=findPreviousOrNextTag(source,pos,false);
		}

		/**
		 * @return <code>true</code> if a further tag is available.
		 */
		public boolean hasNext() {
			return nextTag!=null;
		}

		/**
		 * Returns the pending tag and looks up the one after it, searching forwards from one position
		 * past the begin of the returned tag.
		 *
		 * @return the next {@link Tag}.
		 * @throws NoSuchElementException if there are no more tags.
		 */
		public Object next() {
			// explicit exhaustion check, so that a NullPointerException raised inside the search itself
			// is not misreported as the end of the iteration
			if (nextTag==null) throw new NoSuchElementException();
			final Tag result=nextTag;
			nextTag=findPreviousOrNextTag(result.source,result.begin+1,false);
			return result;
		}

		/**
		 * Not supported.
		 *
		 * @throws UnsupportedOperationException always.
		 */
		public void remove() {
			throw new UnsupportedOperationException();
		}
	}
}
