/*
 * Copyright 2006 Ricoh Corporation.
 * 
 * 
 * APACHE LICENSE VERSION 2.0
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *      http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 * 
 * RICOH DEVELOPER PROGRAM SUPPORT:
 * 
 * Support for this software is available only to "Premier Plus" members
 * of the Ricoh Developer Program (RiDP).  You may find out more 
 * information about the Program at
 * 
 *      http://americas.ricoh-developer.com
 * 
 * Premier plus members may find answers and ask questions through the
 * RiDP customer help website at
 * 
 *      https://ridp.custhelp.com
 * 
 * Developers who are not RiDP members may still use this software as
 * stipulated in the license terms given above.
 *
 */ 

import java.net.*;
import java.io.*;
import java.util.*;

import com.hp.hpl.sparta.*;

class XHTMLUtils
{
	/**
	 *	An input-stream filter that makes a single-pass, stackless,
	 *	best-effort attempt to turn HTML into something an XML parser
	 *	will accept.  It works on raw bytes and:
	 *
	 *	  - drops comments and other "<!...>" declarations,
	 *	  - drops the tags named in shouldDiscardThisTag (start and end tags),
	 *	  - drops "script" / "style" elements together with their contents,
	 *	  - adds a self-closing "/" to the tags named in shouldAddEndTag,
	 *	  - passes everything else through untouched.
	 *
	 *	mark/reset are not supported.
	 */
	public static class HTMLSanitizingFilter extends FilterInputStream
	{
		/**
		 *	A byte buffer with three cursors that separates bytes that were
		 *	handed to the client, bytes that the filter has approved, and
		 *	bytes that still have to be examined.
		 */
		public static class ProcessBuffer
		{
			private byte[]	rawBuffer = null;

			// Space that readFromStream leaves free at the end of the
			// buffer so that small insertions usually fit without growing.
			private static final int paddingForFillableSpace = 100;

			//
			// This class maintains a buffer of characters that have been read
			// from the input stream.  The buffer layout looks something like
			// this:
			//
			//	+-------------------------------------------------------------------+
			//	|... some text in an HTML stream with a <b>tag</b> or two...|#######|
			//	+-------------------------------------------------------------------+
			//	             ^                         ^                    ^
			//	consumedCursor           processedCursor            endCursor
			//
			// The three cursors point to character positions in the buffer.
			// It is always true that consumedCursor <= processedCursor <= endCursor <= rawBuffer.length.
			// The meanings of the cursors are as follows:
			//
			//	consumedCursor:		Characters before this point have
			//				already been copied into a client
			//				buffer via a call to 'read' and
			//				are no longer needed.
			//
			//	processedCursor:	Characters after this point have not
			//				been processed yet
			//
			//	endCursor:		Characters before endCursor were read
			//				from the input stream (and possibly
			//				processed); characters after this point
			//				are invalid.
			//
			private int	endCursor = 0;
			private int	consumedCursor = 0;
			private int	processedCursor = 0;
			private boolean	reachedEOF = false;

			// Value (0..255) of the byte most recently moved into the
			// processed region, or -1 if there has not been one since the
			// last reset.  Kept separately so that it survives the byte
			// being consumed and flushed out of the buffer.
			private int	lastProcessedByte = -1;

			/**
			 *	Create a buffer that initially holds 'initialSize' bytes.
			 */
			public ProcessBuffer(int initialSize)
			{
				rawBuffer = new byte[initialSize];
			}

			/**
			 *	Throw away everything in the buffer and report end of
			 *	stream from now on.
			 */
			public void flushAndForceEOF()
			{
				this.reset();
				reachedEOF = true;
			}

			/**
			 *	Empty the buffer and clear the end-of-stream flag.
			 */
			public void reset()
			{
				endCursor = 0;
				consumedCursor = 0;
				processedCursor = 0;
				reachedEOF = false;
				lastProcessedByte = -1;
			}

			/**
			 *	Returns 'true' if the last of the buffer has been read
			 */
			public boolean eof()
			{
				return reachedEOF && (this.resident() == 0);
			}

			/**
			 *	Returns the number of bytes in the buffer excluding
			 *	those that have already been consumed or processed.
			 */
			public int unprocessed()
			{
				return endCursor - processedCursor;
			}

			/**
			 *	Returns the number of bytes in the buffer excluding
			 *	those that have already been consumed.
			 */
			public int resident()
			{
				return endCursor - consumedCursor;
			}

			/**
			 *	Return the number of bytes in the buffer that have
			 *	been processed but not consumed.
			 */
			public int available()
			{
				return processedCursor - consumedCursor;
			}

			/**
			 *	Returns the number of bytes between endCursor and the
			 *	end of the underlying array.
			 */
			public int unused()
			{
				return rawBuffer.length - endCursor;
			}

			/**
			 *	Returns how many bytes readFromStream may add: the unused
			 *	space minus the padding reserved for insertions (never
			 *	negative).
			 */
			public int fillableSpace()
			{
				int spare = this.unused() - paddingForFillableSpace;
				return (spare > 0) ? spare : 0;
			}

			/**
			 *	Mark all of the processed bytes in the buffer
			 *	consumed without doing anything with them.
			 *
			 *	Returns the number of bytes that were marked.
			 */
			public int consume()
			{
				int actuallyConsumed = this.available();
				consumedCursor = processedCursor;
				return actuallyConsumed;
			}

			/**
			 *	Mark the next 'desiredSize' processed bytes from the
			 *	buffer as consumed.
			 *
			 *	Returns the number of bytes actually marked, which is
			 *	smaller than 'desiredSize' when fewer are available and
			 *	zero when 'desiredSize' is not positive.
			 */
			public int consume(int desiredSize)
			{
				int actuallyConsumed = Math.min(desiredSize, this.available());
				if(actuallyConsumed < 0)
					actuallyConsumed = 0;	// never move the cursor backwards

				consumedCursor += actuallyConsumed;

				return actuallyConsumed;
			}

			/**
			 *	Copy the next 'desiredSize' bytes from the buffer
			 *	into the provided byte array and mark them as
			 *	consumed.
			 *
			 *	Returns the number of bytes actually copied to
			 *	b[off .. off + result - 1].
			 */
			public int consume(byte[] b, int off, int desiredSize)
			{
				int srcPosition = consumedCursor;
				int availableLength = this.consume(desiredSize);

				System.arraycopy(rawBuffer, srcPosition, b, off, availableLength);

				return availableLength;
			}

			/**
			 *	Remove all of the consumed bytes from the buffer.
			 */
			public void flushConsumed()
			{
				if(this.resident() > 0)
				{
					System.arraycopy(rawBuffer, consumedCursor, rawBuffer, 0, this.resident());
				}

				processedCursor -= consumedCursor;
				endCursor -= consumedCursor;
				consumedCursor = 0;
			}

			/**
			 *	Insert the bytes of 'textToAdd' (platform default
			 *	encoding) at the end of the processed region.
			 */
			public void insertIntoProcessed(String textToAdd)
			{
				this.insertIntoProcessed(textToAdd.getBytes());
			}

			/**
			 *	Insert 'textToAdd' at the end of the processed region,
			 *	in front of the unprocessed bytes.  The buffer is enlarged
			 *	if the insertion does not fit.
			 */
			public void insertIntoProcessed(byte[] textToAdd)
			{
				if(textToAdd.length == 0)
					return;

				this.flushConsumed();
				if(this.unused() < textToAdd.length)
				{
					//
					// Not enough room: move to a larger array.  At least
					// double the size so repeated insertions stay cheap,
					// and keep the usual padding free afterwards.
					//
					int required = endCursor + textToAdd.length + paddingForFillableSpace;
					byte[] grown = new byte[Math.max(required, rawBuffer.length * 2)];
					System.arraycopy(rawBuffer, 0, grown, 0, endCursor);
					rawBuffer = grown;
				}

				// Shift the unprocessed bytes right, then drop the new text into the gap.
				System.arraycopy(rawBuffer, processedCursor, rawBuffer, processedCursor + textToAdd.length, this.unprocessed());
				System.arraycopy(textToAdd, 0, rawBuffer, processedCursor, textToAdd.length);

				processedCursor += textToAdd.length;
				endCursor += textToAdd.length;
				lastProcessedByte = textToAdd[textToAdd.length - 1] & 0xFF;
			}

			/**
			 *	Flush the consumed bytes, then perform one read from the
			 *	provided input stream into the fillable space (if any).
			 *
			 *	Returns 'true' if a read was performed and it did not
			 *	report end of stream.
			 */
			public boolean readFromStream(InputStream inStream) throws IOException
			{
				boolean stateChanged = false;

				this.flushConsumed();
				if(this.fillableSpace() > 0)
				{
					int newlyRead = inStream.read(rawBuffer, endCursor, this.fillableSpace());

					reachedEOF = (newlyRead < 0);
					if(reachedEOF == false)
					{
						endCursor += newlyRead;
						stateChanged = true;
					}
				}

				return stateChanged;
			}

			/**
			 *	Mark the next 'desiredSize' unprocessed bytes as
			 *	processed.
			 *
			 *	Returns the number of bytes actually marked, which is
			 *	smaller than 'desiredSize' when fewer are unprocessed and
			 *	zero when 'desiredSize' is not positive.
			 */
			public int process(int desiredSize)
			{
				int actuallyProcessed = Math.min(desiredSize, this.unprocessed());
				if(actuallyProcessed < 0)
					actuallyProcessed = 0;	// never move the cursor backwards

				processedCursor += actuallyProcessed;
				if(actuallyProcessed > 0)
					lastProcessedByte = rawBuffer[processedCursor - 1] & 0xFF;

				return actuallyProcessed;
			}

			/**
			 *	Returns the byte most recently moved into the processed
			 *	region (by 'process' or 'insertIntoProcessed') as a
			 *	character, or 0 if there has not been one since the last
			 *	reset.
			 */
			public char lastProcessedChar()
			{
				return (lastProcessedByte < 0) ? (char)0 : (char)lastProcessedByte;
			}

			/**
			 *	Returns the first unprocessed character, or 0 if there
			 *	is none.
			 */
			public char nextUnprocessedChar()
			{
				return getUnprocessedChar(0);
			}

			/**
			 *	Returns the unprocessed byte at 'offset' as a character
			 *	in the range 0..255, or 0 if 'offset' is negative or past
			 *	the end of the unprocessed bytes.
			 */
			public char getUnprocessedChar(int offset)
			{
				char nextUnprocessed = 0;

				// Mask to avoid sign extension: a byte >= 0x80 must not
				// turn into an unrelated character in the U+FFxx range.
				if((offset >= 0) && (this.unprocessed() > offset))
					nextUnprocessed = (char)(rawBuffer[processedCursor + offset] & 0xFF);

				return nextUnprocessed;
			}

			/**
			 *	Returns the offset of the first whitespace character at
			 *	or after 'offset', or -1 if the unprocessed bytes end first.
			 */
			public int offsetToNextWhitespace(int offset)
			{
				while((this.unprocessed() > offset) && (Character.isWhitespace(this.getUnprocessedChar(offset)) == false))
				{
					++offset;
				}

				return (this.unprocessed() > offset) ? offset : -1;
			}

			/**
			 *	Returns the offset of the first non-whitespace character
			 *	at or after 'offset', or -1 if the unprocessed bytes end
			 *	first.
			 */
			public int offsetToNextNonWhitespace(int offset)
			{
				while((this.unprocessed() > offset) && (Character.isWhitespace(this.getUnprocessedChar(offset))))
				{
					++offset;
				}

				return (this.unprocessed() > offset) ? offset : -1;
			}

			/**
			 *	Returns 'true' for characters that may appear in a tag
			 *	name: letters, digits and ':'.
			 */
			static public boolean isIdentifier(char c)
			{
				return Character.isLetterOrDigit(c) || (c == ':');
			}

			/**
			 *	Returns the offset of the first non-name character at or
			 *	after 'offset', or -1 if the unprocessed bytes end first.
			 */
			public int offsetToNextNonName(int offset)
			{
				while((this.unprocessed() > offset) && (isIdentifier(this.getUnprocessedChar(offset)) == true))
				{
					++offset;
				}

				return (this.unprocessed() > offset) ? offset : -1;
			}

			/**
			 *	Returns the offset of the first occurrence of 'limitChar'
			 *	at or after 'offset', or -1 if the unprocessed bytes end
			 *	first.
			 */
			public int offsetToLimitChar(int offset, char limitChar)
			{
				while((this.unprocessed() > offset) && (this.getUnprocessedChar(offset) != limitChar))
				{
					++offset;
				}

				return (this.unprocessed() > offset) ? offset : -1;
			}

			/**
			 *	Keep processing until there are no more characters
			 *	left to process, OR until the NEXT unprocessed character
			 *	is 'limitChar'.
			 */
			public boolean processUntil(char limitChar)
			{
				while((this.unprocessed() > 0) && (this.nextUnprocessedChar() != limitChar))
				{
					this.process(1);
				}

				//
				// If there are unprocessed characters remaining, then we
				// must have found the limit character:  return true.  If
				// there are no unprocessed characters remaining, then
				// we cannot have found the limit character:  return false.
				//
				return this.unprocessed() > 0;
			}

			/**
			 *	Discard unprocessed bytes up to and including the first
			 *	'limitChar'.  If 'limitChar' does not occur, every
			 *	unprocessed byte is discarded.
			 *
			 *	Returns 'true' if 'limitChar' was found.
			 */
			public boolean discardUnprocessedInclusive(char limitChar)
			{
				int offset = this.offsetToLimitChar(0, limitChar);
				boolean found = (offset != -1);

				// offset + 1 also removes the limit character itself.
				this.discardFromUnprocessed(found ? (offset + 1) : this.unprocessed());

				return found;
			}

			/**
			 *	Remove the first 'len' unprocessed bytes from the buffer.
			 *
			 *	Returns 'true' if bytes were removed and unprocessed
			 *	bytes remain afterwards.
			 */
			public boolean discardFromUnprocessed(int len)
			{
				boolean remainingCharacters = false;

				if(len >= this.unprocessed())
				{
					endCursor = processedCursor;
				}
				else if(len > 0)
				{
					System.arraycopy(rawBuffer, processedCursor + len, rawBuffer, processedCursor, this.unprocessed() - len);
					endCursor -= len;
					remainingCharacters = true;
				}

				return remainingCharacters;
			}

			/**
			 *	Skip whitespace starting at 'offset' and return the run
			 *	of name characters that follows (possibly empty).
			 *
			 *	Returns null if the unprocessed bytes end before the
			 *	character that terminates the word has been seen.
			 */
			public String nextWord(int offset)
			{
				String result = null;

				int wordStartOffset = this.offsetToNextNonWhitespace(offset);
				if(wordStartOffset >= 0)
				{
					int wordEndOffset = this.offsetToNextNonName(wordStartOffset);
					if(wordEndOffset >= 0)
					{
						//
						// Build the word one char per byte (the same view
						// the scanning methods use) so that its length is
						// always the number of bytes it occupies, whatever
						// the platform's default charset is.
						//
						char[] wordChars = new char[wordEndOffset - wordStartOffset];
						for(int i = 0; i < wordChars.length; ++i)
						{
							wordChars[i] = this.getUnprocessedChar(wordStartOffset + i);
						}
						result = new String(wordChars);
					}
				}

				return result;
			}

			/**
			 *	Returns the processed-but-not-consumed bytes decoded with
			 *	the platform default charset.
			 */
			public String getProcessed()
			{
				return new String(rawBuffer, consumedCursor, this.available());
			}
		}

		private static final int defaultBufferSize = 4000;
		private ProcessBuffer buffer = null;

		/**
		 *	Wrap 'in' so that bytes read through this object are the
		 *	sanitized form of the bytes 'in' delivers.
		 */
		HTMLSanitizingFilter(InputStream in)
		{
			super(in);

			buffer = new ProcessBuffer(defaultBufferSize);
		}

		/**
		 *	mark/reset are not supported by this filter.
		 */
		public boolean markSupported()
		{
			return false;
		}

		/**
		 *	Does nothing.  Overridden so that the call is not forwarded to
		 *	the wrapped stream behind the back of the internal buffer.
		 */
		public void mark(int readlimit)
		{
		}

		/**
		 *	Always throws IOException:  mark/reset are not supported.
		 */
		public void reset() throws IOException
		{
			throw new IOException("mark/reset not supported");
		}

		/**
		 *	Returns the number of sanitized bytes that can be read
		 *	without touching the wrapped stream.
		 */
		public int available() throws IOException
		{
			// to do:  we should read ahead 'super.available()' bytes and process first, otherwise the result here is not as large as it could be

			return buffer.available();
		}

		/**
		 *	Skip up to 'len' sanitized bytes.
		 *
		 *	Returns the number of bytes actually skipped, which is
		 *	smaller than 'len' only if the end of the stream is reached.
		 */
		public long skip(long len) throws IOException
		{
			long actuallySkipped = 0;

			while((len > 0) && (buffer.eof() == false))
			{
				int skippedThisTime = 0;

				if(len > buffer.available())
					skippedThisTime = buffer.consume();
				else
					skippedThisTime = buffer.consume((int)len);

				actuallySkipped += skippedThisTime;
				len -= skippedThisTime;

				//
				// If len is not zero, fill the buffer
				//
				if(len > 0)
				{
					this.processMore();
				}
			}

			return actuallySkipped;
		}

		/**
		 *	Same as read(b, 0, b.length).
		 */
		public int read(byte[] b) throws IOException
		{
			return this.read(b, 0, b.length);
		}

		/**
		 *	Read one sanitized byte.
		 *
		 *	Returns the byte as a value in the range 0..255, or -1 at
		 *	the end of the stream.
		 */
		public int read() throws IOException
		{
			byte[] single = new byte[1];

			int lenRead = this.read(single, 0, 1);

			// Mask the byte: without it 0xFF would be returned as -1
			// and be mistaken for end of stream.
			return (lenRead > 0) ? (single[0] & 0xFF) : -1;
		}

		/**
		 *	Read sanitized bytes into b[off .. off + len - 1].  Keeps
		 *	reading from the wrapped stream until 'len' bytes have been
		 *	produced or the stream ends.
		 *
		 *	Returns the number of bytes stored, 0 if 'len' is not
		 *	positive, or -1 if no byte could be stored because the end of
		 *	the stream has been reached.
		 */
		public int read(byte[] b, int off, int len) throws IOException
		{
			if(len <= 0)
				return 0;

			int actualBytesRead = 0;

			while((len > 0) && (buffer.eof() == false))
			{
				int bytesCopied = buffer.consume(b, off, len);
				actualBytesRead += bytesCopied;
				off += bytesCopied;	// the next chunk goes AFTER the bytes already stored
				len -= bytesCopied;

				//
				// If len is not zero, fill the buffer
				//
				if(len > 0)
				{
					this.processMore();
				}
			}

			// The loop can only end with nothing stored when the buffer
			// reported end of stream; InputStream requires -1 then, not 0.
			return (actualBytesRead > 0) ? actualBytesRead : -1;
		}

		//
		// States of the scanner.  (The spelling of
		// DISCARD_SPECIAL_EXITINGGCOMMENT is kept because the constant
		// is visible outside this class.)
		//
		final static int SCAN_TEXT = 0;
		final static int DETERMINE_TAG_ACTION = 1;
		final static int PASS_TAG_THROUGH = 2;
		final static int PASS_TAG_THROUGH_ADD_END_MARK = 3;
		final static int DISCARD_TAG = 4;
		final static int DISCARD_TAG_AND_CONTENTS = 5;
		final static int DETERMINE_IF_AT_END_OF_TAG_CONTENTS = 6;

		final static int DISCARD_SPECIAL_OUTSIDECOMMENT = 10;
		final static int DISCARD_SPECIAL_ENTERINGCOMMENT = 11;
		final static int DISCARD_SPECIAL_INSIDECOMMENT = 12;
		final static int DISCARD_SPECIAL_EXITINGGCOMMENT = 13;

		// Current scanner state; one of the constants above.
		int mode = SCAN_TEXT;

		// Name of the element whose contents are being discarded; set
		// when DISCARD_TAG_AND_CONTENTS is entered from DETERMINE_TAG_ACTION.
		String currentTag = null;

		/**
		 *	Returns 'true' for tags that are removed from the output
		 *	(both their start tag and their end tag).
		 */
		protected boolean shouldDiscardThisTag(String tagName)
		{
			return		tagName.equalsIgnoreCase("link") ||
					tagName.equalsIgnoreCase("bgsound") ||
					tagName.equalsIgnoreCase("br") ||
					tagName.equalsIgnoreCase("col") ||
					tagName.equalsIgnoreCase("hr") ||
					tagName.equalsIgnoreCase("img") ||
					tagName.equalsIgnoreCase("isindex") ||
					tagName.equalsIgnoreCase("meta") ||
					tagName.equalsIgnoreCase("nextid") ||
					tagName.equalsIgnoreCase("p") ||
					tagName.equalsIgnoreCase("param") ||
					tagName.equalsIgnoreCase("plaintext") ||
					tagName.equalsIgnoreCase("sound") ||
					tagName.equalsIgnoreCase("spacer") ||
					tagName.equalsIgnoreCase("wbr");
		}

		/**
		 *	Returns 'true' for tags that are removed together with
		 *	everything up to their matching end tag.
		 */
		protected boolean shouldDiscardThisTagAndItsContents(String tagName)
		{
			return		tagName.equalsIgnoreCase("script") ||
					tagName.equalsIgnoreCase("style");
		}

		/**
		 *	Returns 'true' for tags whose start tag is rewritten as a
		 *	self-closing tag and whose end tag is removed.
		 */
		protected boolean shouldAddEndTag(String tagName)
		{
			return		tagName.equalsIgnoreCase("area") ||
					tagName.equalsIgnoreCase("base") ||
					tagName.equalsIgnoreCase("frame") ||
					tagName.equalsIgnoreCase("input") ||
					tagName.equalsIgnoreCase("keygen");
		}

		// ... we can't really fix these with a one-pass stackless filter...
		/**
		 *	Returns 'true' for tags whose end tag is commonly omitted.
		 *	Not used by the scanner in this class.
		 */
		protected boolean tagInFixEndtagAfterContentsSet(String tagName)
		{
			return		tagName.equalsIgnoreCase("dd") ||
					tagName.equalsIgnoreCase("dt") ||
					tagName.equalsIgnoreCase("li") ||
					tagName.equalsIgnoreCase("option") ||
					tagName.equalsIgnoreCase("td") ||
					tagName.equalsIgnoreCase("th");
		}

		/**
		 *	This function is called whenever someone wants to read
		 *	bytes from the buffer, but there are no processed bytes
		 *	available.
		 *
		 *	It refills the buffer from the wrapped stream and runs the
		 *	scanner until a step changes nothing.  If the whole call
		 *	changed nothing although the buffer is not at its end, the
		 *	scanner is stuck and progress is forced.
		 */
		protected void processMore() throws IOException
		{
			//
			// Start by getting rid of the consumed bytes
			// and then fill the buffer from the input stream.
			//
			boolean stateChanged = buffer.readFromStream(in);

			//
			// Run the finite state machine until we cannot
			// process any more text.  A step counts as progress if it
			// changed the mode, produced processed bytes, or removed
			// unprocessed bytes.  (Watching only the processed bytes
			// would miss mode changes and discards.)
			//
			boolean madeProgress;
			do
			{
				int modeBefore = mode;
				int availableBefore = buffer.available();
				int unprocessedBefore = buffer.unprocessed();

				this.stepStateMachine();

				madeProgress =	(mode != modeBefore) ||
						(buffer.available() != availableBefore) ||
						(buffer.unprocessed() != unprocessedBefore);
				if(madeProgress)
				{
					stateChanged = true;
				}
			}
			while(madeProgress);

			//
			// INFINITE LOOP PROTECTION:  If the state does not change,
			// then force some progress (broken output is better than an
			// infinite loop).  A buffer that is simply at its end needs
			// no help: the callers stop by themselves.
			//
			if((stateChanged == false) && (buffer.eof() == false))
			{
				if(buffer.unprocessed() > 0)
				{
					System.err.println("INFINITE LOOP PROTECTION (i.e., a bug):  reset scanning state");
					buffer.process(buffer.unprocessed());
				}
				else
				{
					System.err.println("INFINITE LOOP PROTECTION (i.e., a bug):  force termination");
					buffer.flushAndForceEOF();
				}
				mode = SCAN_TEXT;
			}
		}

		/**
		 *	Perform one step of the scanner in the current mode.  A step
		 *	that cannot decide yet (because the bytes it needs have not
		 *	arrived) changes nothing.
		 */
		private void stepStateMachine()
		{
			switch(mode)
			{
				case SCAN_TEXT:
				{
					// Plain text is passed through up to the next tag.
					if(buffer.processUntil('<'))
					{
						mode = DETERMINE_TAG_ACTION;
					}
					break;
				}

				case DETERMINE_TAG_ACTION:
				{
					// The first unprocessed character is '<'.
					boolean isEndTag = false;
					int offset = buffer.offsetToNextNonWhitespace(1);
					if(buffer.getUnprocessedChar(offset) == '!')
					{
						mode = DISCARD_SPECIAL_OUTSIDECOMMENT;
					}
					else
					{
						if(buffer.getUnprocessedChar(offset) == '/')
						{
							isEndTag = true;
							++offset;
						}
						offset = buffer.offsetToNextNonWhitespace(offset);

						// null means the name is not complete yet: wait for more bytes.
						String tagName = buffer.nextWord(offset);
						if(tagName != null)
						{
							if(this.shouldDiscardThisTag(tagName))
							{
								mode = DISCARD_TAG;
							}
							else if(this.shouldDiscardThisTagAndItsContents(tagName))
							{
								mode = DISCARD_TAG_AND_CONTENTS;
								currentTag = tagName;
							}
							else if(shouldAddEndTag(tagName))
							{
								if(isEndTag)
								{
									// The start tag was made self-closing, so the end tag must go.
									mode = DISCARD_TAG;
								}
								else
								{
									buffer.process(offset+tagName.length());
									mode = PASS_TAG_THROUGH_ADD_END_MARK;
								}
							}
							else
							{
								buffer.process(offset+tagName.length());
								mode = PASS_TAG_THROUGH;
							}
						}
					}

					break;
				}

				case PASS_TAG_THROUGH:
				{
					// TO DO:  at this point it is possible that there may still
					// be attributes with no value (e.g. "<td nowrap>...</td>").
					//
					// We should make a point of validating the tag contents; we
					// can just strip out attributes that have no ="value" component
					// (although it would be technically more correct to add ="1"
					// after value-less attributes.)

					if(buffer.processUntil('>'))
					{
						buffer.process(1); // include the '>' in the processed text
						mode = SCAN_TEXT;
					}
					break;
				}

				case PASS_TAG_THROUGH_ADD_END_MARK:
				{
					if(buffer.processUntil('>'))
					{
						// Do not turn an already self-closed "<x ... />" into "<x ... //>".
						if(buffer.lastProcessedChar() != '/')
						{
							buffer.insertIntoProcessed("/");
						}
						buffer.process(1); // include the '>' in the processed text
						mode = SCAN_TEXT;
					}
					break;
				}

				case DISCARD_TAG:
				{
					if(buffer.discardUnprocessedInclusive('>'))
					{
						mode = SCAN_TEXT;
					}
					break;
				}

				case DISCARD_TAG_AND_CONTENTS:
				{
					if(buffer.discardUnprocessedInclusive('<'))
					{
						// A '<' was removed; it may or may not start the
						// end tag we are waiting for.

						mode = DETERMINE_IF_AT_END_OF_TAG_CONTENTS;
					}
					break;
				}

				case DETERMINE_IF_AT_END_OF_TAG_CONTENTS:
				{
					// The '<' itself has already been discarded.
					int offset = buffer.offsetToNextNonWhitespace(0);
					if(buffer.getUnprocessedChar(offset) == '/')
					{
						++offset;
						offset = buffer.offsetToNextNonWhitespace(offset);

						String tagName = buffer.nextWord(offset);
						if(tagName != null)
						{
							if(currentTag.equalsIgnoreCase(tagName))
							{
								mode = DISCARD_TAG;
							}
							else
							{
								mode = DISCARD_TAG_AND_CONTENTS;
							}
						}
					}
					else if(offset > -1)
					{
						// Not an end tag at all: keep discarding.
						mode = DISCARD_TAG_AND_CONTENTS;
					}

					break;
				}

				//
				// The four states below discard a "<!...>" declaration one
				// character at a time.  "--" toggles between being inside
				// and outside a comment; a '>' outside a comment ends the
				// declaration.  Each state waits when no character is
				// available, so that a "--" split across two reads is still
				// recognized.
				//
				case DISCARD_SPECIAL_OUTSIDECOMMENT:
				{
					if(buffer.unprocessed() > 0)
					{
						char next = buffer.getUnprocessedChar(0);
						if(next == '-')
							mode = DISCARD_SPECIAL_ENTERINGCOMMENT;
						else if(next == '>')
							mode = SCAN_TEXT;
						buffer.discardFromUnprocessed(1);
					}

					break;
				}

				case DISCARD_SPECIAL_ENTERINGCOMMENT:
				{
					if(buffer.unprocessed() > 0)
					{
						char next = buffer.getUnprocessedChar(0);
						if(next == '-')
							mode = DISCARD_SPECIAL_INSIDECOMMENT;
						else if(next == '>')
							mode = SCAN_TEXT;	// "->" outside a comment still ends the declaration
						else
							mode = DISCARD_SPECIAL_OUTSIDECOMMENT;
						buffer.discardFromUnprocessed(1);
					}

					break;
				}

				case DISCARD_SPECIAL_INSIDECOMMENT:
				{
					if(buffer.unprocessed() > 0)
					{
						if(buffer.getUnprocessedChar(0) == '-')
							mode = DISCARD_SPECIAL_EXITINGGCOMMENT;
						buffer.discardFromUnprocessed(1);
					}

					break;
				}

				case DISCARD_SPECIAL_EXITINGGCOMMENT:
				{
					if(buffer.unprocessed() > 0)
					{
						if(buffer.getUnprocessedChar(0) == '-')
							mode = DISCARD_SPECIAL_OUTSIDECOMMENT;
						else
							mode = DISCARD_SPECIAL_INSIDECOMMENT;
						buffer.discardFromUnprocessed(1);
					}

					break;
				}
			}
		}
	}

	/**
	 *	Read the connection's content through an HTMLSanitizingFilter and
	 *	parse it into a document.  The stream is closed before returning,
	 *	whether or not parsing succeeded.
	 *
	 *	Declared to throw Exception because it passes on whatever the
	 *	connection or the parser throws; both callers catch Exception.
	 */
	private static Document parseSanitized(URLConnection theConnection) throws Exception
	{
		Reader reader = null;
		try
		{
			reader = new BufferedReader(new InputStreamReader(new HTMLSanitizingFilter(theConnection.getInputStream())));
			return Parser.parse(theConnection.getURL().toString(), reader);
		}
		finally
		{
			if(reader != null)
			{
				try
				{
					reader.close();
				}
				catch(IOException ignored)
				{
					// Nothing useful can be done about a failed close.
				}
			}
		}
	}

	/**
	 *	Fetch the document behind 'theConnection', sanitize it, and
	 *	evaluate 'xpath' against it as a string selection.
	 *
	 *	Returns whatever the document's xpathSelectString returns, or ""
	 *	if fetching, parsing or evaluation threw an exception (the
	 *	exception is reported on standard output).
	 */
	public static String xpathSelectString(URLConnection theConnection, String xpath)
	{
		String result = "";

		try
		{
			System.out.println("xpathSelectString from " + theConnection.getURL().toString());
			Document xmlDoc = parseSanitized(theConnection);
			result = xmlDoc.xpathSelectString(xpath);
			System.out.println("xpathSelectString returns: " + result);
		}
		catch(Exception e)
		{
			System.out.println("Could not evaluate xpath expression for URL connection:" + e);
		}

		return result;
	}

	/**
	 *	Fetch the document behind 'theConnection', sanitize it, select
	 *	one element with 'xpath' and read its attribute 'attributeName'.
	 *
	 *	Returns whatever the element's getAttribute returns, or "" if no
	 *	element was selected or if fetching, parsing or evaluation threw
	 *	an exception (the problem is reported on standard output).
	 */
	public static String xpathSelectAttribute(URLConnection theConnection, String xpath, String attributeName)
	{
		String result = "";

		try
		{
			System.out.println("xpathSelectAttribute from " + theConnection.getURL().toString());
			Document xmlDoc = parseSanitized(theConnection);
			Element elem = xmlDoc.xpathSelectElement(xpath);
			if(elem != null)
			{
				result = elem.getAttribute(attributeName);
				System.out.println("xpathSelectAttribute returns: " + result);
			}
			else
			{
				// Handle a non-matching expression explicitly instead of
				// relying on a NullPointerException being caught below.
				System.out.println("Could not evaluate xpath expression for URL connection: no element matches " + xpath);
			}
		}
		catch(Exception e)
		{
			System.out.println("Could not evaluate xpath expression for URL connection:" + e);
		}

		return result;
	}
}
