
/*
 * Copyright (c) 1998, 1999 Semiotek Inc. All Rights Reserved.
 *
 * This software is the confidential intellectual property of
 * of Semiotek Inc.; it is copyrighted and licensed, not sold.
 * You may use it under the terms of the GNU General Public License,
 * version 2, as published by the Free Software Foundation. If you 
 * do not want to use the GPL, you may still use the software after
 * purchasing a proprietary developers license from Semiotek Inc.
 *
 * This software is provided "as is", with NO WARRANTY, not even the 
 * implied warranties of fitness to purpose, or merchantability. You
 * assume all risks and liabilities associated with its use.
 *
 * See the attached License.html file for details, or contact us
 * by e-mail at info@semiotek.com to get a copy.
 */


package org.webmacro.engine;

import java.util.*;
import java.io.*;
import org.webmacro.util.*;

/**
  * Tokenizes WebMacro template files. It is a wrapper around a
  * StreamTokenizer that's configured to tokenize the kinds of symbols 
  * that appear in WebMacro template files.
  * <p>
  * It also provides several utility functions that make it
  * convenient to parse such files.
  * <p>
  * All characters are word characters except: {}();.=#&@$ space and newline
  * which are "ordinary"--meaning they can be returned as token types.
  * <p>
  * The parseTool also has the property that it returns unique strings 
  * only: If two strings returned by the parseTool are .equals equivalent,
  * then they are also == equivalent.
  * <p>
  * Furthermore, unlike StreamTokenizer, the pushback method for this 
  * ParseTool works for strings as well as regular tokens.
  */
public final class ParseTool
{

   /**
     * A one-element array whose only element is this ParseTool. It is 
     * not read anywhere inside this class.
     * NOTE(review): presumably a ready-made argument list for code 
     * elsewhere in the package -- confirm against callers.
     */
   final Object[] asArg  = { this };

   /**
     * The current token on the stack is unknown, usually meaning no
     * valid token has been read yet. The ParseTool begins in this state.
     */
   public static final int TT_UNKNOWN = -99; 

   /**
     * The current token on the stack is a word.
     */
   public static final int TT_WORD = StreamTokenizer.TT_WORD;

   /**
     * The current token on the stack is the end of file token
     */
   public static final int TT_EOF = StreamTokenizer.TT_EOF;

   /**
     * The current token on the stack is the end of line character.
     * nextToken() maps platform line endings onto this single value.
     */
   public static final int TT_EOL = '\n';

   /**
     * Every character that is tokenized as an "ordinary" character, 
     * that is, returned as a token of its own instead of being folded 
     * into a word. All other characters are word characters.
     */
   private static final String ORDINARY_CHARS =
        "'\"\\"                // escape character and quotes
      + " \t\n\r"              // whitespace, including both line-end chars
      + "#$"                   // things we look for in block
      + "{}();=."              // other things we actually use
      + "!%&*+,/:<>?@[]^|~";   // non-word characters


   /**
     * If the current token is a word it will appear in this variable
     */
   public String sval = null;

   /**
     * This is the last token we read, used for correct pushback. It is
     * TT_UNKNOWN when there is nothing that can be pushed back to.
     */
   private int last = TT_UNKNOWN;

   /**
     * This is the last sval we read, used for correct pushback
     */
   private String _lastsval = null;

   /**
     * This is the underlying stream tokenizer we use
     */
   private final StreamTokenizer _in;

   /**
     * Hashtable to hold parameters during parse
     */
   private final Hashtable _paramContext = new Hashtable();

   /**
     * This is the current token
     */
   public int ttype = TT_UNKNOWN;

   /**
     * What line of input are we on? Starts at 1 and is advanced by
     * nextToken() each time it produces a newline token.
     */
   private int _lineNum = 1;

   /**
     * The platform line separator (the "line.separator" system 
     * property), or "\n" if that property cannot be read. The value is
     * interned. nextToken() consults it to decide how a carriage return
     * in the input is to be interpreted.
     */
   public static final String lineSeparator;

   static {
      String sep = null;
      try {
         sep = System.getProperty("line.separator");
      } catch (SecurityException e) {
         // not permitted to read the property: use the default below
      }
      lineSeparator = ((sep != null) ? sep : "\n").intern();
   }

   /**
     * Our name, representing the source of the stream
     */
   private final String _name;

   /**
     * Create a parseTool that reads from the supplied file. The file's
     * name (without its directory) is used as the name of the stream.
     * <p>
     * NOTE(review): the reader opened here is never closed by this 
     * class, which has no close method.
     * @param in the file to read
     * @exception IOException if the file cannot be opened for reading
     */
   public ParseTool(File in) throws IOException {
      this(in.getName(),new BufferedReader(new FileReader(in)));
   }

   /**
     * Create a parseTool that reads from the supplied stream.
     * The supplied name is used in error messages, to identify
     * the source of the input.
     * @param name identifies the input, see getName() and toString()
     * @param inputStream where the characters to tokenize come from
     */
   public ParseTool(String name, Reader inputStream) {
      _name = name;
      _in = new StreamTokenizer(inputStream);

      // Start from an empty syntax table and make everything a word 
      // character, then carve out the characters that must come back 
      // as tokens of their own.
      _in.resetSyntax();
      _in.wordChars(0,0xffff);
      for (int i = 0; i < ORDINARY_CHARS.length(); i++) {
         _in.ordinaryChar(ORDINARY_CHARS.charAt(i));
      }

      // eolIsSignificant is deliberately left off: the line-end 
      // characters are ordinary characters, handled in nextToken()
   }

   /**
     * Return the name of the stream we read from
     */
   public final String getName() {
      return _name;
   }

   /**
     * A string suitable for use in error messages: the stream name and
     * the current line number, separated by a colon.
     */
   public final String toString()
   {
      return getName() + ":" + _lineNum;
   }

   /**
     * Return a unique String with the same value as this String. Any 
     * string returned by this method will be == equivalent to any other
     * string from this method that has the same value. The ParseTool also
     * calls this before setting sval. This uniqueness guarantee is 
     * provided only within a single JVM process (if you restart the JVM,
     * or pass this value to another JVM, the guarantee does not hold.)
     * <p>
     * @param str the string to be uniqued
     * @return a unique version of str
     */
   static final String unique(String str) 
   {
      return str.intern();
   }

   /**
     * Skip all tokens up to but not including the next newline. On 
     * return the current token is either the newline or end of file.
     * @return whether anything was skipped or not
     */
   public boolean parseToEOL() throws IOException {
      boolean ateSomething = false;
      while (ttype != TT_EOF && ttype != TT_EOL) {
         nextToken();
         ateSomething = true;
      }
      return ateSomething;
   }

   /**
     * Parse end of line character(s). Same as parseChar('\n');
     * OS dependent line separators are converted to the single 
     * character \n by nextToken().
     * @return true if \n found, false otherwise
     */
   public boolean parseEOL() throws IOException {
      return parseChar('\n');
   }

   /**
     * Skip to the first token that is neither a space nor a tab.
     * Newlines are not skipped.
     * @return whether anything was skipped
     */
   public boolean parseSpaces() throws IOException {
      boolean ateSpace = false;
      while ((ttype == ' ') || (ttype == '\t')) {
         ateSpace = true;
         nextToken();
      }
      return ateSpace;
   }

   /**
     * Skip to the next non-whitespace token. Whitespace is a space, 
     * a tab, or a newline.
     * @return whether anything was skipped
     */
   public boolean parseWhitespace() throws IOException {
      boolean ateSpace = false;
      while (ttype == ' ' || ttype == TT_EOL || ttype == '\t') {
         ateSpace = true;
         nextToken();
      }
      return ateSpace;
   }

   /**
     * If the current token is the specified character, advance to the 
     * next token.
     * @param c the character the current token is compared against
     * @return whether the character was eaten.
     */
   public boolean parseChar(char c) throws IOException
   {
      if (ttype != c) {
         return false;
      }
      nextToken();
      return true;
   }

   /**
     * Eat a string and return it. If the current token is not a 
     * string token, do not eat it, and return null.
     * @return the word eaten, or null if the current token is not a word.
     */
   public String parseString() throws IOException
   {
      if (ttype != TT_WORD) {
         return null;
      }
      String ret = sval;
      nextToken();
      return ret;
   }

   /**
     * Look for the named word. The comparison is exact, and therefore
     * case sensitive. If the current token is a word equal to the 
     * supplied one, advance to the next token; otherwise leave the 
     * current token where it is.
     * @param word the word we are looking for
     * @return whether or not we found (and ate) it
     */
   public boolean parseWord(String word) throws IOException
   {
      if (ttype != TT_WORD || !sval.equals(word)) {
         return false;
      }
      nextToken();
      return true;
   }

   /**
     * Push the parseTool back to the previous token. This only works
     * once per token, and if you attempt to call it twice you will 
     * get a ParseException. The same happens if fewer than two tokens 
     * have been read, since there is then no previous token.
     * <p>
     * This pushback, unlike StreamTokenizer's, will correctly reset 
     * the sval variable if it pushes back to a word token. It also
     * keeps the line number correct when the token being pushed back
     * is a newline.
     * @exception ParseException on attempt to push back twice
     */
   public void pushBack() throws ParseException {

      if (last == TT_UNKNOWN) {
         throw new ParseException(this,"Attempt to undo too many times.");
      }
      if (ttype == TT_EOL) {
         // This newline has already been counted, and nextToken() will
         // count it again when it is re-read; undo the first count so 
         // that it is not counted twice.
         _lineNum--;
      }
      ttype = last;
      last = TT_UNKNOWN;
      if (ttype == TT_WORD) {
         sval = _lastsval; 
      } 
      _in.pushBack();
   }

   /**
     * Get the next token on the stream. If it is a word, set sval. 
     * Either way the value of ttype will be set to the current token,
     * which is also returned.
     * <p>
     * Carriage returns never count as a line of their own unless the 
     * platform line separator is a lone "\r", in which case each one 
     * is reported as '\n'. On every other platform a carriage return 
     * is dropped and the token following it is returned in its place.
     * @return the new value of ttype
     */
   public final int nextToken() throws IOException
   {
      // remember where we were, so that pushBack() can return to it
      last = ttype;
      if (last == TT_WORD) {
         _lastsval = sval;
      } 

      ttype = _in.nextToken();

      if (ttype == '\r') {
         if ("\r".equals(lineSeparator)) {
            // a lone carriage return is this platform's line ending
            ttype = TT_EOL;
         } else  {
            // Treat it as the first half of a "\r\n" pair and discard 
            // it. The following token is taken as is, without checking
            // that it really is the '\n'.
            ttype = _in.nextToken();
         }
      }

      if (ttype == TT_EOL) {
         _lineNum++;
      }

      if (ttype == TT_WORD) {
         sval = unique(_in.sval);
      }
      return ttype;
   }

   /**
     * Determine which line we are on, possibly for reporting errors.
     * @return the line of input that the current token is on
     */
   public final int lineno() {
      return _lineNum;
   }

   /**
     * Parse the rest of the line as a string and return it. Every 
     * token from the current one up until the newline will be eaten 
     * and added to the string. 
     * <p>
     * It WILL eat the newline, and append it to the string it returns.
     * If end of file is reached first, the string has no newline at 
     * its end.
     * @return the text of the rest of the line
     */
   public final String parseLine() throws IOException 
   {
      StringBuffer str = new StringBuffer(96);
      while ((ttype != TT_EOL) && (ttype != TT_EOF)) {
         if (ttype == TT_WORD) {
            str.append(sval);
         } else {
            // an ordinary character: the token type is the character
            str.append((char) ttype);
         }
         nextToken();
      } 

      if (ttype == TT_EOL) {
         str.append((char) ttype);
         nextToken();
      } 
          
      return str.toString();
   }

   /**
     * Get the parameter hashtable. This can be used to store static data
     * values during parse. In the template parse, this is used to store 
     * the template parameters. The table itself is returned, not a 
     * copy of it.
     */
   public final Hashtable getParamContext() {
      return _paramContext;
   }

   /**
     * Test harness: tokenizes the fixed string "$;@", demonstrating 
     * pushBack(), and then tokenizes standard input until end of file, 
     * printing each token it finds.
     */
   public static void main(String[] arg) {
      try {

         System.out.println("Parsing the string $;@");
         ParseTool in = new ParseTool("string",new StringReader("$;@"));
         System.out.println("1st token: " + (char) in.nextToken());
         System.out.println("2nd token: " + (char) in.nextToken());
         System.out.println("3rd token: " + (char) in.nextToken());
         System.out.println("undo");
         in.pushBack();
         System.out.println("Current  : " + (char) in.ttype);
         System.out.println("Next tok : " + (char) in.nextToken());

         System.out.println();
         System.out.println("Parsing stdin:");
         System.out.println();

         in = new ParseTool("stdin",new InputStreamReader(System.in));
         while (in.nextToken() != TT_EOF) {
            if (in.ttype == TT_WORD) {
               System.out.println("Word: " + in.sval);
            } else if (in.ttype == TT_EOL) {
               System.out.println("LINEBREAK"); 
            } else {
               System.out.println("Token: " + (char) in.ttype);
            }
         }
      } catch (Exception e) {
         e.printStackTrace();
      }
   }
}


