/* --------------------------------------------------------------------------
 *
 * Copyright (C) 2007 Leif Erik Larsen, Kjerringvik, Norway.
 *
 * This file is part of the Open Source Edition of Larsen Commander, as
 * available from http://home.online.no/~leifel/lcmd/.  This code is free 
 * software; you can redistribute it and/or modify it under the terms of 
 * the GNU General Public License version 3 only, as published by the 
 * Free Software Foundation.  
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 3 at http://www.gnu.org/licenses/gpl-3.0.txt for more details 
 * (a copy is included in the LICENSE file that accompanied this code).
 *
 * ------------------------------------------------------------------------ */

#ifndef __GLIB_TOKENIZER
#define __GLIB_TOKENIZER

#include "glib/util/GToken.h"
#include "glib/util/GAbstractTokenizer.h"
#include "glib/util/GArray.h"

/**
 * This class can be used to break a string into tokens (substrings).
 *
 * Usually a string consists of several tokens separated by whitespace
 * character(s) that can be seen as delimiters.
 *
 * This tokenizer treats all whitespace characters (spaces, tabs and
 * linefeeds) as delimiters by default. This is configurable
 * with the method {@link #setDeliminators}.
 *
 * The default set of delimiters includes all ASCII characters with a
 * value less than or equal to 32. Among others, these are:
 *
 * <pre>
 * space, tab, backtab, linefeed, carriage return, new page
 * </pre>
 *
 * In addition, you can specify a set of special single character tokens
 * with {@link #setSpecialTokens}. Single character tokens are the
 * characters that will be returned as a token in itself if one of them
 * is reached within the stream while not within an enclosing pair of
 * quotes, even if there are no surrounding delimiters.
 *
 * The default set of single character tokens includes the below
 * listed characters:
 *
 * <pre>
 * , ; : / - + * = ? ! ( ) [ ] { } < > % & | \
 * </pre>
 *
 * Here is an example of how to use the GTokenizer class:
 *
 * <pre>
 *    GString str = "p1=Param1 p2=\"Parameter 2\" p3=\"\\\"\"";
 *    GStringStream stream(str);
 *    GTokenizer tokenizer(stream, false);
 *    for (;;)
 *    {
 *       const GToken* token = tokenizer.getNextToken();
 *       if (token->isEmpty())
 *          break;
 *       printf("%s\n", token->toString().cstring());
 *    }
 *    printf("No more tokens!\n");
 * </pre>
 *
 * The above example will print the following output:
 *
 * <pre>
 *    p1
 *    =
 *    Param1
 *    p2
 *    =
 *    Parameter 2
 *    p3
 *    =
 *    "
 *    No more tokens!
 * </pre>
 *
 * Instances of this class cannot be copied; the copy constructor and the
 * assignment operator are declared private and are intentionally left
 * without an implementation.
 *
 * @author  Leif Erik Larsen
 * @since   1999.09.15
 */
class GTokenizer : public GAbstractTokenizer
{
   private:

      // NOTE(review): The role of each field below is inferred from its
      // name and from the documentation of the public methods of this
      // class. Confirm against the implementation file before relying
      // on any of these descriptions.

      /** Presumably the byte most recently read by {@link #getNextByte}. */
      int nextByte;

      /** Whitespace mode, as reported by {@link #isTokenizeWhiteSpace}. */
      bool tokenizeWhiteSpace;

      /** The stream of where to read characters. */
      class GInputStream* inputstream;

      /** Presumably true if the destructor must destroy the stream. */
      bool autoDeleteStream;

      /** Presumably the token object handed out by {@link #getNextToken}. */
      GToken currentToken;

      /** The delimiter characters; see {@link #setDeliminators}. */
      GString deliminators;

      /** The single character tokens; see {@link #setSpecialTokens}. */
      GString specialTokens;

      /** Presumably the value behind {@link #getCurModuleLineNr}. */
      int currentLineNum;

      /** Presumably the value behind {@link #getCurModuleColumn}. */
      int currentColumnPos;

      /** The two-character symbols; see {@link #setSpecialSymbols}. */
      GArray<GString> specialSymbols;

      /** Set by the backward compatibility constructor only (TODO confirm). */
      bool ignoreForcingCharacter;

      /** The forcing character itself; its value is defined elsewhere. */
      char forcingCharacter;

      static const GString DefaultSpecialTokens;

      static const GString Token_eq;

   public:

      /**
       * Construct a tokenizer for the specified input stream, using
       * the default set of special single character tokens.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @param   stream      The stream of where to read characters. For
       *                      instance, this can be a {@link GStringStream}.
       * @param   whiteSpace  True if we shall tokenize whitespace just as
       *                      normal tokens, or else false. The most typical
       *                      value of this parameter is false.
       */
      explicit GTokenizer ( class GInputStream& stream, bool whiteSpace );

      /**
       * Construct a tokenizer for the specified string, using
       * the default set of special single character tokens.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @param   str         The string to be tokenized.
       * @param   whiteSpace  True if we shall tokenize whitespace just as
       *                      normal tokens, or else false. The most typical
       *                      value of this parameter is false.
       */
      explicit GTokenizer ( const GString& str, bool whiteSpace );

      /**
       * Construct a tokenizer for the specified string, using the
       * specified set of special single character tokens.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @param   str           The string to be tokenized.
       * @param   specialTokens The set of special single character tokens.
       * @param   whiteSpace    True if we shall tokenize whitespace just as
       *                        normal tokens, or else false. The most typical
       *                        value of this parameter is false.
       */
      GTokenizer ( const GString& str, 
                   const GString& specialTokens, 
                   bool whiteSpace );

      /**
       * This constructor was added for backward compatibility with some
       * older Larsen Commander code. Chances are that this constructor will
       * be removed in the future.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.30
       * @param   str           The string to be tokenized.
       * @param   deliminators  The set of delimiter characters. See
       *                        {@link #setDeliminators}.
       * @param   specialTokens The set of special single character tokens.
       *                        See {@link #setSpecialTokens}.
       * @param   ignoreForcingCharacter  NOTE(review): presumably true if
       *                        the forcing character shall get no special
       *                        treatment. Confirm against the implementation.
       */
      GTokenizer ( const GString& str, 
                   const GString& deliminators, 
                   const GString& specialTokens, 
                   bool ignoreForcingCharacter );

      /**
       * Destroys the input stream, in case it is a temporary one created
       * internally by this class.
       *
       * This is the case if one of the constructors that takes a source
       * {@link GString} rather than a {@link GInputStream} was used.
       * In that case we did create a new {@link GStringStream}, which
       * will be destroyed by this destructor. Else this destructor will
       * do nothing but return.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       */
      virtual ~GTokenizer ();

   private:

      /**
       * Disable the copy constructor.
       *
       * Declared only, and intentionally not implemented. An empty
       * implementation would let code inside this class create a copy
       * with an uninitialized stream pointer without any diagnostic.
       * With no implementation, such a copy fails at link time.
       */
      GTokenizer ( const GTokenizer& src );

      /**
       * Disable the assignment operator.
       *
       * Declared only, and intentionally not implemented, for the same
       * reason as the copy constructor.
       */
      GTokenizer& operator= ( const GTokenizer& );

   private:

      /**
       * Check if the specified character is to be treated as
       * whitespace or not. A negative value is always considered to be
       * whitespace.
       */
      bool isWhiteSpace ( int chr ) const;

   public:

      /**
       * Set the set of delimiter characters.
       *
       * If an empty string is specified then we will use the default set
       * of delimiters, which include all ASCII characters with a
       * value less than or equal to 32.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.30
       */
      void setDeliminators ( const GString& delm );

      /**
       * Set the set of characters that should be treated as special
       * single character tokens.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       */
      void setSpecialTokens ( const GString& stok );

      /**
       * Set the set of two-character symbols that should be treated as
       * special tokens/symbols even if their first character is defined
       * in the set of special tokens as set by {@link #setSpecialTokens}.
       *
       * There are no defined special symbols by default.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       */
      void setSpecialSymbols ( const GArray<GString>& ssym );

      /**
       * Return true if and only if this tokenizer is configured to return
       * whitespace tokens just as normal tokens.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       */
      bool isTokenizeWhiteSpace () const;

   private:

      /**
       * Read and return next byte from the input stream.
       *
       * To be called internally by {@link #getNextToken} only.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @return  The next byte of stream, or -1 on end-of-stream.
       * @throws  GIOException  In case of any critical error when
       *                        attempting to read from the input stream,
       *                        but not on standard end-of-stream.
       */
      int getNextByte ();

      /**
       * Internal variant of the public {@link #getNextToken}.
       *
       * @author  Leif Erik Larsen
       * @since   2000.04.14
       * @param   ignoreWhiteSpace  NOTE(review): presumably true to skip
       *          whitespace rather than return it as a token. Confirm
       *          against the implementation.
       * @return  A pointer to the next token object. If there are no more
       *          tokens then we will return a token whose <i>isEmpty()</i>
       *          method will return true. We will never return null.
       *          Use <i>toString()</i> on the returned token to get the
       *          token as a string.
       * @throws  GIOException  In case of any critical error when
       *                        attempting to read from the input stream,
       *                        but not on standard end-of-stream.
       */
      const GToken* getNextToken ( bool ignoreWhiteSpace );

   public:

      /**
       * @see GAbstractTokenizer#getNextAbstractToken
       */
      virtual const GAbstractToken* getNextAbstractToken ();

      /**
       * @see GAbstractTokenizer#getCurModuleColumn
       */
      virtual int getCurModuleColumn () const;

      /**
       * @see GAbstractTokenizer#getCurModuleLineNr
       */
      virtual int getCurModuleLineNr () const;

      /**
       * Get the next token from the tokenizer stream.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @return  A pointer to the next token object. If there are no more
       *          tokens then we will return a token whose <i>isEmpty()</i>
       *          method will return true. We will never return null.
       *          Use <i>toString()</i> on the returned token to get the
       *          token as a string (if needed).
       */
      const GToken* getNextToken ();

      /**
       * Fetch the value to the right of the following "=" (equal sign) 
       * token and return it as a string.
       *
       * @author  Leif Erik Larsen
       * @since   2000.07.29
       * @throws  GSyntaxErrorException in case of any error.
       *                      Error can be: 1) No more tokens.
       *                                    2) Missing equal sign.
       *                                    3) Unknown error.
       */
      GString queryArgValueString ();
};

#endif


