001    /* StreamTokenizer.java -- parses streams of characters into tokens
002       Copyright (C) 1998, 1999, 2000, 2001, 2002, 2003  Free Software Foundation
003    
004    This file is part of GNU Classpath.
005    
006    GNU Classpath is free software; you can redistribute it and/or modify
007    it under the terms of the GNU General Public License as published by
008    the Free Software Foundation; either version 2, or (at your option)
009    any later version.
010     
011    GNU Classpath is distributed in the hope that it will be useful, but
012    WITHOUT ANY WARRANTY; without even the implied warranty of
013    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
014    General Public License for more details.
015    
016    You should have received a copy of the GNU General Public License
017    along with GNU Classpath; see the file COPYING.  If not, write to the
018    Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
019    02110-1301 USA.
020    
021    Linking this library statically or dynamically with other modules is
022    making a combined work based on this library.  Thus, the terms and
023    conditions of the GNU General Public License cover the whole
024    combination.
025    
026    As a special exception, the copyright holders of this library give you
027    permission to link this library with independent modules to produce an
028    executable, regardless of the license terms of these independent
029    modules, and to copy and distribute the resulting executable under
030    terms of your choice, provided that you also meet, for each linked
031    independent module, the terms and conditions of the license of that
032    module.  An independent module is a module which is not derived from
033    or based on this library.  If you modify this library, you may extend
034    this exception to your version of the library, but you are not
035    obligated to do so.  If you do not wish to do so, delete this
036    exception statement from your version. */
037    
038    package java.io;
039    
040    /**
041     * This class parses streams of characters into tokens.  There are a
042     * million-zillion flags that can be set to control the parsing, as 
043     * described under the various method headings.
044     *
045     * @author Warren Levy (warrenl@cygnus.com)
046     * @date October 25, 1998.  
047     */
048    /* Written using "Java Class Libraries", 2nd edition, ISBN 0-201-31002-3
049     * "The Java Language Specification", ISBN 0-201-63451-1
050     * plus online API docs for JDK 1.2 beta from http://www.javasoft.com.
051     * Status:  Believed complete and correct.
052     */
053     
public class StreamTokenizer
{
  /** A constant indicating that the end of the stream has been read. */
  public static final int TT_EOF = -1;

  /** A constant indicating that the end of the line has been read. */
  public static final int TT_EOL = '\n';

  /** A constant indicating that a number token has been read. */
  public static final int TT_NUMBER = -2;

  /** A constant indicating that a word token has been read. */
  public static final int TT_WORD = -3;

  /** A constant indicating that no tokens have been read yet. */
  private static final int TT_NONE = -4;

  /**
   * Contains the type of the token produced by the most recent call to
   * <code>nextToken</code>.  The rules are as follows:
   * <ul>
   * <li>For a token consisting of a single ordinary character, this is the
   *     value of that character.</li>
   * <li>For a quoted string, this is the value of the quote character.</li>
   * <li>For a word, this is TT_WORD.</li>
   * <li>For a number, this is TT_NUMBER.</li>
   * <li>For the end of the line, this is TT_EOL.</li>
   * <li>For the end of the stream, this is TT_EOF.</li>
   * </ul>
   */
  public int ttype = TT_NONE;

  /** The String associated with word and string tokens. */
  public String sval;

  /** The numeric value associated with number tokens. */
  public double nval;

  /* Indicates whether end-of-line is recognized as a token. */
  private boolean eolSignificant = false;

  /* Indicates whether word tokens are automatically made lower case. */
  private boolean lowerCase = false;

  /* Indicates whether C++ style comments are recognized and skipped. */
  private boolean slashSlash = false;

  /* Indicates whether C style comments are recognized and skipped. */
  private boolean slashStar = false;

  /* Attribute tables, one entry per character value from 0x00 to 0xFF. */
  private boolean[] whitespace = new boolean[256];
  private boolean[] alphabetic = new boolean[256];
  private boolean[] numeric = new boolean[256];
  private boolean[] quote = new boolean[256];
  private boolean[] comment = new boolean[256];

  /* The Reader tokens are read from; it provides one char of pushback. */
  private PushbackReader in;

  /* Indicates if a token has been pushed back. */
  private boolean pushedBack = false;

  /* Contains the current line number of the reader (the first line is 1). */
  private int lineNumber = 1;

  /**
   * This constructor reads bytes from an <code>InputStream</code> and
   * tokenizes them.  The stream is wrapped in an
   * <code>InputStreamReader</code> created without an explicit charset.
   * For details on the default parsing tables, see
   * <code>StreamTokenizer(Reader)</code>.
   *
   * @param is The <code>InputStream</code> to read from
   *
   * @deprecated Since JDK 1.1.
   */
  public StreamTokenizer(InputStream is)
  {
    this(new InputStreamReader(is));
  }

  /**
   * This constructor initializes a new <code>StreamTokenizer</code> to read
   * characters from a <code>Reader</code> and parse them.  Only character
   * values in the range 0x00 to 0xFF have configurable attributes; every
   * value above 0xFF is always treated as alphabetic.
   * <p>
   * This constructor sets up the parsing table to parse the stream in the
   * following manner:
   * <ul>
   * <li>The values 'A' through 'Z', 'a' through 'z' and 0xA0 through 0xFF
   *     are initialized as alphabetic</li>
   * <li>The values 0x00 through 0x20 are initialized as whitespace</li>
   * <li>The values '\'' and '"' are initialized as quote characters</li>
   * <li>'/' is a comment character</li>
   * <li>Numbers will be parsed</li>
   * <li>EOL is not treated as significant</li>
   * <li>C and C++ (//) comments are not recognized</li>
   * </ul>
   *
   * @param r The <code>Reader</code> to read chars from
   */
  public StreamTokenizer(Reader r)
  {
    in = new PushbackReader(r);

    whitespaceChars(0x00, 0x20);
    wordChars('A', 'Z');
    wordChars('a', 'z');
    wordChars(0xA0, 0xFF);
    commentChar('/');
    quoteChar('\'');
    quoteChar('"');
    parseNumbers();
  }

  /**
   * This method sets the comment attribute on the specified
   * character.  Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the comment attribute for, passed as an int
   */
  public void commentChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        comment[ch] = true;
      }
  }

  /**
   * This method sets a flag that indicates whether or not the end of line
   * sequence is returned as a token (TT_EOL).  This defaults to
   * <code>false</code>.
   *
   * @param flag <code>true</code> if EOL is significant, <code>false</code>
   *             otherwise
   */
  public void eolIsSignificant(boolean flag)
  {
    eolSignificant = flag;
  }

  /**
   * This method returns the current line number.  Note that if the
   * <code>pushBack()</code> method is called, it has no effect on the
   * line number returned by this method.
   *
   * @return The current line number
   */
  public int lineno()
  {
    return lineNumber;
  }

  /**
   * This method sets a flag that indicates whether or not word
   * tokens that are returned should be converted to lower case.
   *
   * @param flag <code>true</code> to convert to lower case,
   *             <code>false</code> otherwise
   */
  public void lowerCaseMode(boolean flag)
  {
    lowerCase = flag;
  }

  /* The attribute tests below accept any int, including TT_EOF (-1). */

  private boolean isWhitespace(int ch)
  {
    return (ch >= 0 && ch <= 255 && whitespace[ch]);
  }

  /* Values above 255 have no table entry and always count as alphabetic. */
  private boolean isAlphabetic(int ch)
  {
    return ((ch > 255) || (ch >= 0 && alphabetic[ch]));
  }

  private boolean isNumeric(int ch)
  {
    return (ch >= 0 && ch <= 255 && numeric[ch]);
  }

  private boolean isQuote(int ch)
  {
    return (ch >= 0 && ch <= 255 && quote[ch]);
  }

  private boolean isComment(int ch)
  {
    return (ch >= 0 && ch <= 255 && comment[ch]);
  }

  /**
   * This method reads the next token from the stream.  It sets the
   * <code>ttype</code> variable to the appropriate token type and
   * returns it.  It also can set <code>sval</code> or <code>nval</code>
   * as described below.  If <code>pushBack()</code> was called after a
   * token had been read, that token type is returned again and nothing is
   * read.  Otherwise the parsing strategy is as follows:
   * <ul>
   * <li>Skip any whitespace characters, counting lines along the way.  If
   * EOL significance is turned on, a line terminator ("\n", "\r" or "\r\n")
   * found while skipping whitespace is returned as TT_EOL.</li>
   * <li>If the C++ comment sequence "//" is encountered, and the parser
   * is configured to handle that sequence, then the remainder of the line
   * is skipped and another token is read exactly as if a character with
   * the comment attribute was encountered.</li>
   * <li>If the C comment sequence "/*" is encountered, and the parser
   * is configured to handle that sequence, then all characters up to and
   * including the comment terminator sequence (or the end of the stream)
   * are discarded and another token is parsed.</li>
   * <li>If a numeric character is encountered, attempt to parse a numeric
   * value.  A leading '-' indicates a number only if followed by
   * another non-'-' numeric character; otherwise '-' is returned as an
   * ordinary token.  The number is terminated by the first non-numeric
   * character, by a '-', or by the second occurrence of '.'.  The token
   * type returned is TT_NUMBER and <code>nval</code> is set to the value
   * parsed, or to zero if the collected characters do not form a
   * number (for example a lone '.').</li>
   * <li>If an alphabetic character is encountered, all subsequent
   * characters are read until the first character that is neither
   * alphabetic nor numeric.  The token type returned is TT_WORD and the
   * value parsed is stored in <code>sval</code>.  If lower case mode is
   * set, the token stored in <code>sval</code> is converted to lower
   * case.</li>
   * <li>If a comment character is encountered, then all remaining
   * characters on the current line are skipped and another token is parsed.
   * The EOL or EOF that ends the comment is not discarded.</li>
   * <li>If a quote character is encountered, then all characters up to the
   * next occurrence of the same quote character, a line terminator, or the
   * end of the stream are parsed into a <code>String</code>.  This
   * <code>String</code> is stored as <code>sval</code>, and is not
   * converted to lower case, even if lower case mode is enabled.  The
   * token type returned is the value of the quote character encountered.
   * The escape sequences \a (bell), \b (backspace), \t (HTAB),
   * \n (linefeed), \v (vertical tab), \f (form feed), \r (carriage return),
   * \" (double quote), \' (single quote), \\ (backslash) and \XXX (octal
   * escape) are converted to the appropriate char values.  Invalid escape
   * sequences are left in untranslated.  Unicode escapes (a backslash
   * followed by 'u' and four hex digits) are not recognized.</li>
   * <li>If all cases above are not met, then the character is an ordinary
   * character that is parsed as a token by itself.  The char encountered
   * is returned as the token type.</li>
   * </ul>
   *
   * @return The token type
   * @exception IOException If an I/O error occurs
   */
  public int nextToken() throws IOException
  {
    if (pushedBack)
      {
        pushedBack = false;
        // A pushBack() before the first token has nothing to replay.
        if (ttype != TT_NONE)
          return ttype;
      }

    sval = null;

    // Every pass either returns a token or skips exactly one comment and
    // starts over.  A loop is used instead of recursion so that input with
    // a very large number of consecutive comments cannot exhaust the stack.
    while (true)
      {
        int ch = in.read();

        // Skip whitespace.  Deal with EOL along the way.
        while (isWhitespace(ch))
          {
            if (ch == '\n' || ch == '\r')
              {
                lineNumber++;

                // Throw away \n if in combination with \r.
                if (ch == '\r')
                  skipLineFeed();
                if (eolSignificant)
                  return (ttype = TT_EOL);
              }
            ch = in.read();
          }

        // "//" and "/*" are recognized regardless of the attributes of '/'.
        if (ch == '/')
          {
            int next = in.read();
            if (next == '/' && slashSlash)
              {
                skipToEndOfLine();
                continue;
              }
            if (next == '*' && slashStar)
              {
                skipBlockComment();
                continue;
              }
            // Not a comment introducer: '/' is handled by its attributes.
            if (next != TT_EOF)
              in.unread(next);
          }

        if (ch == TT_EOF)
          return (ttype = TT_EOF);
        if (isNumeric(ch))
          return readNumber(ch);
        if (isAlphabetic(ch))
          return readWord(ch);
        if (isComment(ch))
          {
            skipToEndOfLine();
            continue;
          }
        if (isQuote(ch))
          return readQuoted(ch);

        // Ordinary character: it is its own token.
        return (ttype = ch);
      }
  }

  /* Consumes the '\n' of a "\r\n" pair; called right after reading '\r'. */
  private void skipLineFeed() throws IOException
  {
    int next = in.read();
    if (next != '\n' && next != TT_EOF)
      in.unread(next);
  }

  /*
   * Discards the rest of the current line.  The line terminator itself is
   * pushed back so that the caller still sees (and counts) it.
   */
  private void skipToEndOfLine() throws IOException
  {
    int ch;
    do
      ch = in.read();
    while (ch != '\n' && ch != '\r' && ch != TT_EOF);

    if (ch != TT_EOF)
      in.unread(ch);
  }

  /*
   * Discards a C style comment whose opening sequence has already been
   * consumed, counting the lines it spans.  Stops after the closing
   * sequence or at the end of the stream.
   */
  private void skipBlockComment() throws IOException
  {
    while (true)
      {
        int ch = in.read();
        if (ch == TT_EOF)
          return;

        if (ch == '*')
          {
            int next = in.read();
            if (next == '/')
              return;
            // Re-examine the char: it may itself be '*' or a line end.
            if (next != TT_EOF)
              in.unread(next);
          }
        else if (ch == '\n' || ch == '\r')
          {
            lineNumber++;
            if (ch == '\r')
              skipLineFeed();
          }
      }
  }

  /*
   * Parses a number token whose first (numeric) character is 'first' and
   * stores its value in nval.  Returns '-' instead if 'first' is a minus
   * sign that does not introduce a number.
   */
  private int readNumber(int first) throws IOException
  {
    int ch = first;
    boolean isNegative = false;
    if (ch == '-')
      {
        // Read ahead to see if this is an ordinary '-' rather than numeric.
        ch = in.read();
        if (!isNumeric(ch) || ch == '-')
          {
            if (ch != TT_EOF)
              in.unread(ch);
            return (ttype = '-');
          }
        isNegative = true;
      }

    StringBuffer tokbuf = new StringBuffer();
    tokbuf.append((char) ch);

    // A leading '.' already counts as the decimal point, so that a second
    // '.' terminates the number (".5.3" is 0.5 followed by 0.3).
    int decCount = (ch == '.') ? 1 : 0;
    while (isNumeric(ch = in.read()) && ch != '-')
      {
        if (ch == '.' && decCount++ > 0)
          break;
        tokbuf.append((char) ch);
      }

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_NUMBER;
    try
      {
        nval = Double.parseDouble(tokbuf.toString());
      }
    catch (NumberFormatException ignored)
      {
        // Deliberate: text without digits (e.g. a lone ".") reads as zero.
        nval = 0.0;
      }
    if (isNegative)
      nval = -nval;
    return ttype;
  }

  /*
   * Parses a word token whose first (alphabetic) character is 'first' and
   * stores it in sval, lower-cased if lower case mode is on.
   */
  private int readWord(int first) throws IOException
  {
    StringBuffer tokbuf = new StringBuffer();
    int ch = first;
    do
      {
        tokbuf.append((char) ch);
        ch = in.read();
      }
    while (isAlphabetic(ch) || isNumeric(ch));

    if (ch != TT_EOF)
      in.unread(ch);

    ttype = TT_WORD;
    sval = tokbuf.toString();
    if (lowerCase)
      sval = sval.toLowerCase();
    return ttype;
  }

  /*
   * Parses a quoted string whose opening quote character 'quoteCh' has
   * already been consumed and stores the contents in sval.
   */
  private int readQuoted(int quoteCh) throws IOException
  {
    ttype = quoteCh;
    StringBuffer tokbuf = new StringBuffer();

    int ch;
    while ((ch = in.read()) != quoteCh && ch != '\n' && ch != '\r'
           && ch != TT_EOF)
      {
        if (ch == '\\')
          ch = readEscape();
        tokbuf.append((char) ch);
      }

    // Throw away the matching quote char, but keep a line terminator in the
    // stream so that it is still seen as whitespace / EOL.
    if (ch != quoteCh && ch != TT_EOF)
      in.unread(ch);

    sval = tokbuf.toString();
    return ttype;
  }

  /*
   * Translates the escape sequence that follows an already consumed
   * backslash inside a quoted string and returns the resulting char value.
   * For an unrecognized escape the backslash itself is returned and the
   * following character is left in the stream, i.e. the sequence is left
   * untranslated.
   */
  private int readEscape() throws IOException
  {
    int ch = in.read();
    switch (ch)
      {
      case 'a':
        return 0x7;
      case 'b':
        return '\b';
      case 'f':
        return 0xC;
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'v':
        return 0xB;
      case '\n':
      case '\r':
      case '\"':
      case '\'':
      case '\\':
        // Escaped line terminators, quotes and backslashes stand for
        // themselves.
        return ch;
      default:
        break;
      }

    if (ch < '0' || ch > '7')
      {
        // Not an escape we know (or end of stream): keep the backslash and
        // let the caller read the following character normally.
        if (ch != TT_EOF)
          in.unread(ch);
        return '\\';
      }

    // Octal escape: up to three digits, the third only if the first digit
    // is at most 3 so that the value fits in 0..0377.
    int value = ch - '0';
    int next = in.read();
    if (next >= '0' && next <= '7')
      {
        value = value * 8 + (next - '0');
        next = in.read();
        if (next >= '0' && next <= '7' && ch <= '3')
          {
            value = value * 8 + (next - '0');
            next = in.read();
          }
      }

    // 'next' is the lookahead that turned out not to be part of the escape.
    if (next != TT_EOF)
      in.unread(next);
    return value;
  }

  /* Clears every attribute of 'ch', which must be in the range 0..255. */
  private void resetChar(int ch)
  {
    whitespace[ch] = false;
    alphabetic[ch] = false;
    numeric[ch] = false;
    quote[ch] = false;
    comment[ch] = false;
  }

  /**
   * This method makes the specified character an ordinary character.  This
   * means that none of the attributes (whitespace, alphabetic, numeric,
   * quote, or comment) will be set on this character.  This character will
   * parse as its own token.  Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to make ordinary, passed as an int
   */
  public void ordinaryChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      resetChar(ch);
  }

  /**
   * This method makes all the characters in the specified range, range
   * terminators included, ordinary.  This means that none of the attributes
   * (whitespace, alphabetic, numeric, quote, or comment) will be set on
   * any of the characters in the range.  This makes each character in this
   * range parse as its own token.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to make ordinary
   * @param hi The high end of the range of values to make ordinary
   */
  public void ordinaryChars(int low, int hi)
  {
    int first = Math.max(low, 0);
    int last = Math.min(hi, 255);
    for (int i = first; i <= last; i++)
      resetChar(i);
  }

  /**
   * This method sets the numeric attribute on the characters '0' - '9' and
   * the characters '.' and '-'.  Other attributes of these characters are
   * left as they are.
   * When this method is used, the result of giving other attributes
   * (whitespace, quote, or comment) to the numeric characters may
   * vary depending on the implementation. For example, if
   * parseNumbers() and then whitespaceChars('1', '1') are called,
   * this implementation reads "121" as 2, while some other implementation
   * will read it as 21.
   */
  public void parseNumbers()
  {
    for (int digit = '0'; digit <= '9'; digit++)
      numeric[digit] = true;

    numeric['.'] = true;
    numeric['-'] = true;
  }

  /**
   * Puts the current token back into the StreamTokenizer so
   * <code>nextToken</code> will return the same value on the next call.
   * May cause the lineno method to return an incorrect value
   * if lineno is called before the next call to nextToken.
   */
  public void pushBack()
  {
    pushedBack = true;
  }

  /**
   * This method sets the quote attribute on the specified character.
   * Other attributes for the character are cleared.
   * Values outside the range 0 to 255 are ignored.
   *
   * @param ch The character to set the quote attribute for, passed as an int.
   */
  public void quoteChar(int ch)
  {
    if (ch >= 0 && ch <= 255)
      {
        resetChar(ch);
        quote[ch] = true;
      }
  }

  /**
   * This method removes all attributes (whitespace, alphabetic, numeric,
   * quote, and comment) from all characters.  It is equivalent to calling
   * <code>ordinaryChars(0x00, 0xFF)</code>.
   *
   * @see #ordinaryChars(int, int)
   */
  public void resetSyntax()
  {
    ordinaryChars(0x00, 0xFF);
  }

  /**
   * This method sets a flag that indicates whether or not "C++" language style
   * comments ("//" comments through EOL ) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C++" style
   *             comments, <code>false</code> otherwise
   */
  public void slashSlashComments(boolean flag)
  {
    slashSlash = flag;
  }

  /**
   * This method sets a flag that indicates whether or not "C" language style
   * comments (with nesting not allowed) are handled by the parser.
   * If this is <code>true</code> commented out sequences are skipped and
   * ignored by the parser.  This defaults to <code>false</code>.
   *
   * @param flag <code>true</code> to recognize and handle "C" style comments,
   *             <code>false</code> otherwise
   */
  public void slashStarComments(boolean flag)
  {
    slashStar = flag;
  }

  /**
   * This method returns the current token value as a <code>String</code> in
   * the form "Token[x], line n", where 'n' is the current line number and
   * 'x' is determined as follows.
   * <p>
   * <ul>
   * <li>If no token has been read, then 'x' is "NOTHING"</li>
   * <li>If <code>ttype</code> is TT_EOF, then 'x' is "EOF"</li>
   * <li>If <code>ttype</code> is TT_EOL, then 'x' is "EOL"</li>
   * <li>If <code>ttype</code> is TT_WORD, then 'x' is <code>sval</code></li>
   * <li>If <code>ttype</code> is TT_NUMBER, then 'x' is "n=strnval" where
   * 'strnval' is <code>String.valueOf(nval)</code>.</li>
   * <li>If <code>ttype</code> is a quote character, then 'x' is
   * <code>sval</code></li>
   * <li>For all other cases, 'x' is the <code>ttype</code> character
   * enclosed in single quotes</li>
   * </ul>
   *
   * @return The description of the current token
   */
  public String toString()
  {
    String tempstr;
    if (ttype == TT_EOF)
      tempstr = "EOF";
    else if (ttype == TT_EOL)
      tempstr = "EOL";
    else if (ttype == TT_WORD)
      tempstr = sval;
    else if (ttype == TT_NUMBER)
      tempstr = "n=" + nval;
    else if (ttype == TT_NONE)
      tempstr = "NOTHING";
    else if (isQuote(ttype))
      tempstr = sval;           // quoted string: show its contents
    else // must be an ordinary char.
      tempstr = "\'" + (char) ttype + "\'";

    return "Token[" + tempstr + "], line " + lineno();
  }

  /**
   * This method sets the whitespace attribute for all characters in the
   * specified range, range terminators included.  All other attributes of
   * those characters are cleared.  The range is clipped to 0 through 255.
   *
   * @param low The low end of the range of values to set the whitespace
   * attribute for
   * @param hi The high end of the range of values to set the whitespace
   * attribute for
   */
  public void whitespaceChars(int low, int hi)
  {
    int first = Math.max(low, 0);
    int last = Math.min(hi, 255);
    for (int i = first; i <= last; i++)
      {
        resetChar(i);
        whitespace[i] = true;
      }
  }

  /**
   * This method sets the alphabetic attribute for all characters in the
   * specified range, range terminators included.  Other attributes of
   * those characters are left as they are.  The range is clipped to 0
   * through 255.
   *
   * @param low The low end of the range of values to set the alphabetic
   * attribute for
   * @param hi The high end of the range of values to set the alphabetic
   * attribute for
   */
  public void wordChars(int low, int hi)
  {
    int first = Math.max(low, 0);
    int last = Math.min(hi, 255);
    for (int i = first; i <= last; i++)
      alphabetic[i] = true;
  }
}