View Javadoc

1   package org.codehaus.groovy.syntax.lexer;
2   
3   import org.codehaus.groovy.syntax.ReadException;
4   import org.codehaus.groovy.syntax.Numbers;
5   import org.codehaus.groovy.syntax.Types;
6   import org.codehaus.groovy.syntax.Token;
7   
8   /***
9    *  The core code used in lexing Groovy.
10   *
11   *  @author Bob Mcwhirter
12   *  @author James Strachan
13   *  @author John Wilson
14   *  @author Chris Poirier
15   */
16  
17  public class GroovyLexerBase extends LexerBase
18  {
19  
20      protected StringLexer  stringLexer  = new StringLexer();   // reusable delegate for quoted strings: nextToken() resets it and delegates to it on ' (GStrings disallowed) or " (GStrings allowed)
21      protected GStringLexer gstringLexer = new GStringLexer();  // reusable delegate for GStrings: nextToken() resets it and installs it beneath stringLexer (for ") or beneath a new HereDocLexer (for <<<)
22  
23  
24     /***
25      *  Finds and returns (and consumes) the next token from the underlying stream.
26      *  Returns null when out of tokens.
27      */
28  
29      public Token nextToken() throws ReadException, LexerException
30      {
31          // System.out.println( "entering GroovyLexerBase.nextToken() on " + this );
32  
33          Token token = null;
34          OUTER_LOOP : while (token == null)
35          {
36  
37              //
38              // Get from the delegate, if available
39  
40              if( delegate != null )
41              {
42                  token = delegate.nextToken();
43  
44                  if( token == null )
45                  {
46                      undelegate();
47                  }
48                  else
49                  {
50                      break OUTER_LOOP;
51                  }
52              }
53  
54  
55              //
56              // Otherwise, do it the hard way.
57  
58              char c = la();
59  
60              ROOT_SWITCH : switch (c)
61              {
62                  case (CharStream.EOS) :
63                  {
64                      break OUTER_LOOP;
65                  }
66                  case (' ') :
67                  case ('\t') :
68                  {
69                      consume();
70                      token = null;
71                      break ROOT_SWITCH;
72                  }
73                  case ('\r') :
74                  case ('\n') :
75                  {
76                      mark();
77                      token = tokenizeEOL();
78                      break ROOT_SWITCH;
79                  }
80                  case ('{') :
81                  {
82                      mark();
83                      consume();
84                      token = symbol( Types.LEFT_CURLY_BRACE );
85                      break ROOT_SWITCH;
86                  }
87                  case ('}') :
88                  {
89                      mark();
90                      consume();
91                      token = symbol( Types.RIGHT_CURLY_BRACE );
92                      break ROOT_SWITCH;
93                  }
94                  case ('[') :
95                  {
96                      mark();
97                      consume();
98                      token = symbol( Types.LEFT_SQUARE_BRACKET );
99                      break ROOT_SWITCH;
100                 }
101                 case (']') :
102                 {
103                     mark();
104                     consume();
105                     token = symbol( Types.RIGHT_SQUARE_BRACKET );
106                     break ROOT_SWITCH;
107                 }
108                 case ('(') :
109                 {
110                     mark();
111                     consume();
112                     token = symbol( Types.LEFT_PARENTHESIS );
113                     break ROOT_SWITCH;
114                 }
115                 case (')') :
116                 {
117                     mark();
118                     consume();
119                     token = symbol( Types.RIGHT_PARENTHESIS );
120                     break ROOT_SWITCH;
121                 }
122                 case ('#') :
123                 {
124                     consume();
125 
126                     token = symbol( Types.NEWLINE, -1 );
127 
128                     CONSUME_LOOP : while( true )
129                     {
130                         switch (c = la())
131                         {
132                             case ('\r') :
133                             case ('\n') :
134                             {
135                                 readEOL();
136                                 break CONSUME_LOOP;
137                             }
138                             case CharStream.EOS :
139                             {
140                                 break CONSUME_LOOP;
141                             }
142                             default :
143                             {
144                                 consume();
145                             }
146                         }
147                     }
148                     break ROOT_SWITCH;
149                 }
150                 case ('/') :
151                 {
152                     mark();
153                     consume();
154 
155                     c = la();
156 
157                     MULTICHAR_SWITCH : switch (c)
158                     {
159                         case ('=') :
160                         {
161                             consume();
162                             token = symbol( Types.DIVIDE_EQUAL );
163                             break MULTICHAR_SWITCH;
164                         }
165                         case ('/') :
166                         {
167                             consume();
168                             token = symbol( Types.NEWLINE, -2 );
169 
170                             CONSUME_LOOP : while (true)
171                             {
172                                 switch (c = la())
173                                 {
174                                     case ('\r') :
175                                     case ('\n') :
176                                     {
177                                         readEOL();
178                                         break CONSUME_LOOP;
179                                     }
180                                     case CharStream.EOS :
181                                     {
182                                         break CONSUME_LOOP;
183                                     }
184                                     default :
185                                     {
186                                         consume();
187                                     }
188                                 }
189                             }
190                             break MULTICHAR_SWITCH;
191                         }
192                         case ('*') :
193                         {
194                             CONSUME_LOOP : while (true)
195                             {
196                                 CONSUME_SWITCH : switch (c = la())
197                                 {
198                                     case ('*') :
199                                     {
200                                         consume();
201                                         if (la() == '/')
202                                         {
203                                             consume();
204                                             break CONSUME_LOOP;
205                                         }
206                                         break CONSUME_SWITCH;
207                                     }
208                                     case ('\r') :
209                                     case ('\n') :
210                                     {
211                                         readEOL();
212                                         break CONSUME_SWITCH;
213                                     }
214                                     case CharStream.EOS :
215                                     {
216                                         break CONSUME_LOOP;
217                                     }
218                                     default :
219                                     {
220                                         consume();
221                                     }
222                                 }
223                             }
224                             token = null;
225                             break MULTICHAR_SWITCH;
226                         }
227                         default :
228                         {
229                             token = symbol( Types.DIVIDE );
230                             break MULTICHAR_SWITCH;
231                         }
232                     }
233                     break ROOT_SWITCH;
234                 }
235                 case ('%') :
236                 {
237                     mark();
238                     consume();
239 
240                     c = la();
241 
242                     MULTICHAR_SWITCH : switch (c)
243                     {
244                         case ('=') :
245                         {
246                             consume();
247                             token = symbol( Types.MOD_EQUAL );
248                             break MULTICHAR_SWITCH;
249                         }
250                         default :
251                         {
252                             token = symbol( Types.MOD );
253                             break MULTICHAR_SWITCH;
254                         }
255                     }
256                     break ROOT_SWITCH;
257                 }
258                 case ('//') :
259                 {
260                     mark();
261                     consume();
262 
263                     c = la();
264 
265                     MULTICHAR_SWITCH : switch (c)
266                     {
267                         case ('=') :
268                         {
269                             consume();
270                             token = symbol( Types.INTDIV_EQUAL );
271                             break MULTICHAR_SWITCH;
272                         }
273                         default :
274                         {
275                             token = symbol( Types.INTDIV );
276                             break MULTICHAR_SWITCH;
277                         }
278                     }
279                     break ROOT_SWITCH;
280                 }
281                 case ('~') :
282                 {
283                     mark();
284                     consume();
285 
286                     token = symbol( Types.REGEX_PATTERN );
287                     break ROOT_SWITCH;
288                 }
289                 case ('!') :
290                 {
291                     mark();
292                     consume();
293 
294                     c = la();
295 
296                     MULTICHAR_SWITCH : switch (c)
297                     {
298                         case ('=') :
299                         {
300                             consume();
301                             if( la() == '=' )
302                             {
303                                 consume();
304                                 token = symbol( Types.COMPARE_NOT_IDENTICAL );
305                             }
306                             else
307                             {
308                                 token = symbol( Types.COMPARE_NOT_EQUAL );
309                             }
310                             break MULTICHAR_SWITCH;
311                         }
312                         default :
313                         {
314                             token = symbol( Types.NOT );
315                             break MULTICHAR_SWITCH;
316                         }
317                     }
318                     break ROOT_SWITCH;
319                 }
320                 case ('=') :
321                 {
322                     mark();
323                     consume();
324 
325                     c = la();
326 
327                     MULTICHAR_SWITCH : switch (c)
328                     {
329                         case ('=') :
330                         {
331                             consume();
332                             c = la();
333 
334                             switch (c)
335                             {
336                                 case '=' :
337                                 {
338                                     consume();
339                                     token = symbol( Types.COMPARE_IDENTICAL );
340                                     break;
341                                 }
342                                 case '~' :
343                                 {
344                                     consume();
345                                     token = symbol( Types.MATCH_REGEX );
346                                     break;
347                                 }
348                                 default :
349                                 {
350                                     token = symbol( Types.COMPARE_EQUAL );
351                                 }
352                             }
353                             break MULTICHAR_SWITCH;
354                         }
355                         case '~' :
356                         {
357                             consume();
358                             token = symbol( Types.FIND_REGEX );
359                             break MULTICHAR_SWITCH;
360                         }
361                         default :
362                         {
363                             token = symbol( Types.EQUAL );
364                             break MULTICHAR_SWITCH;
365                         }
366                     }
367                     break ROOT_SWITCH;
368                 }
369                 case ('&') :
370                 {
371                     mark();
372                     consume();
373 
374                     c = la();
375 
376                     MULTICHAR_SWITCH : switch (c)
377                     {
378                         case ('&') :
379                         {
380                             consume();
381 
382                             if( la() == '=' )
383                             {
384                                 consume();
385                                 token = symbol( Types.LOGICAL_AND_EQUAL );
386                             }
387                             else
388                             {
389                                 token = symbol( Types.LOGICAL_AND );
390                             }
391 
392                             break MULTICHAR_SWITCH;
393                         }
394                         case ('=') :
395                         {
396                             consume();
397                             token = symbol( Types.BITWISE_AND_EQUAL );
398                             break MULTICHAR_SWITCH;
399                         }
400                         default :
401                         {
402                             token = symbol( Types.BITWISE_AND );
403                             break MULTICHAR_SWITCH;
404                         }
405                     }
406                     break ROOT_SWITCH;
407                 }
408                 case ('|') :
409                 {
410                     mark();
411                     consume();
412                     c = la();
413 
414                     MULTICHAR_SWITCH : switch (c)
415                     {
416                         case ('|') :
417                         {
418                             consume();
419 
420                             if( la() == '=' )
421                             {
422                                 consume();
423                                 token = symbol( Types.LOGICAL_OR_EQUAL );
424                             }
425                             else
426                             {
427                                 token = symbol( Types.LOGICAL_OR );
428                             }
429 
430                             break MULTICHAR_SWITCH;
431                         }
432                         case ('=') :
433                         {
434                             consume();
435 
436 			    token = symbol( Types.BITWISE_OR_EQUAL );
437                             break MULTICHAR_SWITCH;
438                         }
439                         default :
440                         {
441                             token = symbol( Types.PIPE );
442                             break MULTICHAR_SWITCH;
443                         }
444                     }
445                     break ROOT_SWITCH;
446                 }
447                 case ('+') :
448                 {
449                     mark();
450                     consume();
451 
452                     c = la();
453 
454                     MULTICHAR_SWITCH : switch (c)
455                     {
456                         case ('+') :
457                         {
458                             consume();
459                             token = symbol( Types.PLUS_PLUS );
460                             break MULTICHAR_SWITCH;
461                         }
462                         case ('=') :
463                         {
464                             consume();
465                             token = symbol( Types.PLUS_EQUAL );
466                             break MULTICHAR_SWITCH;
467                         }
468                         default :
469                         {
470                             token = symbol( Types.PLUS );
471                             break MULTICHAR_SWITCH;
472                         }
473                     }
474                     break ROOT_SWITCH;
475                 }
476                 case ('-') :
477                 {
478                     mark();
479                     consume();
480 
481                     c = la();
482 
483                     MULTICHAR_SWITCH : switch (c)
484                     {
485                         case ('-') :
486                         {
487                             consume();
488                             token = symbol( Types.MINUS_MINUS );
489                             break MULTICHAR_SWITCH;
490                         }
491                         case ('=') :
492                         {
493                             consume();
494                             token = symbol( Types.MINUS_EQUAL );
495                             break MULTICHAR_SWITCH;
496                         }
497                         case ('>') :
498                         {
499                             consume();
500                             token = symbol( Types.NAVIGATE );
501                             break MULTICHAR_SWITCH;
502                         }
503                         default :
504                         {
505                             token = symbol( Types.MINUS );
506                             break MULTICHAR_SWITCH;
507                         }
508                     }
509                     break ROOT_SWITCH;
510                 }
511                 case ('*') :
512                 {
513                     mark();
514                     consume();
515 
516                     c = la();
517 
518                     MULTICHAR_SWITCH : switch (c)
519                     {
520                         case ('=') :
521                         {
522                             consume();
523                             token = symbol( Types.MULTIPLY_EQUAL );
524                             break MULTICHAR_SWITCH;
525                         }
526                         case ('*') :
527                         {
528                             consume();
529                             c = la();
530                             if( c == '=' )
531                             {
532                                 consume();
533                                 token = symbol( Types.POWER_EQUAL );
534                             } 
535                             else
536                             {	
537 			        token = symbol( Types.POWER );
538                             }
539                             break MULTICHAR_SWITCH;
540                         }
541                         default :
542                         {
543                             token = symbol( Types.MULTIPLY );
544                             break MULTICHAR_SWITCH;
545                         }
546                     }
547                     break ROOT_SWITCH;
548                 }
549                 case ('^') :
550                 {
551                     mark();
552                     consume();
553 
554                     c = la();
555 
556                     MULTICHAR_SWITCH : switch (c)
557                     {
558                         case ('=') :
559                         {
560                             consume();
561                             token = symbol( Types.BITWISE_XOR_EQUAL );
562                             break MULTICHAR_SWITCH;
563                         }
564                         default :
565                         {
566                             token = symbol( Types.BITWISE_XOR );
567                             break MULTICHAR_SWITCH;
568                         }
569                     }
570                     break ROOT_SWITCH;
571                 }
572                 case (':') :
573                 {
574                     mark();
575                     consume();
576 
577                     token = symbol( Types.COLON );
578                     break ROOT_SWITCH;
579                 }
580                 case (',') :
581                 {
582                     mark();
583                     consume();
584                     token = symbol( Types.COMMA );
585                     break ROOT_SWITCH;
586                 }
587                 case (';') :
588                 {
589                     mark();
590                     consume();
591                     token = symbol( Types.SEMICOLON );
592                     break ROOT_SWITCH;
593                 }
594                 case ('?') :
595                 {
596                     mark();
597                     consume();
598                     token = symbol( Types.QUESTION );
599                     break ROOT_SWITCH;
600                 }
601                 case ('<') :
602                 {
603                     mark();
604                     consume();
605 
606                     c = la();
607 
608                     MULTICHAR_SWITCH : switch (c)
609                     {
610                         case ('=') :
611                         {
612                             consume();
613                             c = la();
614                             if (c == '>')
615                             {
616                                 consume();
617                                 token = symbol( Types.COMPARE_TO );
618                             }
619                             else
620                             {
621                                 token = symbol( Types.COMPARE_LESS_THAN_EQUAL );
622                             }
623                             break MULTICHAR_SWITCH;
624                         }
625                         case ('<') :
626                         {
627                             consume();
628                             c = la();
629 
630                             //
631                             // It's a "here-doc", created using <<<TOK ... \nTOK.   The terminator
632                             // runs from the <<< to the end of the line.  The marker is then used
633                             // to create a HereDocLexer which becomes our delegate until the heredoc
634                             // is finished.
635 
636                             if (c == '<')
637                             {
638                                 consume();
639 
640                                 StringBuffer marker = new StringBuffer();
641                                 while( (c = la()) != '\n' && c != '\r' && c != CharStream.EOS )
642                                 {
643                                     marker.append( consume() );
644                                 }
645 
646                                 readEOL();
647 
648                                 Lexer child = new HereDocLexer( marker.toString() );
649                                 delegate( child );
650 
651                                 gstringLexer.reset();
652                                 child.delegate( gstringLexer );
653 
654                                 break ROOT_SWITCH;
655                             }
656                             else if (c == '=')
657                             {
658                                 consume();
659                                 token = symbol( Types.LEFT_SHIFT_EQUAL );
660                                 break MULTICHAR_SWITCH;
661                             }
662                             else
663                             {
664                                 token = symbol( Types.LEFT_SHIFT );
665                                 break ROOT_SWITCH;
666                             }
667                         }
668                         default :
669                         {
670                             token = symbol( Types.COMPARE_LESS_THAN );
671                             break MULTICHAR_SWITCH;
672                         }
673                     }
674                     break ROOT_SWITCH;
675                 }
676                 case ('>') :
677                 {
678                     mark();
679                     consume();
680 
681                     c = la();
682 
683                     MULTICHAR_SWITCH : switch (c)
684                     {
685                         case ('=') :
686                         {
687                             consume();
688                             token = symbol( Types.COMPARE_GREATER_THAN_EQUAL );
689                             break MULTICHAR_SWITCH;
690                         }
691                         case ('>') :
692                         {
693                             consume();
694                             c = la();
695                             if( c == '>' )
696                             {
697                                 consume();
698                                 c = la();
699                                 if (c == '=')
700                                 {
701                                     consume();
702                                     token = symbol( Types.RIGHT_SHIFT_UNSIGNED_EQUAL );
703                                 }
704 				else
705                                 {
706                                     token = symbol( Types.RIGHT_SHIFT_UNSIGNED );
707                                 }
708                             } 
709                             else if (c == '=')
710                             {
711                                 consume();
712                                 token = symbol( Types.RIGHT_SHIFT_EQUAL );
713                             }
714                             else
715                             {	
716                             	token = symbol( Types.RIGHT_SHIFT );
717                             }
718                             break MULTICHAR_SWITCH;
719                         }
720                         default :
721                         {
722                             token = symbol( Types.COMPARE_GREATER_THAN );
723                             break MULTICHAR_SWITCH;
724                         }
725                     }
726                     break ROOT_SWITCH;
727                 }
728                 case ('\'') :
729                 {
730                     mark();
731 
732                     stringLexer.reset();
733                     stringLexer.allowGStrings(false);
734                     delegate( stringLexer );
735 
736                     break ROOT_SWITCH;
737                 }
738                 case ('"') :
739                 {
740                     mark();
741 
742                     stringLexer.reset();
743                     stringLexer.allowGStrings(true);
744                     delegate( stringLexer );
745 
746                     gstringLexer.reset();
747                     stringLexer.delegate( gstringLexer );
748 
749                     break ROOT_SWITCH;
750                 }
751                 case ('0') :
752                 case ('1') :
753                 case ('2') :
754                 case ('3') :
755                 case ('4') :
756                 case ('5') :
757                 case ('6') :
758                 case ('7') :
759                 case ('8') :
760                 case ('9') :
761                 case ('.') :
762                 {
763                     mark();
764 
765                     //
766                     // If it is a '.' and not followed by a digit,
767                     // it's an operator.
768 
769                     if( c == '.' && !Numbers.isDigit(la(2)) )
770                     {
771                         consume();
772                         if( la() == '.' )
773                         {
774                             consume();
775                             if( la() == '.' )
776                             {
777                                 consume();
778                                 token = symbol( Types.DOT_DOT_DOT );
779                             }
780                             else
781                             {
782                                 token = symbol( Types.DOT_DOT );
783                             }
784                         }
785                         else
786                         {
787                             token = symbol( Types.DOT );
788                         }
789                         break ROOT_SWITCH;
790                     }
791 
792 
793                     //
794                     // Otherwise, we are processing a number (integer or decimal).
795 
796                     StringBuffer numericLiteral = new StringBuffer();
797                     boolean      isDecimal      = false;
798 
799 
800                     //
801                     // If it starts 0 and isn't a decimal number, we give
802                     // special handling for hexadecimal or octal notation.
803 
804                     char c2 = la(2);
805                     if( c == '0' && (c2 == 'X' || c2 == 'x' || Numbers.isDigit(c2)) )
806                     {
807                         numericLiteral.append( consume() );
808 
809                         if( (c = la()) == 'X' || c == 'x' )
810                         {
811                             numericLiteral.append( consume() );
812                             if( Numbers.isHexDigit(la()) )
813                             {
814                                 while( Numbers.isHexDigit(la()) )
815                                 {
816                                     numericLiteral.append( consume() );
817                                 }
818                             }
819                             else
820                             {
821                                 unexpected( la(), numericLiteral.length(), "expected hexadecimal digit" );
822                             }
823                         }
824                         else
825                         {
826                             while( Numbers.isOctalDigit(la()) )
827                             {
828                                 numericLiteral.append( consume() );
829                             }
830 
831                             if( Numbers.isDigit(la()) )
832                             {
833                                 unexpected( la(), numericLiteral.length(), "expected octal digit" );
834                             }
835                         }
836                     }
837 
838 
839                     //
840                     // Otherwise, it's in base 10, integer or decimal.
841 
842                     else
843                     {
844                         while( Numbers.isDigit(la()) )
845                         {
846                             numericLiteral.append( consume() );
847                         }
848 
849 
850                         //
851                         // Next, check for a decimal point
852 
853                         if( la() == '.' && Numbers.isDigit(la(2)) )
854                         {
855                             isDecimal = true;
856 
857                             numericLiteral.append( consume() );
858                             while( Numbers.isDigit(la()) )
859                             {
860                                 numericLiteral.append( consume() );
861                             }
862 
863                             //
864                             // Check for an exponent
865 
866                             if( (c = la()) == 'e' || c == 'E' )
867                             {
868                                 numericLiteral.append( consume() );
869 
870                                 if (la() == '+' || la() == '-')
871                                 {
872                                     numericLiteral.append(consume());
873                                 }
874 
875                                 if( Numbers.isDigit(la()) )
876                                 {
877                                     while( Numbers.isDigit(la()) )
878                                     {
879                                         numericLiteral.append( consume() );
880                                     }
881                                 }
882                                 else
883                                 {
884                                     unexpected( la(), numericLiteral.length(), "expected exponent" );
885                                 }
886                             }
887                         }
888                     }
889 
890 
891                     //
892                     // If there is a type suffix, include it.
893 
894                     if( Numbers.isNumericTypeSpecifier(la(), isDecimal) )
895                     {
896                         numericLiteral.append( consume() );
897                     }
898 
899 
900                     //
901                     // For good error reporting, make sure there is nothing invalid next.
902 
903                     if( Character.isJavaIdentifierPart(c = la()) )
904                     {
905                         unexpected( c, numericLiteral.length(), "expected end of numeric literal" );
906                     }
907 
908 
909                     //
910                     // Finally, create the token.
911 
912                     if( isDecimal )
913                     {
914                         token = Token.newDecimal( numericLiteral.toString(), getStartLine(), getStartColumn() );
915                     }
916                     else
917                     {
918                         token = Token.newInteger( numericLiteral.toString(), getStartLine(), getStartColumn() );
919                     }
920 
921                     break ROOT_SWITCH;
922                 }
923                 default :
924                 {
925                     mark();
926                     if (Character.isJavaIdentifierStart(c))
927                     {
928                         StringBuffer identifier = new StringBuffer();
929 
930                         IDENTIFIER_LOOP : while (true)
931                         {
932                             c = la();
933 
934                             if (Character.isJavaIdentifierPart(c))
935                             {
936                                 identifier.append(consume());
937                             }
938                             else
939                             {
940                                 break IDENTIFIER_LOOP;
941                             }
942                         }
943 
944                         String text = identifier.toString();
945                         token = Token.newKeyword( text, getStartLine(), getStartColumn() );
946 
947                         if (token == null)
948                         {
949                             token = Token.newIdentifier( text, getStartLine(), getStartColumn() );
950                         }
951                     }
952                     else
953                     {
954                         unexpected( c, 1 );
955                     }
956 
957                     break ROOT_SWITCH;
958                 }
959             }
960         }
961 
962         // System.out.println( "" + this + ".nextToken() returning [" + token + "]" );
963 
964         return token;
965     }
966 
967 }