1 package org.codehaus.groovy.syntax.lexer;
2
3 import org.codehaus.groovy.syntax.ReadException;
4 import org.codehaus.groovy.syntax.Numbers;
5 import org.codehaus.groovy.syntax.Types;
6 import org.codehaus.groovy.syntax.Token;
7
8 /***
9 * The core code used in lexing Groovy.
10 *
11 * @author Bob Mcwhirter
12 * @author James Strachan
13 * @author John Wilson
14 * @author Chris Poirier
15 */
16
17 public class GroovyLexerBase extends LexerBase
18 {
19
20 protected StringLexer stringLexer = new StringLexer();
21 protected GStringLexer gstringLexer = new GStringLexer();
22
23
24 /***
25 * Finds and returns (and consumes) the next token from the underlying stream.
26 * Returns null when out of tokens.
27 */
28
29 public Token nextToken() throws ReadException, LexerException
30 {
31
32
33 Token token = null;
34 OUTER_LOOP : while (token == null)
35 {
36
37
38
39
40 if( delegate != null )
41 {
42 token = delegate.nextToken();
43
44 if( token == null )
45 {
46 undelegate();
47 }
48 else
49 {
50 break OUTER_LOOP;
51 }
52 }
53
54
55
56
57
58 char c = la();
59
60 ROOT_SWITCH : switch (c)
61 {
62 case (CharStream.EOS) :
63 {
64 break OUTER_LOOP;
65 }
66 case (' ') :
67 case ('\t') :
68 {
69 consume();
70 token = null;
71 break ROOT_SWITCH;
72 }
73 case ('\r') :
74 case ('\n') :
75 {
76 mark();
77 token = tokenizeEOL();
78 break ROOT_SWITCH;
79 }
80 case ('{') :
81 {
82 mark();
83 consume();
84 token = symbol( Types.LEFT_CURLY_BRACE );
85 break ROOT_SWITCH;
86 }
87 case ('}') :
88 {
89 mark();
90 consume();
91 token = symbol( Types.RIGHT_CURLY_BRACE );
92 break ROOT_SWITCH;
93 }
94 case ('[') :
95 {
96 mark();
97 consume();
98 token = symbol( Types.LEFT_SQUARE_BRACKET );
99 break ROOT_SWITCH;
100 }
101 case (']') :
102 {
103 mark();
104 consume();
105 token = symbol( Types.RIGHT_SQUARE_BRACKET );
106 break ROOT_SWITCH;
107 }
108 case ('(') :
109 {
110 mark();
111 consume();
112 token = symbol( Types.LEFT_PARENTHESIS );
113 break ROOT_SWITCH;
114 }
115 case (')') :
116 {
117 mark();
118 consume();
119 token = symbol( Types.RIGHT_PARENTHESIS );
120 break ROOT_SWITCH;
121 }
122 case ('#') :
123 {
124 consume();
125
126 token = symbol( Types.NEWLINE, -1 );
127
128 CONSUME_LOOP : while( true )
129 {
130 switch (c = la())
131 {
132 case ('\r') :
133 case ('\n') :
134 {
135 readEOL();
136 break CONSUME_LOOP;
137 }
138 case CharStream.EOS :
139 {
140 break CONSUME_LOOP;
141 }
142 default :
143 {
144 consume();
145 }
146 }
147 }
148 break ROOT_SWITCH;
149 }
150 case ('/') :
151 {
152 mark();
153 consume();
154
155 c = la();
156
157 MULTICHAR_SWITCH : switch (c)
158 {
159 case ('=') :
160 {
161 consume();
162 token = symbol( Types.DIVIDE_EQUAL );
163 break MULTICHAR_SWITCH;
164 }
165 case ('/') :
166 {
167 consume();
168 token = symbol( Types.NEWLINE, -2 );
169
170 CONSUME_LOOP : while (true)
171 {
172 switch (c = la())
173 {
174 case ('\r') :
175 case ('\n') :
176 {
177 readEOL();
178 break CONSUME_LOOP;
179 }
180 case CharStream.EOS :
181 {
182 break CONSUME_LOOP;
183 }
184 default :
185 {
186 consume();
187 }
188 }
189 }
190 break MULTICHAR_SWITCH;
191 }
192 case ('*') :
193 {
194 CONSUME_LOOP : while (true)
195 {
196 CONSUME_SWITCH : switch (c = la())
197 {
198 case ('*') :
199 {
200 consume();
201 if (la() == '/')
202 {
203 consume();
204 break CONSUME_LOOP;
205 }
206 break CONSUME_SWITCH;
207 }
208 case ('\r') :
209 case ('\n') :
210 {
211 readEOL();
212 break CONSUME_SWITCH;
213 }
214 case CharStream.EOS :
215 {
216 break CONSUME_LOOP;
217 }
218 default :
219 {
220 consume();
221 }
222 }
223 }
224 token = null;
225 break MULTICHAR_SWITCH;
226 }
227 default :
228 {
229 token = symbol( Types.DIVIDE );
230 break MULTICHAR_SWITCH;
231 }
232 }
233 break ROOT_SWITCH;
234 }
235 case ('%') :
236 {
237 mark();
238 consume();
239
240 c = la();
241
242 MULTICHAR_SWITCH : switch (c)
243 {
244 case ('=') :
245 {
246 consume();
247 token = symbol( Types.MOD_EQUAL );
248 break MULTICHAR_SWITCH;
249 }
250 default :
251 {
252 token = symbol( Types.MOD );
253 break MULTICHAR_SWITCH;
254 }
255 }
256 break ROOT_SWITCH;
257 }
258 case ('//') :
259 {
260 mark();
261 consume();
262
263 c = la();
264
265 MULTICHAR_SWITCH : switch (c)
266 {
267 case ('=') :
268 {
269 consume();
270 token = symbol( Types.INTDIV_EQUAL );
271 break MULTICHAR_SWITCH;
272 }
273 default :
274 {
275 token = symbol( Types.INTDIV );
276 break MULTICHAR_SWITCH;
277 }
278 }
279 break ROOT_SWITCH;
280 }
281 case ('~') :
282 {
283 mark();
284 consume();
285
286 token = symbol( Types.REGEX_PATTERN );
287 break ROOT_SWITCH;
288 }
289 case ('!') :
290 {
291 mark();
292 consume();
293
294 c = la();
295
296 MULTICHAR_SWITCH : switch (c)
297 {
298 case ('=') :
299 {
300 consume();
301 if( la() == '=' )
302 {
303 consume();
304 token = symbol( Types.COMPARE_NOT_IDENTICAL );
305 }
306 else
307 {
308 token = symbol( Types.COMPARE_NOT_EQUAL );
309 }
310 break MULTICHAR_SWITCH;
311 }
312 default :
313 {
314 token = symbol( Types.NOT );
315 break MULTICHAR_SWITCH;
316 }
317 }
318 break ROOT_SWITCH;
319 }
320 case ('=') :
321 {
322 mark();
323 consume();
324
325 c = la();
326
327 MULTICHAR_SWITCH : switch (c)
328 {
329 case ('=') :
330 {
331 consume();
332 c = la();
333
334 switch (c)
335 {
336 case '=' :
337 {
338 consume();
339 token = symbol( Types.COMPARE_IDENTICAL );
340 break;
341 }
342 case '~' :
343 {
344 consume();
345 token = symbol( Types.MATCH_REGEX );
346 break;
347 }
348 default :
349 {
350 token = symbol( Types.COMPARE_EQUAL );
351 }
352 }
353 break MULTICHAR_SWITCH;
354 }
355 case '~' :
356 {
357 consume();
358 token = symbol( Types.FIND_REGEX );
359 break MULTICHAR_SWITCH;
360 }
361 default :
362 {
363 token = symbol( Types.EQUAL );
364 break MULTICHAR_SWITCH;
365 }
366 }
367 break ROOT_SWITCH;
368 }
369 case ('&') :
370 {
371 mark();
372 consume();
373
374 c = la();
375
376 MULTICHAR_SWITCH : switch (c)
377 {
378 case ('&') :
379 {
380 consume();
381
382 if( la() == '=' )
383 {
384 consume();
385 token = symbol( Types.LOGICAL_AND_EQUAL );
386 }
387 else
388 {
389 token = symbol( Types.LOGICAL_AND );
390 }
391
392 break MULTICHAR_SWITCH;
393 }
394 case ('=') :
395 {
396 consume();
397 token = symbol( Types.BITWISE_AND_EQUAL );
398 break MULTICHAR_SWITCH;
399 }
400 default :
401 {
402 token = symbol( Types.BITWISE_AND );
403 break MULTICHAR_SWITCH;
404 }
405 }
406 break ROOT_SWITCH;
407 }
408 case ('|') :
409 {
410 mark();
411 consume();
412 c = la();
413
414 MULTICHAR_SWITCH : switch (c)
415 {
416 case ('|') :
417 {
418 consume();
419
420 if( la() == '=' )
421 {
422 consume();
423 token = symbol( Types.LOGICAL_OR_EQUAL );
424 }
425 else
426 {
427 token = symbol( Types.LOGICAL_OR );
428 }
429
430 break MULTICHAR_SWITCH;
431 }
432 case ('=') :
433 {
434 consume();
435
436 token = symbol( Types.BITWISE_OR_EQUAL );
437 break MULTICHAR_SWITCH;
438 }
439 default :
440 {
441 token = symbol( Types.PIPE );
442 break MULTICHAR_SWITCH;
443 }
444 }
445 break ROOT_SWITCH;
446 }
447 case ('+') :
448 {
449 mark();
450 consume();
451
452 c = la();
453
454 MULTICHAR_SWITCH : switch (c)
455 {
456 case ('+') :
457 {
458 consume();
459 token = symbol( Types.PLUS_PLUS );
460 break MULTICHAR_SWITCH;
461 }
462 case ('=') :
463 {
464 consume();
465 token = symbol( Types.PLUS_EQUAL );
466 break MULTICHAR_SWITCH;
467 }
468 default :
469 {
470 token = symbol( Types.PLUS );
471 break MULTICHAR_SWITCH;
472 }
473 }
474 break ROOT_SWITCH;
475 }
476 case ('-') :
477 {
478 mark();
479 consume();
480
481 c = la();
482
483 MULTICHAR_SWITCH : switch (c)
484 {
485 case ('-') :
486 {
487 consume();
488 token = symbol( Types.MINUS_MINUS );
489 break MULTICHAR_SWITCH;
490 }
491 case ('=') :
492 {
493 consume();
494 token = symbol( Types.MINUS_EQUAL );
495 break MULTICHAR_SWITCH;
496 }
497 case ('>') :
498 {
499 consume();
500 token = symbol( Types.NAVIGATE );
501 break MULTICHAR_SWITCH;
502 }
503 default :
504 {
505 token = symbol( Types.MINUS );
506 break MULTICHAR_SWITCH;
507 }
508 }
509 break ROOT_SWITCH;
510 }
511 case ('*') :
512 {
513 mark();
514 consume();
515
516 c = la();
517
518 MULTICHAR_SWITCH : switch (c)
519 {
520 case ('=') :
521 {
522 consume();
523 token = symbol( Types.MULTIPLY_EQUAL );
524 break MULTICHAR_SWITCH;
525 }
526 case ('*') :
527 {
528 consume();
529 c = la();
530 if( c == '=' )
531 {
532 consume();
533 token = symbol( Types.POWER_EQUAL );
534 }
535 else
536 {
537 token = symbol( Types.POWER );
538 }
539 break MULTICHAR_SWITCH;
540 }
541 default :
542 {
543 token = symbol( Types.MULTIPLY );
544 break MULTICHAR_SWITCH;
545 }
546 }
547 break ROOT_SWITCH;
548 }
549 case ('^') :
550 {
551 mark();
552 consume();
553
554 c = la();
555
556 MULTICHAR_SWITCH : switch (c)
557 {
558 case ('=') :
559 {
560 consume();
561 token = symbol( Types.BITWISE_XOR_EQUAL );
562 break MULTICHAR_SWITCH;
563 }
564 default :
565 {
566 token = symbol( Types.BITWISE_XOR );
567 break MULTICHAR_SWITCH;
568 }
569 }
570 break ROOT_SWITCH;
571 }
572 case (':') :
573 {
574 mark();
575 consume();
576
577 token = symbol( Types.COLON );
578 break ROOT_SWITCH;
579 }
580 case (',') :
581 {
582 mark();
583 consume();
584 token = symbol( Types.COMMA );
585 break ROOT_SWITCH;
586 }
587 case (';') :
588 {
589 mark();
590 consume();
591 token = symbol( Types.SEMICOLON );
592 break ROOT_SWITCH;
593 }
594 case ('?') :
595 {
596 mark();
597 consume();
598 token = symbol( Types.QUESTION );
599 break ROOT_SWITCH;
600 }
601 case ('<') :
602 {
603 mark();
604 consume();
605
606 c = la();
607
608 MULTICHAR_SWITCH : switch (c)
609 {
610 case ('=') :
611 {
612 consume();
613 c = la();
614 if (c == '>')
615 {
616 consume();
617 token = symbol( Types.COMPARE_TO );
618 }
619 else
620 {
621 token = symbol( Types.COMPARE_LESS_THAN_EQUAL );
622 }
623 break MULTICHAR_SWITCH;
624 }
625 case ('<') :
626 {
627 consume();
628 c = la();
629
630
631
632
633
634
635
636 if (c == '<')
637 {
638 consume();
639
640 StringBuffer marker = new StringBuffer();
641 while( (c = la()) != '\n' && c != '\r' && c != CharStream.EOS )
642 {
643 marker.append( consume() );
644 }
645
646 readEOL();
647
648 Lexer child = new HereDocLexer( marker.toString() );
649 delegate( child );
650
651 gstringLexer.reset();
652 child.delegate( gstringLexer );
653
654 break ROOT_SWITCH;
655 }
656 else if (c == '=')
657 {
658 consume();
659 token = symbol( Types.LEFT_SHIFT_EQUAL );
660 break MULTICHAR_SWITCH;
661 }
662 else
663 {
664 token = symbol( Types.LEFT_SHIFT );
665 break ROOT_SWITCH;
666 }
667 }
668 default :
669 {
670 token = symbol( Types.COMPARE_LESS_THAN );
671 break MULTICHAR_SWITCH;
672 }
673 }
674 break ROOT_SWITCH;
675 }
676 case ('>') :
677 {
678 mark();
679 consume();
680
681 c = la();
682
683 MULTICHAR_SWITCH : switch (c)
684 {
685 case ('=') :
686 {
687 consume();
688 token = symbol( Types.COMPARE_GREATER_THAN_EQUAL );
689 break MULTICHAR_SWITCH;
690 }
691 case ('>') :
692 {
693 consume();
694 c = la();
695 if( c == '>' )
696 {
697 consume();
698 c = la();
699 if (c == '=')
700 {
701 consume();
702 token = symbol( Types.RIGHT_SHIFT_UNSIGNED_EQUAL );
703 }
704 else
705 {
706 token = symbol( Types.RIGHT_SHIFT_UNSIGNED );
707 }
708 }
709 else if (c == '=')
710 {
711 consume();
712 token = symbol( Types.RIGHT_SHIFT_EQUAL );
713 }
714 else
715 {
716 token = symbol( Types.RIGHT_SHIFT );
717 }
718 break MULTICHAR_SWITCH;
719 }
720 default :
721 {
722 token = symbol( Types.COMPARE_GREATER_THAN );
723 break MULTICHAR_SWITCH;
724 }
725 }
726 break ROOT_SWITCH;
727 }
728 case ('\'') :
729 {
730 mark();
731
732 stringLexer.reset();
733 stringLexer.allowGStrings(false);
734 delegate( stringLexer );
735
736 break ROOT_SWITCH;
737 }
738 case ('"') :
739 {
740 mark();
741
742 stringLexer.reset();
743 stringLexer.allowGStrings(true);
744 delegate( stringLexer );
745
746 gstringLexer.reset();
747 stringLexer.delegate( gstringLexer );
748
749 break ROOT_SWITCH;
750 }
751 case ('0') :
752 case ('1') :
753 case ('2') :
754 case ('3') :
755 case ('4') :
756 case ('5') :
757 case ('6') :
758 case ('7') :
759 case ('8') :
760 case ('9') :
761 case ('.') :
762 {
763 mark();
764
765
766
767
768
769 if( c == '.' && !Numbers.isDigit(la(2)) )
770 {
771 consume();
772 if( la() == '.' )
773 {
774 consume();
775 if( la() == '.' )
776 {
777 consume();
778 token = symbol( Types.DOT_DOT_DOT );
779 }
780 else
781 {
782 token = symbol( Types.DOT_DOT );
783 }
784 }
785 else
786 {
787 token = symbol( Types.DOT );
788 }
789 break ROOT_SWITCH;
790 }
791
792
793
794
795
796 StringBuffer numericLiteral = new StringBuffer();
797 boolean isDecimal = false;
798
799
800
801
802
803
804 char c2 = la(2);
805 if( c == '0' && (c2 == 'X' || c2 == 'x' || Numbers.isDigit(c2)) )
806 {
807 numericLiteral.append( consume() );
808
809 if( (c = la()) == 'X' || c == 'x' )
810 {
811 numericLiteral.append( consume() );
812 if( Numbers.isHexDigit(la()) )
813 {
814 while( Numbers.isHexDigit(la()) )
815 {
816 numericLiteral.append( consume() );
817 }
818 }
819 else
820 {
821 unexpected( la(), numericLiteral.length(), "expected hexadecimal digit" );
822 }
823 }
824 else
825 {
826 while( Numbers.isOctalDigit(la()) )
827 {
828 numericLiteral.append( consume() );
829 }
830
831 if( Numbers.isDigit(la()) )
832 {
833 unexpected( la(), numericLiteral.length(), "expected octal digit" );
834 }
835 }
836 }
837
838
839
840
841
842 else
843 {
844 while( Numbers.isDigit(la()) )
845 {
846 numericLiteral.append( consume() );
847 }
848
849
850
851
852
853 if( la() == '.' && Numbers.isDigit(la(2)) )
854 {
855 isDecimal = true;
856
857 numericLiteral.append( consume() );
858 while( Numbers.isDigit(la()) )
859 {
860 numericLiteral.append( consume() );
861 }
862
863
864
865
866 if( (c = la()) == 'e' || c == 'E' )
867 {
868 numericLiteral.append( consume() );
869
870 if (la() == '+' || la() == '-')
871 {
872 numericLiteral.append(consume());
873 }
874
875 if( Numbers.isDigit(la()) )
876 {
877 while( Numbers.isDigit(la()) )
878 {
879 numericLiteral.append( consume() );
880 }
881 }
882 else
883 {
884 unexpected( la(), numericLiteral.length(), "expected exponent" );
885 }
886 }
887 }
888 }
889
890
891
892
893
894 if( Numbers.isNumericTypeSpecifier(la(), isDecimal) )
895 {
896 numericLiteral.append( consume() );
897 }
898
899
900
901
902
903 if( Character.isJavaIdentifierPart(c = la()) )
904 {
905 unexpected( c, numericLiteral.length(), "expected end of numeric literal" );
906 }
907
908
909
910
911
912 if( isDecimal )
913 {
914 token = Token.newDecimal( numericLiteral.toString(), getStartLine(), getStartColumn() );
915 }
916 else
917 {
918 token = Token.newInteger( numericLiteral.toString(), getStartLine(), getStartColumn() );
919 }
920
921 break ROOT_SWITCH;
922 }
923 default :
924 {
925 mark();
926 if (Character.isJavaIdentifierStart(c))
927 {
928 StringBuffer identifier = new StringBuffer();
929
930 IDENTIFIER_LOOP : while (true)
931 {
932 c = la();
933
934 if (Character.isJavaIdentifierPart(c))
935 {
936 identifier.append(consume());
937 }
938 else
939 {
940 break IDENTIFIER_LOOP;
941 }
942 }
943
944 String text = identifier.toString();
945 token = Token.newKeyword( text, getStartLine(), getStartColumn() );
946
947 if (token == null)
948 {
949 token = Token.newIdentifier( text, getStartLine(), getStartColumn() );
950 }
951 }
952 else
953 {
954 unexpected( c, 1 );
955 }
956
957 break ROOT_SWITCH;
958 }
959 }
960 }
961
962
963
964 return token;
965 }
966
967 }