# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
#   PLY in pull mode. It was designed to work incrementally and it would not be
#   that hard to enable pull mode.
import re

from ply import lex
from bb.pysh.sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty prefix of an operator, starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """If s matches an operator, return the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)
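
#Illustrative examples (not in the original code): is_partial_op('<') and
#is_partial_op('<<-') are True; is_op('<<') returns 'DLESS' while
#is_op('<=') returns None.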

_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None
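
#Illustrative note (not in the original code): is_name('FOO_1') is True; the
#regex also accepts a leading digit, so is_name('1FOO') is True as well.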

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None
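
#Illustrative example (not in the original code):
#  find_chars('abc$def', '$`') returns (3, '$')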

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well-formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. The first element of each
    list is the opening separator, the last one the closing separator.
    In-between can be any number of strings or lists for sub-expressions.
    Non-quoted/expansion expressions can be written as strings or as lists
    with empty strings as opening and closing delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Which characters can be escaped depends on the current delimiter
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }
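    #Illustrative note (not in the original code): within double quotes a
    #backslash only escapes '$', '\', '`' and '"'; within single quotes
    #nothing is escapable, so backslashes stay literal.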

    def __init__(self, heredoc = False):
        # _buffer is the buffer of unprocessed input characters
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double-quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted fields
    and plain ones.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as-is.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)


class HereDocLexer:
    """HereDocLexer delimits here-document content, from the starting newline
    (not included) to the closing delimiter line (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break

class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_OP = 'OP'
TK_TOKEN = 'TOKEN'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'

class Lexer:
    """Main lexer.

    Feed it data with add(); each delimited token is passed to on_token().
    """
    # Here-document handling makes the whole thing more complex because it
    # basically forces tokens to be reordered: here-document content must come
    # right after the operator and the here-document name, while some other
    # tokens might follow the here-document expression on the same line.
    #
    # So, here-doc states are basically:
    # *self._state==ST_NORMAL
    #   - self._heredoc.op is None: no here-document
    #   - self._heredoc.op is not None but name is: here-document operator
    #     matched, waiting for the document name/delimiter
    #   - self._heredoc.op and name are not None: here-document is ready,
    #     following tokens are being stored and will be pushed again when the
    #     document is completely parsed.
    # *self._state==ST_HEREDOC
    #   - The here-document is being delimited by self._herelexer. Once it is
    #     done the content is pushed in front of the pending token list then
    #     all these tokens are pushed once again.
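    # For instance (illustrative, not in the original code): for the input
    # "cat <<EOF >out\nhi\nEOF\n", the 'EOF' name, '>', 'out' and newline
    # tokens are buffered as pendings; once the document is delimited, the
    # 'hi\n' content token is pushed in front of them and all are replayed.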
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting tokens and can
        ### safely be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexers handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        wtree, remaining = self._wordlexer.add(input, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        self._token, remaining = self._herelexer.add(input, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the
        # function: when parsing between a here-document operator and the end
        # of the line, tokens are stored in self._heredoc.pendings. Therefore,
        # they will not reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched