blob: b30123675c1b1cc43a79615f98180d02331bc48a [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001# pyshlex.py - PLY compatible lexer for pysh.
2#
3# Copyright 2007 Patrick Mezard
4#
5# This software may be used and distributed according to the terms
6# of the GNU General Public License, incorporated herein by reference.
7
8# TODO:
9# - review all "char in 'abc'" snippets: the empty string can be matched
10# - test line continuations within quoted/expansion strings
11# - eof is buggy wrt sublexers
12# - the lexer cannot really work in pull mode as it would be required to run
13# PLY in pull mode. It was designed to work incrementally and it would not be
14# that hard to enable pull mode.
15import re
16try:
17 s = set()
18 del s
19except NameError:
20 from Set import Set as set
21
22from ply import lex
23from sherrors import *
24
25class NeedMore(Exception):
26 pass
27
28def is_blank(c):
29 return c in (' ', '\t')
30
31_RE_DIGITS = re.compile(r'^\d+$')
32
33def are_digits(s):
34 return _RE_DIGITS.search(s) is not None
35
36_OPERATORS = dict([
37 ('&&', 'AND_IF'),
38 ('||', 'OR_IF'),
39 (';;', 'DSEMI'),
40 ('<<', 'DLESS'),
41 ('>>', 'DGREAT'),
42 ('<&', 'LESSAND'),
43 ('>&', 'GREATAND'),
44 ('<>', 'LESSGREAT'),
45 ('<<-', 'DLESSDASH'),
46 ('>|', 'CLOBBER'),
47 ('&', 'AMP'),
48 (';', 'COMMA'),
49 ('<', 'LESS'),
50 ('>', 'GREATER'),
51 ('(', 'LPARENS'),
52 (')', 'RPARENS'),
53])
54
55#Make a function to silence pychecker "Local variable shadows global"
56def make_partial_ops():
57 partials = {}
58 for k in _OPERATORS:
59 for i in range(1, len(k)+1):
60 partials[k[:i]] = None
61 return partials
62
63_PARTIAL_OPERATORS = make_partial_ops()
64
65def is_partial_op(s):
66 """Return True if s matches a non-empty subpart of an operator starting
67 at its first character.
68 """
69 return s in _PARTIAL_OPERATORS
70
71def is_op(s):
72 """If s matches an operator, returns the operator identifier. Return None
73 otherwise.
74 """
75 return _OPERATORS.get(s)
76
77_RESERVEDS = dict([
78 ('if', 'If'),
79 ('then', 'Then'),
80 ('else', 'Else'),
81 ('elif', 'Elif'),
82 ('fi', 'Fi'),
83 ('do', 'Do'),
84 ('done', 'Done'),
85 ('case', 'Case'),
86 ('esac', 'Esac'),
87 ('while', 'While'),
88 ('until', 'Until'),
89 ('for', 'For'),
90 ('{', 'Lbrace'),
91 ('}', 'Rbrace'),
92 ('!', 'Bang'),
93 ('in', 'In'),
94 ('|', 'PIPE'),
95])
96
97def get_reserved(s):
98 return _RESERVEDS.get(s)
99
100_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')
101
102def is_name(s):
103 return _RE_NAME.search(s) is not None
104
105def find_chars(seq, chars):
106 for i,v in enumerate(seq):
107 if v in chars:
108 return i,v
109 return -1, None
110
111class WordLexer:
112 """WordLexer parse quoted or expansion expressions and return an expression
113 tree. The input string can be any well formed sequence beginning with quoting
114 or expansion character. Embedded expressions are handled recursively. The
115 resulting tree is made of lists and strings. Lists represent quoted or
116 expansion expressions. Each list first element is the opening separator,
117 the last one the closing separator. In-between can be any number of strings
118 or lists for sub-expressions. Non quoted/expansion expression can written as
119 strings or as lists with empty strings as starting and ending delimiters.
120 """
121
122 NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
123 NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))
124
125 SPECIAL_CHARSET = '@*#?-$!0'
126
127 #Characters which can be escaped depends on the current delimiters
128 ESCAPABLE = {
129 '`': set(['$', '\\', '`']),
130 '"': set(['$', '\\', '`', '"']),
131 "'": set(),
132 }
133
134 def __init__(self, heredoc = False):
135 # _buffer is the unprocessed input characters buffer
136 self._buffer = []
137 # _stack is empty or contains a quoted list being processed
138 # (this is the DFS path to the quoted expression being evaluated).
139 self._stack = []
140 self._escapable = None
141 # True when parsing unquoted here documents
142 self._heredoc = heredoc
143
144 def add(self, data, eof=False):
145 """Feed the lexer with more data. If the quoted expression can be
146 delimited, return a tuple (expr, remaining) containing the expression
147 tree and the unconsumed data.
148 Otherwise, raise NeedMore.
149 """
150 self._buffer += list(data)
151 self._parse(eof)
152
153 result = self._stack[0]
154 remaining = ''.join(self._buffer)
155 self._stack = []
156 self._buffer = []
157 return result, remaining
158
159 def _is_escapable(self, c, delim=None):
160 if delim is None:
161 if self._heredoc:
162 # Backslashes works as if they were double quoted in unquoted
163 # here-documents
164 delim = '"'
165 else:
166 if len(self._stack)<=1:
167 return True
168 delim = self._stack[-2][0]
169
170 escapables = self.ESCAPABLE.get(delim, None)
171 return escapables is None or c in escapables
172
173 def _parse_squote(self, buf, result, eof):
174 if not buf:
175 raise NeedMore()
176 try:
177 pos = buf.index("'")
178 except ValueError:
179 raise NeedMore()
180 result[-1] += ''.join(buf[:pos])
181 result += ["'"]
182 return pos+1, True
183
184 def _parse_bquote(self, buf, result, eof):
185 if not buf:
186 raise NeedMore()
187
188 if buf[0]=='\n':
189 #Remove line continuations
190 result[:] = ['', '', '']
191 elif self._is_escapable(buf[0]):
192 result[-1] += buf[0]
193 result += ['']
194 else:
195 #Keep as such
196 result[:] = ['', '\\'+buf[0], '']
197
198 return 1, True
199
200 def _parse_dquote(self, buf, result, eof):
201 if not buf:
202 raise NeedMore()
203 pos, sep = find_chars(buf, '$\\`"')
204 if pos==-1:
205 raise NeedMore()
206
207 result[-1] += ''.join(buf[:pos])
208 if sep=='"':
209 result += ['"']
210 return pos+1, True
211 else:
212 #Keep everything until the separator and defer processing
213 return pos, False
214
215 def _parse_command(self, buf, result, eof):
216 if not buf:
217 raise NeedMore()
218
219 chars = '$\\`"\''
220 if result[0] == '$(':
221 chars += ')'
222 pos, sep = find_chars(buf, chars)
223 if pos == -1:
224 raise NeedMore()
225
226 result[-1] += ''.join(buf[:pos])
227 if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
228 result += [sep]
229 return pos+1, True
230 else:
231 return pos, False
232
233 def _parse_parameter(self, buf, result, eof):
234 if not buf:
235 raise NeedMore()
236
237 pos, sep = find_chars(buf, '$\\`"\'}')
238 if pos==-1:
239 raise NeedMore()
240
241 result[-1] += ''.join(buf[:pos])
242 if sep=='}':
243 result += [sep]
244 return pos+1, True
245 else:
246 return pos, False
247
248 def _parse_dollar(self, buf, result, eof):
249 sep = result[0]
250 if sep=='$':
251 if not buf:
252 #TODO: handle empty $
253 raise NeedMore()
254 if buf[0]=='(':
255 if len(buf)==1:
256 raise NeedMore()
257
258 if buf[1]=='(':
259 result[0] = '$(('
260 buf[:2] = []
261 else:
262 result[0] = '$('
263 buf[:1] = []
264
265 elif buf[0]=='{':
266 result[0] = '${'
267 buf[:1] = []
268 else:
269 if buf[0] in self.SPECIAL_CHARSET:
270 result[-1] = buf[0]
271 read = 1
272 else:
273 for read,c in enumerate(buf):
274 if c not in self.NAME_CHARSET:
275 break
276 else:
277 if not eof:
278 raise NeedMore()
279 read += 1
280
281 result[-1] += ''.join(buf[0:read])
282
283 if not result[-1]:
284 result[:] = ['', result[0], '']
285 else:
286 result += ['']
287 return read,True
288
289 sep = result[0]
290 if sep=='$(':
291 parsefunc = self._parse_command
292 elif sep=='${':
293 parsefunc = self._parse_parameter
294 else:
295 raise NotImplementedError(sep)
296
297 pos, closed = parsefunc(buf, result, eof)
298 return pos, closed
299
300 def _parse(self, eof):
301 buf = self._buffer
302 stack = self._stack
303 recurse = False
304
305 while 1:
306 if not stack or recurse:
307 if not buf:
308 raise NeedMore()
309 if buf[0] not in ('"\\`$\''):
310 raise ShellSyntaxError('Invalid quoted string sequence')
311 stack.append([buf[0], ''])
312 buf[:1] = []
313 recurse = False
314
315 result = stack[-1]
316 if result[0]=="'":
317 parsefunc = self._parse_squote
318 elif result[0]=='\\':
319 parsefunc = self._parse_bquote
320 elif result[0]=='"':
321 parsefunc = self._parse_dquote
322 elif result[0]=='`':
323 parsefunc = self._parse_command
324 elif result[0][0]=='$':
325 parsefunc = self._parse_dollar
326 else:
327 raise NotImplementedError()
328
329 read, closed = parsefunc(buf, result, eof)
330
331 buf[:read] = []
332 if closed:
333 if len(stack)>1:
334 #Merge in parent expression
335 parsed = stack.pop()
336 stack[-1] += [parsed]
337 stack[-1] += ['']
338 else:
339 break
340 else:
341 recurse = True
342
343def normalize_wordtree(wtree):
344 """Fold back every literal sequence (delimited with empty strings) into
345 parent sequence.
346 """
347 def normalize(wtree):
348 result = []
349 for part in wtree[1:-1]:
350 if isinstance(part, list):
351 part = normalize(part)
352 if part[0]=='':
353 #Move the part content back at current level
354 result += part[1:-1]
355 continue
356 elif not part:
357 #Remove empty strings
358 continue
359 result.append(part)
360 if not result:
361 result = ['']
362 return [wtree[0]] + result + [wtree[-1]]
363
364 return normalize(wtree)
365
366
367def make_wordtree(token, here_document=False):
368 """Parse a delimited token and return a tree similar to the ones returned by
369 WordLexer. token may contain any combinations of expansion/quoted fields and
370 non-ones.
371 """
372 tree = ['']
373 remaining = token
374 delimiters = '\\$`'
375 if not here_document:
376 delimiters += '\'"'
377
378 while 1:
379 pos, sep = find_chars(remaining, delimiters)
380 if pos==-1:
381 tree += [remaining, '']
382 return normalize_wordtree(tree)
383 tree.append(remaining[:pos])
384 remaining = remaining[pos:]
385
386 try:
387 result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
388 except NeedMore:
389 raise ShellSyntaxError('Invalid token "%s"')
390 tree.append(result)
391
392
393def wordtree_as_string(wtree):
394 """Rewrite an expression tree generated by make_wordtree as string."""
395 def visit(node, output):
396 for child in node:
397 if isinstance(child, list):
398 visit(child, output)
399 else:
400 output.append(child)
401
402 output = []
403 visit(wtree, output)
404 return ''.join(output)
405
406
407def unquote_wordtree(wtree):
408 """Fold the word tree while removing quotes everywhere. Other expansion
409 sequences are joined as such.
410 """
411 def unquote(wtree):
412 unquoted = []
413 if wtree[0] in ('', "'", '"', '\\'):
414 wtree = wtree[1:-1]
415
416 for part in wtree:
417 if isinstance(part, list):
418 part = unquote(part)
419 unquoted.append(part)
420 return ''.join(unquoted)
421
422 return unquote(wtree)
423
424
425class HereDocLexer:
426 """HereDocLexer delimits whatever comes from the here-document starting newline
427 not included to the closing delimiter line included.
428 """
429 def __init__(self, op, delim):
430 assert op in ('<<', '<<-')
431 if not delim:
432 raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))
433
434 self._op = op
435 self._delim = delim
436 self._buffer = []
437 self._token = []
438
439 def add(self, data, eof):
440 """If the here-document was delimited, return a tuple (content, remaining).
441 Raise NeedMore() otherwise.
442 """
443 self._buffer += list(data)
444 self._parse(eof)
445 token = ''.join(self._token)
446 remaining = ''.join(self._buffer)
447 self._token, self._remaining = [], []
448 return token, remaining
449
450 def _parse(self, eof):
451 while 1:
452 #Look for first unescaped newline. Quotes may be ignored
453 escaped = False
454 for i,c in enumerate(self._buffer):
455 if escaped:
456 escaped = False
457 elif c=='\\':
458 escaped = True
459 elif c=='\n':
460 break
461 else:
462 i = -1
463
464 if i==-1 or self._buffer[i]!='\n':
465 if not eof:
466 raise NeedMore()
467 #No more data, maybe the last line is closing delimiter
468 line = ''.join(self._buffer)
469 eol = ''
470 self._buffer[:] = []
471 else:
472 line = ''.join(self._buffer[:i])
473 eol = self._buffer[i]
474 self._buffer[:i+1] = []
475
476 if self._op=='<<-':
477 line = line.lstrip('\t')
478
479 if line==self._delim:
480 break
481
482 self._token += [line, eol]
483 if i==-1:
484 break
485
486class Token:
487 #TODO: check this is still in use
488 OPERATOR = 'OPERATOR'
489 WORD = 'WORD'
490
491 def __init__(self):
492 self.value = ''
493 self.type = None
494
495 def __getitem__(self, key):
496 #Behave like a two elements tuple
497 if key==0:
498 return self.type
499 if key==1:
500 return self.value
501 raise IndexError(key)
502
503
504class HereDoc:
505 def __init__(self, op, name=None):
506 self.op = op
507 self.name = name
508 self.pendings = []
509
510TK_COMMA = 'COMMA'
511TK_AMPERSAND = 'AMP'
512TK_OP = 'OP'
513TK_TOKEN = 'TOKEN'
514TK_COMMENT = 'COMMENT'
515TK_NEWLINE = 'NEWLINE'
516TK_IONUMBER = 'IO_NUMBER'
517TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
518TK_HERENAME = 'HERENAME'
519
520class Lexer:
521 """Main lexer.
522
523 Call add() until the script AST is returned.
524 """
525 # Here-document handling makes the whole thing more complex because they basically
526 # force tokens to be reordered: here-content must come right after the operator
527 # and the here-document name, while some other tokens might be following the
528 # here-document expression on the same line.
529 #
530 # So, here-doc states are basically:
531 # *self._state==ST_NORMAL
532 # - self._heredoc.op is None: no here-document
533 # - self._heredoc.op is not None but name is: here-document operator matched,
534 # waiting for the document name/delimiter
535 # - self._heredoc.op and name are not None: here-document is ready, following
536 # tokens are being stored and will be pushed again when the document is
537 # completely parsed.
538 # *self._state==ST_HEREDOC
539 # - The here-document is being delimited by self._herelexer. Once it is done
540 # the content is pushed in front of the pending token list then all these
541 # tokens are pushed once again.
542 ST_NORMAL = 'ST_NORMAL'
543 ST_OP = 'ST_OP'
544 ST_BACKSLASH = 'ST_BACKSLASH'
545 ST_QUOTED = 'ST_QUOTED'
546 ST_COMMENT = 'ST_COMMENT'
547 ST_HEREDOC = 'ST_HEREDOC'
548
549 #Match end of backquote strings
550 RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')
551
552 def __init__(self, parent_state = None):
553 self._input = []
554 self._pos = 0
555
556 self._token = ''
557 self._type = TK_TOKEN
558
559 self._state = self.ST_NORMAL
560 self._parent_state = parent_state
561 self._wordlexer = None
562
563 self._heredoc = HereDoc(None)
564 self._herelexer = None
565
566 ### Following attributes are not used for delimiting token and can safely
567 ### be changed after here-document detection (see _push_toke)
568
569 # Count the number of tokens following a 'For' reserved word. Needed to
570 # return an 'In' reserved word if it comes in third place.
571 self._for_count = None
572
573 def add(self, data, eof=False):
574 """Feed the lexer with data.
575
576 When eof is set to True, returns unconsumed data or raise if the lexer
577 is in the middle of a delimiting operation.
578 Raise NeedMore otherwise.
579 """
580 self._input += list(data)
581 self._parse(eof)
582 self._input[:self._pos] = []
583 return ''.join(self._input)
584
585 def _parse(self, eof):
586 while self._state:
587 if self._pos>=len(self._input):
588 if not eof:
589 raise NeedMore()
590 elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
591 #Delimit the current token and leave cleanly
592 self._push_token('')
593 break
594 else:
595 #Let the sublexer handle the eof themselves
596 pass
597
598 if self._state==self.ST_NORMAL:
599 self._parse_normal()
600 elif self._state==self.ST_COMMENT:
601 self._parse_comment()
602 elif self._state==self.ST_OP:
603 self._parse_op(eof)
604 elif self._state==self.ST_QUOTED:
605 self._parse_quoted(eof)
606 elif self._state==self.ST_HEREDOC:
607 self._parse_heredoc(eof)
608 else:
609 assert False, "Unknown state " + str(self._state)
610
611 if self._heredoc.op is not None:
612 raise ShellSyntaxError('missing here-document delimiter')
613
614 def _parse_normal(self):
615 c = self._input[self._pos]
616 if c=='\n':
617 self._push_token(c)
618 self._token = c
619 self._type = TK_NEWLINE
620 self._push_token('')
621 self._pos += 1
622 elif c in ('\\', '\'', '"', '`', '$'):
623 self._state = self.ST_QUOTED
624 elif is_partial_op(c):
625 self._push_token(c)
626
627 self._type = TK_OP
628 self._token += c
629 self._pos += 1
630 self._state = self.ST_OP
631 elif is_blank(c):
632 self._push_token(c)
633
634 #Discard blanks
635 self._pos += 1
636 elif self._token:
637 self._token += c
638 self._pos += 1
639 elif c=='#':
640 self._state = self.ST_COMMENT
641 self._type = TK_COMMENT
642 self._pos += 1
643 else:
644 self._pos += 1
645 self._token += c
646
647 def _parse_op(self, eof):
648 assert self._token
649
650 while 1:
651 if self._pos>=len(self._input):
652 if not eof:
653 raise NeedMore()
654 c = ''
655 else:
656 c = self._input[self._pos]
657
658 op = self._token + c
659 if c and is_partial_op(op):
660 #Still parsing an operator
661 self._token = op
662 self._pos += 1
663 else:
664 #End of operator
665 self._push_token(c)
666 self._state = self.ST_NORMAL
667 break
668
669 def _parse_comment(self):
670 while 1:
671 if self._pos>=len(self._input):
672 raise NeedMore()
673
674 c = self._input[self._pos]
675 if c=='\n':
676 #End of comment, do not consume the end of line
677 self._state = self.ST_NORMAL
678 break
679 else:
680 self._token += c
681 self._pos += 1
682
683 def _parse_quoted(self, eof):
684 """Precondition: the starting backquote/dollar is still in the input queue."""
685 if not self._wordlexer:
686 self._wordlexer = WordLexer()
687
688 if self._pos<len(self._input):
689 #Transfer input queue character into the subparser
690 input = self._input[self._pos:]
691 self._pos += len(input)
692
693 wtree, remaining = self._wordlexer.add(input, eof)
694 self._wordlexer = None
695 self._token += wordtree_as_string(wtree)
696
697 #Put unparsed character back in the input queue
698 if remaining:
699 self._input[self._pos:self._pos] = list(remaining)
700 self._state = self.ST_NORMAL
701
702 def _parse_heredoc(self, eof):
703 assert not self._token
704
705 if self._herelexer is None:
706 self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)
707
708 if self._pos<len(self._input):
709 #Transfer input queue character into the subparser
710 input = self._input[self._pos:]
711 self._pos += len(input)
712
713 self._token, remaining = self._herelexer.add(input, eof)
714
715 #Reset here-document state
716 self._herelexer = None
717 heredoc, self._heredoc = self._heredoc, HereDoc(None)
718 if remaining:
719 self._input[self._pos:self._pos] = list(remaining)
720 self._state = self.ST_NORMAL
721
722 #Push pending tokens
723 heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
724 for token, type, delim in heredoc.pendings:
725 self._token = token
726 self._type = type
727 self._push_token(delim)
728
729 def _push_token(self, delim):
730 if not self._token:
731 return 0
732
733 if self._heredoc.op is not None:
734 if self._heredoc.name is None:
735 #Here-document name
736 if self._type!=TK_TOKEN:
737 raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
738 self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
739 self._type = TK_HERENAME
740 else:
741 #Capture all tokens until the newline starting the here-document
742 if self._type==TK_NEWLINE:
743 assert self._state==self.ST_NORMAL
744 self._state = self.ST_HEREDOC
745
746 self._heredoc.pendings.append((self._token, self._type, delim))
747 self._token = ''
748 self._type = TK_TOKEN
749 return 1
750
751 # BEWARE: do not change parser state from here to the end of the function:
752 # when parsing between an here-document operator to the end of the line
753 # tokens are stored in self._heredoc.pendings. Therefore, they will not
754 # reach the section below.
755
756 #Check operators
757 if self._type==TK_OP:
758 #False positive because of partial op matching
759 op = is_op(self._token)
760 if not op:
761 self._type = TK_TOKEN
762 else:
763 #Map to the specific operator
764 self._type = op
765 if self._token in ('<<', '<<-'):
766 #Done here rather than in _parse_op because there is no need
767 #to change the parser state since we are still waiting for
768 #the here-document name
769 if self._heredoc.op is not None:
770 raise ShellSyntaxError("syntax error near token '%s'" % self._token)
771 assert self._heredoc.op is None
772 self._heredoc.op = self._token
773
774 if self._type==TK_TOKEN:
775 if '=' in self._token and not delim:
776 if self._token.startswith('='):
777 #Token is a WORD... a TOKEN that is.
778 pass
779 else:
780 prev = self._token[:self._token.find('=')]
781 if is_name(prev):
782 self._type = TK_ASSIGNMENT
783 else:
784 #Just a token (unspecified)
785 pass
786 else:
787 reserved = get_reserved(self._token)
788 if reserved is not None:
789 if reserved=='In' and self._for_count!=2:
790 #Sorry, not a reserved word after all
791 pass
792 else:
793 self._type = reserved
794 if reserved in ('For', 'Case'):
795 self._for_count = 0
796 elif are_digits(self._token) and delim in ('<', '>'):
797 #Detect IO_NUMBER
798 self._type = TK_IONUMBER
799 elif self._token==';':
800 self._type = TK_COMMA
801 elif self._token=='&':
802 self._type = TK_AMPERSAND
803 elif self._type==TK_COMMENT:
804 #Comments are not part of sh grammar, ignore them
805 self._token = ''
806 self._type = TK_TOKEN
807 return 0
808
809 if self._for_count is not None:
810 #Track token count in 'For' expression to detect 'In' reserved words.
811 #Can only be in third position, no need to go beyond
812 self._for_count += 1
813 if self._for_count==3:
814 self._for_count = None
815
816 self.on_token((self._token, self._type))
817 self._token = ''
818 self._type = TK_TOKEN
819 return 1
820
821 def on_token(self, token):
822 raise NotImplementedError
823
824
825tokens = [
826 TK_TOKEN,
827# To silence yacc unused token warnings
828# TK_COMMENT,
829 TK_NEWLINE,
830 TK_IONUMBER,
831 TK_ASSIGNMENT,
832 TK_HERENAME,
833]
834
835#Add specific operators
836tokens += _OPERATORS.values()
837#Add reserved words
838tokens += _RESERVEDS.values()
839
840class PLYLexer(Lexer):
841 """Bridge Lexer and PLY lexer interface."""
842 def __init__(self):
843 Lexer.__init__(self)
844 self._tokens = []
845 self._current = 0
846 self.lineno = 0
847
848 def on_token(self, token):
849 value, type = token
850
851 self.lineno = 0
852 t = lex.LexToken()
853 t.value = value
854 t.type = type
855 t.lexer = self
856 t.lexpos = 0
857 t.lineno = 0
858
859 self._tokens.append(t)
860
861 def is_empty(self):
862 return not bool(self._tokens)
863
864 #PLY compliant interface
865 def token(self):
866 if self._current>=len(self._tokens):
867 return None
868 t = self._tokens[self._current]
869 self._current += 1
870 return t
871
872
873def get_tokens(s):
874 """Parse the input string and return a tuple (tokens, unprocessed) where
875 tokens is a list of parsed tokens and unprocessed is the part of the input
876 string left untouched by the lexer.
877 """
878 lexer = PLYLexer()
879 untouched = lexer.add(s, True)
880 tokens = []
881 while 1:
882 token = lexer.token()
883 if token is None:
884 break
885 tokens.append(token)
886
887 tokens = [(t.value, t.type) for t in tokens]
888 return tokens, untouched