# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
#   PLY in pull mode. It was designed to work incrementally and it would not be
#   that hard to enable pull mode.
| 15 | import re |
| 16 | try: |
| 17 | s = set() |
| 18 | del s |
| 19 | except NameError: |
| 20 | from Set import Set as set |
| 21 | |
| 22 | from ply import lex |
| 23 | from sherrors import * |
| 24 | |
class NeedMore(Exception):
    """Raised by the lexers when the available input ends before the
    current token or expression could be delimited.
    """
| 27 | |
def is_blank(c):
    """Return True if c is a space or a tabulation character."""
    return c == ' ' or c == '\t'
| 30 | |
_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    """Return True if s is a non-empty sequence of digits, False otherwise."""
    if _RE_DIGITS.search(s):
        return True
    return False
| 35 | |
# Map every shell operator to the name of its token type.
_OPERATORS = {
    '&&': 'AND_IF',
    '||': 'OR_IF',
    ';;': 'DSEMI',
    '<<': 'DLESS',
    '>>': 'DGREAT',
    '<&': 'LESSAND',
    '>&': 'GREATAND',
    '<>': 'LESSGREAT',
    '<<-': 'DLESSDASH',
    '>|': 'CLOBBER',
    '&': 'AMP',
    ';': 'COMMA',
    '<': 'LESS',
    '>': 'GREATER',
    '(': 'LPARENS',
    ')': 'RPARENS',
}

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    """Return a dictionary whose keys are all the non-empty leading
    substrings of the known operators. Values are always None.
    """
    prefixes = {}
    for operator in _OPERATORS:
        prefix = ''
        # Grow the prefix one character at a time, recording each step
        for char in operator:
            prefix += char
            prefixes[prefix] = None
    return prefixes

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Tell whether s is the beginning (or the whole) of an operator.

    The empty string never matches.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """Return the token type name of operator s, or None if s is not an
    operator.
    """
    try:
        return _OPERATORS[s]
    except KeyError:
        return None
| 76 | |
# Map reserved words (and the pipe symbol) to the name of their token type.
_RESERVEDS = {
    'if': 'If',
    'then': 'Then',
    'else': 'Else',
    'elif': 'Elif',
    'fi': 'Fi',
    'do': 'Do',
    'done': 'Done',
    'case': 'Case',
    'esac': 'Esac',
    'while': 'While',
    'until': 'Until',
    'for': 'For',
    '{': 'Lbrace',
    '}': 'Rbrace',
    '!': 'Bang',
    'in': 'In',
    '|': 'PIPE',
}

def get_reserved(s):
    """Return the token type name of reserved word s, or None if s is not
    reserved.
    """
    try:
        return _RESERVEDS[s]
    except KeyError:
        return None
| 99 | |
# A shell name is made of underscores, digits and ASCII letters and must not
# start with a digit. \Z (rather than $) also rejects a trailing newline.
_RE_NAME = re.compile(r'^[a-zA-Z_][0-9a-zA-Z_]*\Z')

def is_name(s):
    """Return True if s is a valid shell name (e.g. the left-hand side of an
    assignment), False otherwise.

    The empty string and strings starting with a digit are not names.
    """
    return _RE_NAME.search(s) is not None
| 104 | |
def find_chars(seq, chars):
    """Look for the first element of seq contained in chars.

    Return a tuple (index, element) for the first match, (-1, None) if no
    element of seq belongs to chars.
    """
    index = 0
    for item in seq:
        if item in chars:
            return index, item
        index += 1
    return -1, None
| 110 | |
class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. Each list first element
    is the opening separator, the last one the closing separator. In-between
    can be any number of strings or lists for sub-expressions. Non
    quoted/expansion expressions can be written as strings or as lists with
    empty strings as starting and ending delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Characters which can be escaped depends on the current delimiters
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
        }

    def __init__(self, heredoc = False):
        # _buffer is the unprocessed input characters buffer
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore. In that case the data received so far is
        kept and parsing resumes at the next call.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        """Return True if a backslash before c escapes it, given the
        enclosing delimiter. When delim is None, it is deduced from the
        here-document flag or from the expression enclosing the backslash.
        """
        if delim is None:
            if self._heredoc:
                # Backslashes works as if they were double quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    # Top-level backslash: everything can be escaped
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    # All the _parse_xxx(buf, result, eof) methods below share the same
    # contract: buf is the unprocessed input, result the expression being
    # built. They return a tuple (read, closed) where read is the number of
    # characters of buf to discard and closed tells whether result is
    # complete. When closed is False, a sub-expression starts at buf[read].
    # They raise NeedMore when buf is too short to decide.

    def _parse_squote(self, buf, result, eof):
        """Parse the content of a single quoted string."""
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        """Parse the character following a backslash."""
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        """Parse the content of a double quoted string."""
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        """Parse the content of a `...` or $(...) command substitution."""
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        """Parse the content of a ${...} parameter expansion."""
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        """Parse what follows a '$': a $(...) or ${...} expression, a special
        parameter, a name or nothing at all (literal '$').

        The opening separator in result[0] is refined to '$(' , '$((' or '${'
        once identified. Arithmetic expansions ('$((') are not supported and
        raise NotImplementedError.
        """
        sep = result[0]
        if sep=='$':
            if not buf:
                if eof:
                    # A lone '$' ending the input cannot start an expansion:
                    # keep it as a literal, exactly like a '$' followed by a
                    # character which cannot start a name.
                    result[:] = ['', '$', '']
                    return 0, True
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    # Cannot tell '$(' from '$((' yet
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        # The name may continue in the next chunk of data
                        if not eof:
                            raise NeedMore()
                        read += 1

                    result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    # No name after the dollar, this is a literal '$'
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        """Consume the buffer until the top-level expression is closed.

        On success, the complete expression is the only element of the stack.
        Raise NeedMore if the buffer is exhausted first, and ShellSyntaxError
        if an expression does not start with a quoting/expansion character.
        """
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                # Open a new (sub-)expression
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True
| 342 | |
def normalize_wordtree(wtree):
    """Return a copy of wtree where literal sequences (lists delimited with
    empty strings) are merged into their parent and empty strings are
    dropped. An expression left without content gets a single empty string.
    """
    def fold(node):
        kept = []
        for child in node[1:-1]:
            if isinstance(child, list):
                child = fold(child)
                if child[0]=='':
                    #Literal sequence: lift its content at current level
                    kept.extend(child[1:-1])
                else:
                    kept.append(child)
            elif child:
                #Empty strings are discarded
                kept.append(child)
        return [node[0]] + (kept or ['']) + [node[-1]]

    return fold(wtree)
| 365 | |
| 366 | |
def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned by
    WordLexer. token may contain any combinations of expansion/quoted fields and
    non-ones.

    When here_document is True, quote characters are treated as regular
    characters. Raise ShellSyntaxError if a quoted or expansion expression
    of token is not terminated.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            # Only literal text left
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            # The token is complete, so running out of input is an error
            raise ShellSyntaxError('Invalid token "%s"' % (token,))
        tree.append(result)
| 391 | |
| 392 | |
def wordtree_as_string(wtree):
    """Turn an expression tree generated by make_wordtree back into a string
    by concatenating all its leaves in order.
    """
    def flatten(node):
        pieces = []
        for child in node:
            if isinstance(child, list):
                pieces.append(flatten(child))
            else:
                pieces.append(child)
        return ''.join(pieces)

    return flatten(wtree)
| 405 | |
| 406 | |
def unquote_wordtree(wtree):
    """Flatten the word tree into a string, dropping the delimiters of
    literal, single quoted, double quoted and backslash expressions. Other
    expansion sequences are kept with their delimiters.
    """
    def strip_quotes(node):
        if node[0] in ('', "'", '"', '\\'):
            #Drop opening and closing delimiters
            node = node[1:-1]

        pieces = []
        for child in node:
            if isinstance(child, list):
                pieces.append(strip_quotes(child))
            else:
                pieces.append(child)
        return ''.join(pieces)

    return strip_quotes(wtree)
| 423 | |
| 424 | |
class HereDocLexer:
    """HereDocLexer delimits a here-document content: whatever comes after
    the newline starting the here-document (not included) up to the closing
    delimiter line (included, but not part of the returned content).
    """
    def __init__(self, op, delim):
        """op is the here-document operator ('<<' or '<<-'), delim the
        closing delimiter. Raise ShellSyntaxError if delim is empty.
        """
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        # Unprocessed input characters
        self._buffer = []
        # Content collected so far, as a sequence of lines and line endings
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.

        On success the lexer is reset: unconsumed data is handed back to the
        caller through remaining and is not kept internally.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _find_newline(self):
        """Return the index of the first unescaped newline of the buffer, or
        -1 if there is none. Quotes are ignored.
        """
        escaped = False
        for i,c in enumerate(self._buffer):
            if escaped:
                escaped = False
            elif c=='\\':
                escaped = True
            elif c=='\n':
                return i
        return -1

    def _parse(self, eof):
        """Move complete lines from the buffer to the content until the
        closing delimiter line is found. At eof, a missing closing delimiter
        is tolerated and the last, unterminated, line is processed too.
        Raise NeedMore if the delimiter is not found and eof is False.
        """
        while 1:
            i = self._find_newline()
            if i==-1:
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                # This operator strips leading tabulations from every line
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                # End of input reached without closing delimiter
                break
| 485 | |
class Token:
    """Token holding a type and a value, indexable like a (type, value)
    pair.
    """
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.type = None
        self.value = ''

    def __getitem__(self, key):
        """Return the type for index 0 and the value for index 1. Raise
        IndexError for any other key.
        """
        #Behave like a two elements tuple
        if key==0:
            item = self.type
        elif key==1:
            item = self.value
        else:
            raise IndexError(key)
        return item
| 502 | |
| 503 | |
class HereDoc:
    """State of a here-document being detected."""
    def __init__(self, op, name=None):
        # Tokens put aside until the here-document content is available
        self.pendings = []
        # Here-document delimiter, None while it is not known
        self.name = name
        # Here-document operator
        self.op = op
| 509 | |
# Token types used by Lexer in addition to the operator and reserved word
# names.
# ';' separator
TK_COMMA = 'COMMA'
# '&' separator
TK_AMPERSAND = 'AMP'
# Provisional type of a token being delimited as an operator; mapped to the
# specific operator name, or back to TK_TOKEN, when the token is pushed
TK_OP = 'OP'
# Default type
TK_TOKEN = 'TOKEN'
# Comments, discarded when pushed
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
# Digits immediately followed by '<' or '>'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
# Word following a here-document operator
TK_HERENAME = 'HERENAME'
| 519 | |
class Lexer:
    """Main lexer.

    Feed it with add(). Every delimited token is reported through
    on_token(), which must be implemented by subclasses.
    """
    # Here-document handling makes the whole thing more complex because they basically
    # force tokens to be reordered: here-content must come right after the operator
    # and the here-document name, while some other tokens might be following the
    # here-document expression on the same line.
    #
    # So, here-doc states are basically:
    #   *self._state==ST_NORMAL
    #       - self._heredoc.op is None: no here-document
    #       - self._heredoc.op is not None but name is: here-document operator matched,
    #           waiting for the document name/delimiter
    #       - self._heredoc.op and name are not None: here-document is ready, following
    #           tokens are being stored and will be pushed again when the document is
    #           completely parsed.
    #   *self._state==ST_HEREDOC
    #       - The here-document is being delimited by self._herelexer. Once it is done
    #           the content is pushed in front of the pending token list then all these
    #           tokens are pushed once again.
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        # Input characters and position of the next one to process
        self._input = []
        self._pos = 0

        # Token being delimited and its type
        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting token and can safely
        ### be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, returns unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        # Drop consumed characters and keep the position consistent with the
        # trimmed input
        self._input[:self._pos] = []
        self._pos = 0
        return ''.join(self._input)

    def _parse(self, eof):
        """Run the state machine over the pending input.

        Raise NeedMore when the input is exhausted and eof is False. Raise
        ShellSyntaxError if the input ends while a here-document is expected.
        """
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexer handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        """Process one character outside of operators, quotes, comments and
        here-documents. The caller guarantees one is available.
        """
        c = self._input[self._pos]
        if c=='\n':
            # Delimit the current token, then emit the newline itself
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            # Leave the character in the input for the word lexer
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            # '#' only starts a comment at the beginning of a token
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        """Extend the current operator as long as possible then push it."""
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        """Accumulate the comment characters up to the end of line, which is
        not consumed.

        When the input is exhausted first, return with the state unchanged:
        _parse() then either asks for more data or, at end of input, closes
        the comment like any other token.
        """
        while self._pos<len(self._input):
            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            self._token += c
            self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        #Transfer input queue character into the subparser. There may be
        #none left when called again at end of input.
        pending = self._input[self._pos:]
        self._pos += len(pending)

        wtree, remaining = self._wordlexer.add(pending, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed character back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        """Delimit the here-document content, then push it followed by the
        tokens which were put aside while waiting for it.
        """
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        #Transfer input queue character into the subparser. There may be
        #none left when called again at end of input.
        pending = self._input[self._pos:]
        self._pos += len(pending)

        self._token, remaining = self._herelexer.add(pending, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, toktype, delim in heredoc.pendings:
            self._token = token
            self._type = toktype
            self._push_token(delim)

    def _push_token(self, delim):
        """Finalize the current token, if any. delim is the character ending
        the token, or an empty string.

        The token is either put aside (here-document pending), discarded
        (comment) or classified and passed to on_token(). Return 1 if the
        token was stored or emitted, 0 otherwise.
        """
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the function:
        # when parsing between an here-document operator to the end of the line
        # tokens are stored in self._heredoc.pendings. Therefore, they will not
        # reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        """Called with a (value, type) tuple for every delimited token. Must
        be overridden.
        """
        raise NotImplementedError
| 823 | |
| 824 | |
# List of all the token types: the generic ones first, followed by the
# specific operators then by the reserved words.
tokens = [
    TK_TOKEN,
    # TK_COMMENT is left out to silence yacc unused token warnings
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]
tokens.extend(_OPERATORS.values())
tokens.extend(_RESERVEDS.values())
| 839 | |
class PLYLexer(Lexer):
    """Adapter exposing Lexer through the PLY lexer interface: tokens are
    stored as they are delimited and handed out one by one by token().
    """
    def __init__(self):
        Lexer.__init__(self)
        # Delimited tokens and index of the next one to return
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        """Wrap the (value, type) tuple in a PLY token and store it."""
        value, toktype = token

        # Positions are not tracked, everything is reported at zero
        self.lineno = 0
        plytoken = lex.LexToken()
        plytoken.value = value
        plytoken.type = toktype
        plytoken.lexer = self
        plytoken.lexpos = 0
        plytoken.lineno = 0

        self._tokens.append(plytoken)

    def is_empty(self):
        """Return True if no token was delimited so far."""
        return len(self._tokens)==0

    #PLY compliant interface
    def token(self):
        """Return the next stored token, or None when there is none left."""
        try:
            plytoken = self._tokens[self._current]
        except IndexError:
            return None
        self._current += 1
        return plytoken
| 871 | |
| 872 | |
def get_tokens(s):
    """Lex the whole input string s.

    Return a tuple (tokens, unprocessed) where tokens is the list of
    (value, type) tuples of the parsed tokens and unprocessed is the part of
    the input string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    # lexer.token() returns None once all the tokens have been handed out
    pairs = [(t.value, t.type) for t in iter(lexer.token, None)]
    return pairs, untouched