# pyshlex.py - PLY compatible lexer for pysh.
#
# Copyright 2007 Patrick Mezard
#
# This software may be used and distributed according to the terms
# of the GNU General Public License, incorporated herein by reference.

# TODO:
# - review all "char in 'abc'" snippets: the empty string can be matched
# - test line continuations within quoted/expansion strings
# - eof is buggy wrt sublexers
# - the lexer cannot really work in pull mode as it would be required to run
#   PLY in pull mode. It was designed to work incrementally and it would not be
#   that hard to enable pull mode.
import re

from ply import lex
from bb.pysh.sherrors import *

class NeedMore(Exception):
    pass

def is_blank(c):
    return c in (' ', '\t')

_RE_DIGITS = re.compile(r'^\d+$')

def are_digits(s):
    return _RE_DIGITS.search(s) is not None

_OPERATORS = dict([
    ('&&', 'AND_IF'),
    ('||', 'OR_IF'),
    (';;', 'DSEMI'),
    ('<<', 'DLESS'),
    ('>>', 'DGREAT'),
    ('<&', 'LESSAND'),
    ('>&', 'GREATAND'),
    ('<>', 'LESSGREAT'),
    ('<<-', 'DLESSDASH'),
    ('>|', 'CLOBBER'),
    ('&', 'AMP'),
    (';', 'COMMA'),
    ('<', 'LESS'),
    ('>', 'GREATER'),
    ('(', 'LPARENS'),
    (')', 'RPARENS'),
])

#Make a function to silence pychecker "Local variable shadows global"
def make_partial_ops():
    partials = {}
    for k in _OPERATORS:
        for i in range(1, len(k)+1):
            partials[k[:i]] = None
    return partials

_PARTIAL_OPERATORS = make_partial_ops()

def is_partial_op(s):
    """Return True if s matches a non-empty prefix of an operator, starting
    at its first character.
    """
    return s in _PARTIAL_OPERATORS

def is_op(s):
    """If s matches an operator, return the operator identifier. Return None
    otherwise.
    """
    return _OPERATORS.get(s)
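
#Illustrative examples (not in the original code): is_partial_op('<') and
#is_partial_op('<<-') are True; is_op('<<') returns 'DLESS' while
#is_op('<=') returns None.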

_RESERVEDS = dict([
    ('if', 'If'),
    ('then', 'Then'),
    ('else', 'Else'),
    ('elif', 'Elif'),
    ('fi', 'Fi'),
    ('do', 'Do'),
    ('done', 'Done'),
    ('case', 'Case'),
    ('esac', 'Esac'),
    ('while', 'While'),
    ('until', 'Until'),
    ('for', 'For'),
    ('{', 'Lbrace'),
    ('}', 'Rbrace'),
    ('!', 'Bang'),
    ('in', 'In'),
    ('|', 'PIPE'),
])

def get_reserved(s):
    return _RESERVEDS.get(s)

_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')

def is_name(s):
    return _RE_NAME.search(s) is not None
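
#Illustrative note (not in the original code): is_name('FOO_1') is True; the
#regex also accepts a leading digit, so is_name('1FOO') is True as well.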

def find_chars(seq, chars):
    for i,v in enumerate(seq):
        if v in chars:
            return i,v
    return -1, None
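
#Illustrative example (not in the original code):
#  find_chars('abc$def', '$`') returns (3, '$')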

class WordLexer:
    """WordLexer parses quoted or expansion expressions and returns an
    expression tree. The input string can be any well-formed sequence
    beginning with a quoting or expansion character. Embedded expressions are
    handled recursively. The resulting tree is made of lists and strings.
    Lists represent quoted or expansion expressions. The first element of each
    list is the opening separator, the last one the closing separator.
    In-between can be any number of strings or lists for sub-expressions.
    Non-quoted/expansion expressions can be written as strings or as lists
    with empty strings as opening and closing delimiters.
    """

    NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))

    SPECIAL_CHARSET = '@*#?-$!0'

    #Which characters can be escaped depends on the current delimiter
    ESCAPABLE = {
        '`': set(['$', '\\', '`']),
        '"': set(['$', '\\', '`', '"']),
        "'": set(),
    }
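    #Illustrative note (not in the original code): within double quotes a
    #backslash only escapes '$', '\', '`' and '"'; within single quotes
    #nothing is escapable, so backslashes stay literal.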

    def __init__(self, heredoc = False):
        # _buffer is the buffer of unprocessed input characters
        self._buffer = []
        # _stack is empty or contains a quoted list being processed
        # (this is the DFS path to the quoted expression being evaluated).
        self._stack = []
        self._escapable = None
        # True when parsing unquoted here documents
        self._heredoc = heredoc

    def add(self, data, eof=False):
        """Feed the lexer with more data. If the quoted expression can be
        delimited, return a tuple (expr, remaining) containing the expression
        tree and the unconsumed data.
        Otherwise, raise NeedMore.
        """
        self._buffer += list(data)
        self._parse(eof)

        result = self._stack[0]
        remaining = ''.join(self._buffer)
        self._stack = []
        self._buffer = []
        return result, remaining

    def _is_escapable(self, c, delim=None):
        if delim is None:
            if self._heredoc:
                # Backslashes work as if they were double-quoted in unquoted
                # here-documents
                delim = '"'
            else:
                if len(self._stack)<=1:
                    return True
                delim = self._stack[-2][0]

        escapables = self.ESCAPABLE.get(delim, None)
        return escapables is None or c in escapables

    def _parse_squote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        try:
            pos = buf.index("'")
        except ValueError:
            raise NeedMore()
        result[-1] += ''.join(buf[:pos])
        result += ["'"]
        return pos+1, True

    def _parse_bquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        if buf[0]=='\n':
            #Remove line continuations
            result[:] = ['', '', '']
        elif self._is_escapable(buf[0]):
            result[-1] += buf[0]
            result += ['']
        else:
            #Keep as such
            result[:] = ['', '\\'+buf[0], '']

        return 1, True

    def _parse_dquote(self, buf, result, eof):
        if not buf:
            raise NeedMore()
        pos, sep = find_chars(buf, '$\\`"')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='"':
            result += ['"']
            return pos+1, True
        else:
            #Keep everything until the separator and defer processing
            return pos, False

    def _parse_command(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        chars = '$\\`"\''
        if result[0] == '$(':
            chars += ')'
        pos, sep = find_chars(buf, chars)
        if pos == -1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_parameter(self, buf, result, eof):
        if not buf:
            raise NeedMore()

        pos, sep = find_chars(buf, '$\\`"\'}')
        if pos==-1:
            raise NeedMore()

        result[-1] += ''.join(buf[:pos])
        if sep=='}':
            result += [sep]
            return pos+1, True
        else:
            return pos, False

    def _parse_dollar(self, buf, result, eof):
        sep = result[0]
        if sep=='$':
            if not buf:
                #TODO: handle empty $
                raise NeedMore()
            if buf[0]=='(':
                if len(buf)==1:
                    raise NeedMore()

                if buf[1]=='(':
                    result[0] = '$(('
                    buf[:2] = []
                else:
                    result[0] = '$('
                    buf[:1] = []

            elif buf[0]=='{':
                result[0] = '${'
                buf[:1] = []
            else:
                if buf[0] in self.SPECIAL_CHARSET:
                    result[-1] = buf[0]
                    read = 1
                else:
                    for read,c in enumerate(buf):
                        if c not in self.NAME_CHARSET:
                            break
                    else:
                        if not eof:
                            raise NeedMore()
                        read += 1

                result[-1] += ''.join(buf[0:read])

                if not result[-1]:
                    result[:] = ['', result[0], '']
                else:
                    result += ['']
                return read,True

        sep = result[0]
        if sep=='$(':
            parsefunc = self._parse_command
        elif sep=='${':
            parsefunc = self._parse_parameter
        else:
            raise NotImplementedError(sep)

        pos, closed = parsefunc(buf, result, eof)
        return pos, closed

    def _parse(self, eof):
        buf = self._buffer
        stack = self._stack
        recurse = False

        while 1:
            if not stack or recurse:
                if not buf:
                    raise NeedMore()
                if buf[0] not in ('"\\`$\''):
                    raise ShellSyntaxError('Invalid quoted string sequence')
                stack.append([buf[0], ''])
                buf[:1] = []
                recurse = False

            result = stack[-1]
            if result[0]=="'":
                parsefunc = self._parse_squote
            elif result[0]=='\\':
                parsefunc = self._parse_bquote
            elif result[0]=='"':
                parsefunc = self._parse_dquote
            elif result[0]=='`':
                parsefunc = self._parse_command
            elif result[0][0]=='$':
                parsefunc = self._parse_dollar
            else:
                raise NotImplementedError()

            read, closed = parsefunc(buf, result, eof)

            buf[:read] = []
            if closed:
                if len(stack)>1:
                    #Merge in parent expression
                    parsed = stack.pop()
                    stack[-1] += [parsed]
                    stack[-1] += ['']
                else:
                    break
            else:
                recurse = True

def normalize_wordtree(wtree):
    """Fold back every literal sequence (delimited with empty strings) into
    the parent sequence.
    """
    def normalize(wtree):
        result = []
        for part in wtree[1:-1]:
            if isinstance(part, list):
                part = normalize(part)
                if part[0]=='':
                    #Move the part content back at current level
                    result += part[1:-1]
                    continue
            elif not part:
                #Remove empty strings
                continue
            result.append(part)
        if not result:
            result = ['']
        return [wtree[0]] + result + [wtree[-1]]

    return normalize(wtree)


def make_wordtree(token, here_document=False):
    """Parse a delimited token and return a tree similar to the ones returned
    by WordLexer. token may contain any combination of expansion/quoted fields
    and plain ones.
    """
    tree = ['']
    remaining = token
    delimiters = '\\$`'
    if not here_document:
        delimiters += '\'"'

    while 1:
        pos, sep = find_chars(remaining, delimiters)
        if pos==-1:
            tree += [remaining, '']
            return normalize_wordtree(tree)
        tree.append(remaining[:pos])
        remaining = remaining[pos:]

        try:
            result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
        except NeedMore:
            raise ShellSyntaxError('Invalid token "%s"' % token)
        tree.append(result)


def wordtree_as_string(wtree):
    """Rewrite an expression tree generated by make_wordtree as a string."""
    def visit(node, output):
        for child in node:
            if isinstance(child, list):
                visit(child, output)
            else:
                output.append(child)

    output = []
    visit(wtree, output)
    return ''.join(output)


def unquote_wordtree(wtree):
    """Fold the word tree while removing quotes everywhere. Other expansion
    sequences are joined as-is.
    """
    def unquote(wtree):
        unquoted = []
        if wtree[0] in ('', "'", '"', '\\'):
            wtree = wtree[1:-1]

        for part in wtree:
            if isinstance(part, list):
                part = unquote(part)
            unquoted.append(part)
        return ''.join(unquoted)

    return unquote(wtree)


class HereDocLexer:
    """HereDocLexer delimits here-document content, from the starting newline
    (not included) to the closing delimiter line (included).
    """
    def __init__(self, op, delim):
        assert op in ('<<', '<<-')
        if not delim:
            raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))

        self._op = op
        self._delim = delim
        self._buffer = []
        self._token = []

    def add(self, data, eof):
        """If the here-document was delimited, return a tuple (content, remaining).
        Raise NeedMore() otherwise.
        """
        self._buffer += list(data)
        self._parse(eof)
        token = ''.join(self._token)
        remaining = ''.join(self._buffer)
        self._token, self._buffer = [], []
        return token, remaining

    def _parse(self, eof):
        while 1:
            #Look for first unescaped newline. Quotes may be ignored
            escaped = False
            for i,c in enumerate(self._buffer):
                if escaped:
                    escaped = False
                elif c=='\\':
                    escaped = True
                elif c=='\n':
                    break
            else:
                i = -1

            if i==-1 or self._buffer[i]!='\n':
                if not eof:
                    raise NeedMore()
                #No more data, maybe the last line is closing delimiter
                line = ''.join(self._buffer)
                eol = ''
                self._buffer[:] = []
            else:
                line = ''.join(self._buffer[:i])
                eol = self._buffer[i]
                self._buffer[:i+1] = []

            if self._op=='<<-':
                line = line.lstrip('\t')

            if line==self._delim:
                break

            self._token += [line, eol]
            if i==-1:
                break

class Token:
    #TODO: check this is still in use
    OPERATOR = 'OPERATOR'
    WORD = 'WORD'

    def __init__(self):
        self.value = ''
        self.type = None

    def __getitem__(self, key):
        #Behave like a two-element tuple
        if key==0:
            return self.type
        if key==1:
            return self.value
        raise IndexError(key)


class HereDoc:
    def __init__(self, op, name=None):
        self.op = op
        self.name = name
        self.pendings = []

TK_COMMA = 'COMMA'
TK_AMPERSAND = 'AMP'
TK_OP = 'OP'
TK_TOKEN = 'TOKEN'
TK_COMMENT = 'COMMENT'
TK_NEWLINE = 'NEWLINE'
TK_IONUMBER = 'IO_NUMBER'
TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
TK_HERENAME = 'HERENAME'

class Lexer:
    """Main lexer.

    Feed it data with add(); each delimited token is passed to on_token().
    """
    # Here-document handling makes the whole thing more complex because it
    # basically forces tokens to be reordered: here-document content must come
    # right after the operator and the here-document name, while some other
    # tokens might follow the here-document expression on the same line.
    #
    # So, here-doc states are basically:
    # *self._state==ST_NORMAL
    #   - self._heredoc.op is None: no here-document
    #   - self._heredoc.op is not None but name is: here-document operator
    #     matched, waiting for the document name/delimiter
    #   - self._heredoc.op and name are not None: here-document is ready,
    #     following tokens are being stored and will be pushed again when the
    #     document is completely parsed.
    # *self._state==ST_HEREDOC
    #   - The here-document is being delimited by self._herelexer. Once it is
    #     done the content is pushed in front of the pending token list then
    #     all these tokens are pushed once again.
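    # For instance (illustrative, not in the original code): for the input
    # "cat <<EOF >out\nhi\nEOF\n", the 'EOF' name, '>', 'out' and newline
    # tokens are buffered as pendings; once the document is delimited, the
    # 'hi\n' content token is pushed in front of them and all are replayed.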
    ST_NORMAL = 'ST_NORMAL'
    ST_OP = 'ST_OP'
    ST_BACKSLASH = 'ST_BACKSLASH'
    ST_QUOTED = 'ST_QUOTED'
    ST_COMMENT = 'ST_COMMENT'
    ST_HEREDOC = 'ST_HEREDOC'

    #Match end of backquote strings
    RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')

    def __init__(self, parent_state = None):
        self._input = []
        self._pos = 0

        self._token = ''
        self._type = TK_TOKEN

        self._state = self.ST_NORMAL
        self._parent_state = parent_state
        self._wordlexer = None

        self._heredoc = HereDoc(None)
        self._herelexer = None

        ### Following attributes are not used for delimiting tokens and can
        ### safely be changed after here-document detection (see _push_token)

        # Count the number of tokens following a 'For' reserved word. Needed to
        # return an 'In' reserved word if it comes in third place.
        self._for_count = None

    def add(self, data, eof=False):
        """Feed the lexer with data.

        When eof is set to True, return unconsumed data or raise if the lexer
        is in the middle of a delimiting operation.
        Raise NeedMore otherwise.
        """
        self._input += list(data)
        self._parse(eof)
        self._input[:self._pos] = []
        return ''.join(self._input)

    def _parse(self, eof):
        while self._state:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
                    #Delimit the current token and leave cleanly
                    self._push_token('')
                    break
                else:
                    #Let the sublexers handle the eof themselves
                    pass

            if self._state==self.ST_NORMAL:
                self._parse_normal()
            elif self._state==self.ST_COMMENT:
                self._parse_comment()
            elif self._state==self.ST_OP:
                self._parse_op(eof)
            elif self._state==self.ST_QUOTED:
                self._parse_quoted(eof)
            elif self._state==self.ST_HEREDOC:
                self._parse_heredoc(eof)
            else:
                assert False, "Unknown state " + str(self._state)

        if self._heredoc.op is not None:
            raise ShellSyntaxError('missing here-document delimiter')

    def _parse_normal(self):
        c = self._input[self._pos]
        if c=='\n':
            self._push_token(c)
            self._token = c
            self._type = TK_NEWLINE
            self._push_token('')
            self._pos += 1
        elif c in ('\\', '\'', '"', '`', '$'):
            self._state = self.ST_QUOTED
        elif is_partial_op(c):
            self._push_token(c)

            self._type = TK_OP
            self._token += c
            self._pos += 1
            self._state = self.ST_OP
        elif is_blank(c):
            self._push_token(c)

            #Discard blanks
            self._pos += 1
        elif self._token:
            self._token += c
            self._pos += 1
        elif c=='#':
            self._state = self.ST_COMMENT
            self._type = TK_COMMENT
            self._pos += 1
        else:
            self._pos += 1
            self._token += c

    def _parse_op(self, eof):
        assert self._token

        while 1:
            if self._pos>=len(self._input):
                if not eof:
                    raise NeedMore()
                c = ''
            else:
                c = self._input[self._pos]

            op = self._token + c
            if c and is_partial_op(op):
                #Still parsing an operator
                self._token = op
                self._pos += 1
            else:
                #End of operator
                self._push_token(c)
                self._state = self.ST_NORMAL
                break

    def _parse_comment(self):
        while 1:
            if self._pos>=len(self._input):
                raise NeedMore()

            c = self._input[self._pos]
            if c=='\n':
                #End of comment, do not consume the end of line
                self._state = self.ST_NORMAL
                break
            else:
                self._token += c
                self._pos += 1

    def _parse_quoted(self, eof):
        """Precondition: the starting backquote/dollar is still in the input queue."""
        if not self._wordlexer:
            self._wordlexer = WordLexer()

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        wtree, remaining = self._wordlexer.add(input, eof)
        self._wordlexer = None
        self._token += wordtree_as_string(wtree)

        #Put unparsed characters back in the input queue
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

    def _parse_heredoc(self, eof):
        assert not self._token

        if self._herelexer is None:
            self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)

        if self._pos<len(self._input):
            #Transfer input queue characters into the subparser
            input = self._input[self._pos:]
            self._pos += len(input)

        self._token, remaining = self._herelexer.add(input, eof)

        #Reset here-document state
        self._herelexer = None
        heredoc, self._heredoc = self._heredoc, HereDoc(None)
        if remaining:
            self._input[self._pos:self._pos] = list(remaining)
        self._state = self.ST_NORMAL

        #Push pending tokens
        heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
        for token, type, delim in heredoc.pendings:
            self._token = token
            self._type = type
            self._push_token(delim)

    def _push_token(self, delim):
        if not self._token:
            return 0

        if self._heredoc.op is not None:
            if self._heredoc.name is None:
                #Here-document name
                if self._type!=TK_TOKEN:
                    raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
                self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
                self._type = TK_HERENAME
            else:
                #Capture all tokens until the newline starting the here-document
                if self._type==TK_NEWLINE:
                    assert self._state==self.ST_NORMAL
                    self._state = self.ST_HEREDOC

                self._heredoc.pendings.append((self._token, self._type, delim))
                self._token = ''
                self._type = TK_TOKEN
                return 1

        # BEWARE: do not change parser state from here to the end of the
        # function: when parsing between a here-document operator and the end
        # of the line, tokens are stored in self._heredoc.pendings. Therefore,
        # they will not reach the section below.

        #Check operators
        if self._type==TK_OP:
            #False positive because of partial op matching
            op = is_op(self._token)
            if not op:
                self._type = TK_TOKEN
            else:
                #Map to the specific operator
                self._type = op
                if self._token in ('<<', '<<-'):
                    #Done here rather than in _parse_op because there is no need
                    #to change the parser state since we are still waiting for
                    #the here-document name
                    if self._heredoc.op is not None:
                        raise ShellSyntaxError("syntax error near token '%s'" % self._token)
                    assert self._heredoc.op is None
                    self._heredoc.op = self._token

        if self._type==TK_TOKEN:
            if '=' in self._token and not delim:
                if self._token.startswith('='):
                    #Token is a WORD... a TOKEN that is.
                    pass
                else:
                    prev = self._token[:self._token.find('=')]
                    if is_name(prev):
                        self._type = TK_ASSIGNMENT
                    else:
                        #Just a token (unspecified)
                        pass
            else:
                reserved = get_reserved(self._token)
                if reserved is not None:
                    if reserved=='In' and self._for_count!=2:
                        #Sorry, not a reserved word after all
                        pass
                    else:
                        self._type = reserved
                        if reserved in ('For', 'Case'):
                            self._for_count = 0
                elif are_digits(self._token) and delim in ('<', '>'):
                    #Detect IO_NUMBER
                    self._type = TK_IONUMBER
                elif self._token==';':
                    self._type = TK_COMMA
                elif self._token=='&':
                    self._type = TK_AMPERSAND
        elif self._type==TK_COMMENT:
            #Comments are not part of sh grammar, ignore them
            self._token = ''
            self._type = TK_TOKEN
            return 0

        if self._for_count is not None:
            #Track token count in 'For' expression to detect 'In' reserved words.
            #Can only be in third position, no need to go beyond
            self._for_count += 1
            if self._for_count==3:
                self._for_count = None

        self.on_token((self._token, self._type))
        self._token = ''
        self._type = TK_TOKEN
        return 1

    def on_token(self, token):
        raise NotImplementedError


tokens = [
    TK_TOKEN,
# To silence yacc unused token warnings
#    TK_COMMENT,
    TK_NEWLINE,
    TK_IONUMBER,
    TK_ASSIGNMENT,
    TK_HERENAME,
]

#Add specific operators
tokens += _OPERATORS.values()
#Add reserved words
tokens += _RESERVEDS.values()

class PLYLexer(Lexer):
    """Bridge Lexer and PLY lexer interface."""
    def __init__(self):
        Lexer.__init__(self)
        self._tokens = []
        self._current = 0
        self.lineno = 0

    def on_token(self, token):
        value, type = token

        self.lineno = 0
        t = lex.LexToken()
        t.value = value
        t.type = type
        t.lexer = self
        t.lexpos = 0
        t.lineno = 0

        self._tokens.append(t)

    def is_empty(self):
        return not bool(self._tokens)

    #PLY compliant interface
    def token(self):
        if self._current>=len(self._tokens):
            return None
        t = self._tokens[self._current]
        self._current += 1
        return t


def get_tokens(s):
    """Parse the input string and return a tuple (tokens, unprocessed) where
    tokens is a list of parsed tokens and unprocessed is the part of the input
    string left untouched by the lexer.
    """
    lexer = PLYLexer()
    untouched = lexer.add(s, True)
    tokens = []
    while 1:
        token = lexer.token()
        if token is None:
            break
        tokens.append(token)

    tokens = [(t.value, t.type) for t in tokens]
    return tokens, untouched