Blame - poky/bitbake/lib/bb/pysh/pyshlex.py - mdmillerii/openbmc

blob: fbf094b7a9cbbc2ac8d9cbe9d74242b411b47a0d [file] [log] [blame]

Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	1	# pyshlex.py - PLY compatible lexer for pysh.
				2	#
				3	# Copyright 2007 Patrick Mezard
				4	#
				5	# This software may be used and distributed according to the terms
				6	# of the GNU General Public License, incorporated herein by reference.
				7
				8	# TODO:
				9	# - review all "char in 'abc'" snippets: the empty string can be matched
				10	# - test line continuations within quoted/expansion strings
				11	# - eof is buggy wrt sublexers
				12	# - the lexer cannot really work in pull mode as it would be required to run
				13	# PLY in pull mode. It was designed to work incrementally and it would not be
				14	# that hard to enable pull mode.
				15	import re
				16	try:
				17	s = set()
				18	del s
				19	except NameError:
				20	from Set import Set as set
				21
				22	from ply import lex
Patrick Williams	c0f7c04	2017-02-23 20:41:17 -0600	[diff] [blame]	23	from bb.pysh.sherrors import *
Patrick Williams	c124f4f	2015-09-15 14:41:29 -0500	[diff] [blame]	24
				25	class NeedMore(Exception):
				26	pass
				27
				28	def is_blank(c):
				29	return c in (' ', '\t')
				30
				31	_RE_DIGITS = re.compile(r'^\d+$')
				32
				33	def are_digits(s):
				34	return _RE_DIGITS.search(s) is not None
				35
				36	_OPERATORS = dict([
				37	('&&', 'AND_IF'),
				38	('\|\|', 'OR_IF'),
				39	(';;', 'DSEMI'),
				40	('<<', 'DLESS'),
				41	('>>', 'DGREAT'),
				42	('<&', 'LESSAND'),
				43	('>&', 'GREATAND'),
				44	('<>', 'LESSGREAT'),
				45	('<<-', 'DLESSDASH'),
				46	('>\|', 'CLOBBER'),
				47	('&', 'AMP'),
				48	(';', 'COMMA'),
				49	('<', 'LESS'),
				50	('>', 'GREATER'),
				51	('(', 'LPARENS'),
				52	(')', 'RPARENS'),
				53	])
				54
				55	#Make a function to silence pychecker "Local variable shadows global"
				56	def make_partial_ops():
				57	partials = {}
				58	for k in _OPERATORS:
				59	for i in range(1, len(k)+1):
				60	partials[k[:i]] = None
				61	return partials
				62
				63	_PARTIAL_OPERATORS = make_partial_ops()
				64
				65	def is_partial_op(s):
				66	"""Return True if s matches a non-empty subpart of an operator starting
				67	at its first character.
				68	"""
				69	return s in _PARTIAL_OPERATORS
				70
				71	def is_op(s):
				72	"""If s matches an operator, returns the operator identifier. Return None
				73	otherwise.
				74	"""
				75	return _OPERATORS.get(s)
				76
				77	_RESERVEDS = dict([
				78	('if', 'If'),
				79	('then', 'Then'),
				80	('else', 'Else'),
				81	('elif', 'Elif'),
				82	('fi', 'Fi'),
				83	('do', 'Do'),
				84	('done', 'Done'),
				85	('case', 'Case'),
				86	('esac', 'Esac'),
				87	('while', 'While'),
				88	('until', 'Until'),
				89	('for', 'For'),
				90	('{', 'Lbrace'),
				91	('}', 'Rbrace'),
				92	('!', 'Bang'),
				93	('in', 'In'),
				94	('\|', 'PIPE'),
				95	])
				96
				97	def get_reserved(s):
				98	return _RESERVEDS.get(s)
				99
				100	_RE_NAME = re.compile(r'^[0-9a-zA-Z_]+$')
				101
				102	def is_name(s):
				103	return _RE_NAME.search(s) is not None
				104
				105	def find_chars(seq, chars):
				106	for i,v in enumerate(seq):
				107	if v in chars:
				108	return i,v
				109	return -1, None
				110
				111	class WordLexer:
				112	"""WordLexer parse quoted or expansion expressions and return an expression
				113	tree. The input string can be any well formed sequence beginning with quoting
				114	or expansion character. Embedded expressions are handled recursively. The
				115	resulting tree is made of lists and strings. Lists represent quoted or
				116	expansion expressions. Each list first element is the opening separator,
				117	the last one the closing separator. In-between can be any number of strings
				118	or lists for sub-expressions. Non quoted/expansion expression can written as
				119	strings or as lists with empty strings as starting and ending delimiters.
				120	"""
				121
				122	NAME_CHARSET = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
				123	NAME_CHARSET = dict(zip(NAME_CHARSET, NAME_CHARSET))
				124
				125	SPECIAL_CHARSET = '@*#?-$!0'
				126
				127	#Characters which can be escaped depends on the current delimiters
				128	ESCAPABLE = {
				129	'`': set(['$', '\\', '`']),
				130	'"': set(['$', '\\', '`', '"']),
				131	"'": set(),
				132	}
				133
				134	def __init__(self, heredoc = False):
				135	# _buffer is the unprocessed input characters buffer
				136	self._buffer = []
				137	# _stack is empty or contains a quoted list being processed
				138	# (this is the DFS path to the quoted expression being evaluated).
				139	self._stack = []
				140	self._escapable = None
				141	# True when parsing unquoted here documents
				142	self._heredoc = heredoc
				143
				144	def add(self, data, eof=False):
				145	"""Feed the lexer with more data. If the quoted expression can be
				146	delimited, return a tuple (expr, remaining) containing the expression
				147	tree and the unconsumed data.
				148	Otherwise, raise NeedMore.
				149	"""
				150	self._buffer += list(data)
				151	self._parse(eof)
				152
				153	result = self._stack[0]
				154	remaining = ''.join(self._buffer)
				155	self._stack = []
				156	self._buffer = []
				157	return result, remaining
				158
				159	def _is_escapable(self, c, delim=None):
				160	if delim is None:
				161	if self._heredoc:
				162	# Backslashes works as if they were double quoted in unquoted
				163	# here-documents
				164	delim = '"'
				165	else:
				166	if len(self._stack)<=1:
				167	return True
				168	delim = self._stack[-2][0]
				169
				170	escapables = self.ESCAPABLE.get(delim, None)
				171	return escapables is None or c in escapables
				172
				173	def _parse_squote(self, buf, result, eof):
				174	if not buf:
				175	raise NeedMore()
				176	try:
				177	pos = buf.index("'")
				178	except ValueError:
				179	raise NeedMore()
				180	result[-1] += ''.join(buf[:pos])
				181	result += ["'"]
				182	return pos+1, True
				183
				184	def _parse_bquote(self, buf, result, eof):
				185	if not buf:
				186	raise NeedMore()
				187
				188	if buf[0]=='\n':
				189	#Remove line continuations
				190	result[:] = ['', '', '']
				191	elif self._is_escapable(buf[0]):
				192	result[-1] += buf[0]
				193	result += ['']
				194	else:
				195	#Keep as such
				196	result[:] = ['', '\\'+buf[0], '']
				197
				198	return 1, True
				199
				200	def _parse_dquote(self, buf, result, eof):
				201	if not buf:
				202	raise NeedMore()
				203	pos, sep = find_chars(buf, '$\\`"')
				204	if pos==-1:
				205	raise NeedMore()
				206
				207	result[-1] += ''.join(buf[:pos])
				208	if sep=='"':
				209	result += ['"']
				210	return pos+1, True
				211	else:
				212	#Keep everything until the separator and defer processing
				213	return pos, False
				214
				215	def _parse_command(self, buf, result, eof):
				216	if not buf:
				217	raise NeedMore()
				218
				219	chars = '$\\`"\''
				220	if result[0] == '$(':
				221	chars += ')'
				222	pos, sep = find_chars(buf, chars)
				223	if pos == -1:
				224	raise NeedMore()
				225
				226	result[-1] += ''.join(buf[:pos])
				227	if (result[0]=='$(' and sep==')') or (result[0]=='`' and sep=='`'):
				228	result += [sep]
				229	return pos+1, True
				230	else:
				231	return pos, False
				232
				233	def _parse_parameter(self, buf, result, eof):
				234	if not buf:
				235	raise NeedMore()
				236
				237	pos, sep = find_chars(buf, '$\\`"\'}')
				238	if pos==-1:
				239	raise NeedMore()
				240
				241	result[-1] += ''.join(buf[:pos])
				242	if sep=='}':
				243	result += [sep]
				244	return pos+1, True
				245	else:
				246	return pos, False
				247
				248	def _parse_dollar(self, buf, result, eof):
				249	sep = result[0]
				250	if sep=='$':
				251	if not buf:
				252	#TODO: handle empty $
				253	raise NeedMore()
				254	if buf[0]=='(':
				255	if len(buf)==1:
				256	raise NeedMore()
				257
				258	if buf[1]=='(':
				259	result[0] = '$(('
				260	buf[:2] = []
				261	else:
				262	result[0] = '$('
				263	buf[:1] = []
				264
				265	elif buf[0]=='{':
				266	result[0] = '${'
				267	buf[:1] = []
				268	else:
				269	if buf[0] in self.SPECIAL_CHARSET:
				270	result[-1] = buf[0]
				271	read = 1
				272	else:
				273	for read,c in enumerate(buf):
				274	if c not in self.NAME_CHARSET:
				275	break
				276	else:
				277	if not eof:
				278	raise NeedMore()
				279	read += 1
				280
				281	result[-1] += ''.join(buf[0:read])
				282
				283	if not result[-1]:
				284	result[:] = ['', result[0], '']
				285	else:
				286	result += ['']
				287	return read,True
				288
				289	sep = result[0]
				290	if sep=='$(':
				291	parsefunc = self._parse_command
				292	elif sep=='${':
				293	parsefunc = self._parse_parameter
				294	else:
				295	raise NotImplementedError(sep)
				296
				297	pos, closed = parsefunc(buf, result, eof)
				298	return pos, closed
				299
				300	def _parse(self, eof):
				301	buf = self._buffer
				302	stack = self._stack
				303	recurse = False
				304
				305	while 1:
				306	if not stack or recurse:
				307	if not buf:
				308	raise NeedMore()
				309	if buf[0] not in ('"\\`$\''):
				310	raise ShellSyntaxError('Invalid quoted string sequence')
				311	stack.append([buf[0], ''])
				312	buf[:1] = []
				313	recurse = False
				314
				315	result = stack[-1]
				316	if result[0]=="'":
				317	parsefunc = self._parse_squote
				318	elif result[0]=='\\':
				319	parsefunc = self._parse_bquote
				320	elif result[0]=='"':
				321	parsefunc = self._parse_dquote
				322	elif result[0]=='`':
				323	parsefunc = self._parse_command
				324	elif result[0][0]=='$':
				325	parsefunc = self._parse_dollar
				326	else:
				327	raise NotImplementedError()
				328
				329	read, closed = parsefunc(buf, result, eof)
				330
				331	buf[:read] = []
				332	if closed:
				333	if len(stack)>1:
				334	#Merge in parent expression
				335	parsed = stack.pop()
				336	stack[-1] += [parsed]
				337	stack[-1] += ['']
				338	else:
				339	break
				340	else:
				341	recurse = True
				342
				343	def normalize_wordtree(wtree):
				344	"""Fold back every literal sequence (delimited with empty strings) into
				345	parent sequence.
				346	"""
				347	def normalize(wtree):
				348	result = []
				349	for part in wtree[1:-1]:
				350	if isinstance(part, list):
				351	part = normalize(part)
				352	if part[0]=='':
				353	#Move the part content back at current level
				354	result += part[1:-1]
				355	continue
				356	elif not part:
				357	#Remove empty strings
				358	continue
				359	result.append(part)
				360	if not result:
				361	result = ['']
				362	return [wtree[0]] + result + [wtree[-1]]
				363
				364	return normalize(wtree)
				365
				366
				367	def make_wordtree(token, here_document=False):
				368	"""Parse a delimited token and return a tree similar to the ones returned by
				369	WordLexer. token may contain any combinations of expansion/quoted fields and
				370	non-ones.
				371	"""
				372	tree = ['']
				373	remaining = token
				374	delimiters = '\\$`'
				375	if not here_document:
				376	delimiters += '\'"'
				377
				378	while 1:
				379	pos, sep = find_chars(remaining, delimiters)
				380	if pos==-1:
				381	tree += [remaining, '']
				382	return normalize_wordtree(tree)
				383	tree.append(remaining[:pos])
				384	remaining = remaining[pos:]
				385
				386	try:
				387	result, remaining = WordLexer(heredoc = here_document).add(remaining, True)
				388	except NeedMore:
				389	raise ShellSyntaxError('Invalid token "%s"')
				390	tree.append(result)
				391
				392
				393	def wordtree_as_string(wtree):
				394	"""Rewrite an expression tree generated by make_wordtree as string."""
				395	def visit(node, output):
				396	for child in node:
				397	if isinstance(child, list):
				398	visit(child, output)
				399	else:
				400	output.append(child)
				401
				402	output = []
				403	visit(wtree, output)
				404	return ''.join(output)
				405
				406
				407	def unquote_wordtree(wtree):
				408	"""Fold the word tree while removing quotes everywhere. Other expansion
				409	sequences are joined as such.
				410	"""
				411	def unquote(wtree):
				412	unquoted = []
				413	if wtree[0] in ('', "'", '"', '\\'):
				414	wtree = wtree[1:-1]
				415
				416	for part in wtree:
				417	if isinstance(part, list):
				418	part = unquote(part)
				419	unquoted.append(part)
				420	return ''.join(unquoted)
				421
				422	return unquote(wtree)
				423
				424
				425	class HereDocLexer:
				426	"""HereDocLexer delimits whatever comes from the here-document starting newline
				427	not included to the closing delimiter line included.
				428	"""
				429	def __init__(self, op, delim):
				430	assert op in ('<<', '<<-')
				431	if not delim:
				432	raise ShellSyntaxError('invalid here document delimiter %s' % str(delim))
				433
				434	self._op = op
				435	self._delim = delim
				436	self._buffer = []
				437	self._token = []
				438
				439	def add(self, data, eof):
				440	"""If the here-document was delimited, return a tuple (content, remaining).
				441	Raise NeedMore() otherwise.
				442	"""
				443	self._buffer += list(data)
				444	self._parse(eof)
				445	token = ''.join(self._token)
				446	remaining = ''.join(self._buffer)
				447	self._token, self._remaining = [], []
				448	return token, remaining
				449
				450	def _parse(self, eof):
				451	while 1:
				452	#Look for first unescaped newline. Quotes may be ignored
				453	escaped = False
				454	for i,c in enumerate(self._buffer):
				455	if escaped:
				456	escaped = False
				457	elif c=='\\':
				458	escaped = True
				459	elif c=='\n':
				460	break
				461	else:
				462	i = -1
				463
				464	if i==-1 or self._buffer[i]!='\n':
				465	if not eof:
				466	raise NeedMore()
				467	#No more data, maybe the last line is closing delimiter
				468	line = ''.join(self._buffer)
				469	eol = ''
				470	self._buffer[:] = []
				471	else:
				472	line = ''.join(self._buffer[:i])
				473	eol = self._buffer[i]
				474	self._buffer[:i+1] = []
				475
				476	if self._op=='<<-':
				477	line = line.lstrip('\t')
				478
				479	if line==self._delim:
				480	break
				481
				482	self._token += [line, eol]
				483	if i==-1:
				484	break
				485
				486	class Token:
				487	#TODO: check this is still in use
				488	OPERATOR = 'OPERATOR'
				489	WORD = 'WORD'
				490
				491	def __init__(self):
				492	self.value = ''
				493	self.type = None
				494
				495	def __getitem__(self, key):
				496	#Behave like a two elements tuple
				497	if key==0:
				498	return self.type
				499	if key==1:
				500	return self.value
				501	raise IndexError(key)
				502
				503
				504	class HereDoc:
				505	def __init__(self, op, name=None):
				506	self.op = op
				507	self.name = name
				508	self.pendings = []
				509
				510	TK_COMMA = 'COMMA'
				511	TK_AMPERSAND = 'AMP'
				512	TK_OP = 'OP'
				513	TK_TOKEN = 'TOKEN'
				514	TK_COMMENT = 'COMMENT'
				515	TK_NEWLINE = 'NEWLINE'
				516	TK_IONUMBER = 'IO_NUMBER'
				517	TK_ASSIGNMENT = 'ASSIGNMENT_WORD'
				518	TK_HERENAME = 'HERENAME'
				519
				520	class Lexer:
				521	"""Main lexer.
				522
				523	Call add() until the script AST is returned.
				524	"""
				525	# Here-document handling makes the whole thing more complex because they basically
				526	# force tokens to be reordered: here-content must come right after the operator
				527	# and the here-document name, while some other tokens might be following the
				528	# here-document expression on the same line.
				529	#
				530	# So, here-doc states are basically:
				531	# *self._state==ST_NORMAL
				532	# - self._heredoc.op is None: no here-document
				533	# - self._heredoc.op is not None but name is: here-document operator matched,
				534	# waiting for the document name/delimiter
				535	# - self._heredoc.op and name are not None: here-document is ready, following
				536	# tokens are being stored and will be pushed again when the document is
				537	# completely parsed.
				538	# *self._state==ST_HEREDOC
				539	# - The here-document is being delimited by self._herelexer. Once it is done
				540	# the content is pushed in front of the pending token list then all these
				541	# tokens are pushed once again.
				542	ST_NORMAL = 'ST_NORMAL'
				543	ST_OP = 'ST_OP'
				544	ST_BACKSLASH = 'ST_BACKSLASH'
				545	ST_QUOTED = 'ST_QUOTED'
				546	ST_COMMENT = 'ST_COMMENT'
				547	ST_HEREDOC = 'ST_HEREDOC'
				548
				549	#Match end of backquote strings
				550	RE_BACKQUOTE_END = re.compile(r'(?<!\\)(`)')
				551
				552	def __init__(self, parent_state = None):
				553	self._input = []
				554	self._pos = 0
				555
				556	self._token = ''
				557	self._type = TK_TOKEN
				558
				559	self._state = self.ST_NORMAL
				560	self._parent_state = parent_state
				561	self._wordlexer = None
				562
				563	self._heredoc = HereDoc(None)
				564	self._herelexer = None
				565
				566	### Following attributes are not used for delimiting token and can safely
				567	### be changed after here-document detection (see _push_toke)
				568
				569	# Count the number of tokens following a 'For' reserved word. Needed to
				570	# return an 'In' reserved word if it comes in third place.
				571	self._for_count = None
				572
				573	def add(self, data, eof=False):
				574	"""Feed the lexer with data.
				575
				576	When eof is set to True, returns unconsumed data or raise if the lexer
				577	is in the middle of a delimiting operation.
				578	Raise NeedMore otherwise.
				579	"""
				580	self._input += list(data)
				581	self._parse(eof)
				582	self._input[:self._pos] = []
				583	return ''.join(self._input)
				584
				585	def _parse(self, eof):
				586	while self._state:
				587	if self._pos>=len(self._input):
				588	if not eof:
				589	raise NeedMore()
				590	elif self._state not in (self.ST_OP, self.ST_QUOTED, self.ST_HEREDOC):
				591	#Delimit the current token and leave cleanly
				592	self._push_token('')
				593	break
				594	else:
				595	#Let the sublexer handle the eof themselves
				596	pass
				597
				598	if self._state==self.ST_NORMAL:
				599	self._parse_normal()
				600	elif self._state==self.ST_COMMENT:
				601	self._parse_comment()
				602	elif self._state==self.ST_OP:
				603	self._parse_op(eof)
				604	elif self._state==self.ST_QUOTED:
				605	self._parse_quoted(eof)
				606	elif self._state==self.ST_HEREDOC:
				607	self._parse_heredoc(eof)
				608	else:
				609	assert False, "Unknown state " + str(self._state)
				610
				611	if self._heredoc.op is not None:
				612	raise ShellSyntaxError('missing here-document delimiter')
				613
				614	def _parse_normal(self):
				615	c = self._input[self._pos]
				616	if c=='\n':
				617	self._push_token(c)
				618	self._token = c
				619	self._type = TK_NEWLINE
				620	self._push_token('')
				621	self._pos += 1
				622	elif c in ('\\', '\'', '"', '`', '$'):
				623	self._state = self.ST_QUOTED
				624	elif is_partial_op(c):
				625	self._push_token(c)
				626
				627	self._type = TK_OP
				628	self._token += c
				629	self._pos += 1
				630	self._state = self.ST_OP
				631	elif is_blank(c):
				632	self._push_token(c)
				633
				634	#Discard blanks
				635	self._pos += 1
				636	elif self._token:
				637	self._token += c
				638	self._pos += 1
				639	elif c=='#':
				640	self._state = self.ST_COMMENT
				641	self._type = TK_COMMENT
				642	self._pos += 1
				643	else:
				644	self._pos += 1
				645	self._token += c
				646
				647	def _parse_op(self, eof):
				648	assert self._token
				649
				650	while 1:
				651	if self._pos>=len(self._input):
				652	if not eof:
				653	raise NeedMore()
				654	c = ''
				655	else:
				656	c = self._input[self._pos]
				657
				658	op = self._token + c
				659	if c and is_partial_op(op):
				660	#Still parsing an operator
				661	self._token = op
				662	self._pos += 1
				663	else:
				664	#End of operator
				665	self._push_token(c)
				666	self._state = self.ST_NORMAL
				667	break
				668
				669	def _parse_comment(self):
				670	while 1:
				671	if self._pos>=len(self._input):
				672	raise NeedMore()
				673
				674	c = self._input[self._pos]
				675	if c=='\n':
				676	#End of comment, do not consume the end of line
				677	self._state = self.ST_NORMAL
				678	break
				679	else:
				680	self._token += c
				681	self._pos += 1
				682
				683	def _parse_quoted(self, eof):
				684	"""Precondition: the starting backquote/dollar is still in the input queue."""
				685	if not self._wordlexer:
				686	self._wordlexer = WordLexer()
				687
				688	if self._pos<len(self._input):
				689	#Transfer input queue character into the subparser
				690	input = self._input[self._pos:]
				691	self._pos += len(input)
				692
				693	wtree, remaining = self._wordlexer.add(input, eof)
				694	self._wordlexer = None
				695	self._token += wordtree_as_string(wtree)
				696
				697	#Put unparsed character back in the input queue
				698	if remaining:
				699	self._input[self._pos:self._pos] = list(remaining)
				700	self._state = self.ST_NORMAL
				701
				702	def _parse_heredoc(self, eof):
				703	assert not self._token
				704
				705	if self._herelexer is None:
				706	self._herelexer = HereDocLexer(self._heredoc.op, self._heredoc.name)
				707
				708	if self._pos<len(self._input):
				709	#Transfer input queue character into the subparser
				710	input = self._input[self._pos:]
				711	self._pos += len(input)
				712
				713	self._token, remaining = self._herelexer.add(input, eof)
				714
				715	#Reset here-document state
				716	self._herelexer = None
				717	heredoc, self._heredoc = self._heredoc, HereDoc(None)
				718	if remaining:
				719	self._input[self._pos:self._pos] = list(remaining)
				720	self._state = self.ST_NORMAL
				721
				722	#Push pending tokens
				723	heredoc.pendings[:0] = [(self._token, self._type, heredoc.name)]
				724	for token, type, delim in heredoc.pendings:
				725	self._token = token
				726	self._type = type
				727	self._push_token(delim)
				728
				729	def _push_token(self, delim):
				730	if not self._token:
				731	return 0
				732
				733	if self._heredoc.op is not None:
				734	if self._heredoc.name is None:
				735	#Here-document name
				736	if self._type!=TK_TOKEN:
				737	raise ShellSyntaxError("expecting here-document name, got '%s'" % self._token)
				738	self._heredoc.name = unquote_wordtree(make_wordtree(self._token))
				739	self._type = TK_HERENAME
				740	else:
				741	#Capture all tokens until the newline starting the here-document
				742	if self._type==TK_NEWLINE:
				743	assert self._state==self.ST_NORMAL
				744	self._state = self.ST_HEREDOC
				745
				746	self._heredoc.pendings.append((self._token, self._type, delim))
				747	self._token = ''
				748	self._type = TK_TOKEN
				749	return 1
				750
				751	# BEWARE: do not change parser state from here to the end of the function:
				752	# when parsing between an here-document operator to the end of the line
				753	# tokens are stored in self._heredoc.pendings. Therefore, they will not
				754	# reach the section below.
				755
				756	#Check operators
				757	if self._type==TK_OP:
				758	#False positive because of partial op matching
				759	op = is_op(self._token)
				760	if not op:
				761	self._type = TK_TOKEN
				762	else:
				763	#Map to the specific operator
				764	self._type = op
				765	if self._token in ('<<', '<<-'):
				766	#Done here rather than in _parse_op because there is no need
				767	#to change the parser state since we are still waiting for
				768	#the here-document name
				769	if self._heredoc.op is not None:
				770	raise ShellSyntaxError("syntax error near token '%s'" % self._token)
				771	assert self._heredoc.op is None
				772	self._heredoc.op = self._token
				773
				774	if self._type==TK_TOKEN:
				775	if '=' in self._token and not delim:
				776	if self._token.startswith('='):
				777	#Token is a WORD... a TOKEN that is.
				778	pass
				779	else:
				780	prev = self._token[:self._token.find('=')]
				781	if is_name(prev):
				782	self._type = TK_ASSIGNMENT
				783	else:
				784	#Just a token (unspecified)
				785	pass
				786	else:
				787	reserved = get_reserved(self._token)
				788	if reserved is not None:
				789	if reserved=='In' and self._for_count!=2:
				790	#Sorry, not a reserved word after all
				791	pass
				792	else:
				793	self._type = reserved
				794	if reserved in ('For', 'Case'):
				795	self._for_count = 0
				796	elif are_digits(self._token) and delim in ('<', '>'):
				797	#Detect IO_NUMBER
				798	self._type = TK_IONUMBER
				799	elif self._token==';':
				800	self._type = TK_COMMA
				801	elif self._token=='&':
				802	self._type = TK_AMPERSAND
				803	elif self._type==TK_COMMENT:
				804	#Comments are not part of sh grammar, ignore them
				805	self._token = ''
				806	self._type = TK_TOKEN
				807	return 0
				808
				809	if self._for_count is not None:
				810	#Track token count in 'For' expression to detect 'In' reserved words.
				811	#Can only be in third position, no need to go beyond
				812	self._for_count += 1
				813	if self._for_count==3:
				814	self._for_count = None
				815
				816	self.on_token((self._token, self._type))
				817	self._token = ''
				818	self._type = TK_TOKEN
				819	return 1
				820
				821	def on_token(self, token):
				822	raise NotImplementedError
				823
				824
				825	tokens = [
				826	TK_TOKEN,
				827	# To silence yacc unused token warnings
				828	# TK_COMMENT,
				829	TK_NEWLINE,
				830	TK_IONUMBER,
				831	TK_ASSIGNMENT,
				832	TK_HERENAME,
				833	]
				834
				835	#Add specific operators
				836	tokens += _OPERATORS.values()
				837	#Add reserved words
				838	tokens += _RESERVEDS.values()
				839
				840	class PLYLexer(Lexer):
				841	"""Bridge Lexer and PLY lexer interface."""
				842	def __init__(self):
				843	Lexer.__init__(self)
				844	self._tokens = []
				845	self._current = 0
				846	self.lineno = 0
				847
				848	def on_token(self, token):
				849	value, type = token
				850
				851	self.lineno = 0
				852	t = lex.LexToken()
				853	t.value = value
				854	t.type = type
				855	t.lexer = self
				856	t.lexpos = 0
				857	t.lineno = 0
				858
				859	self._tokens.append(t)
				860
				861	def is_empty(self):
				862	return not bool(self._tokens)
				863
				864	#PLY compliant interface
				865	def token(self):
				866	if self._current>=len(self._tokens):
				867	return None
				868	t = self._tokens[self._current]
				869	self._current += 1
				870	return t
				871
				872
				873	def get_tokens(s):
				874	"""Parse the input string and return a tuple (tokens, unprocessed) where
				875	tokens is a list of parsed tokens and unprocessed is the part of the input
				876	string left untouched by the lexer.
				877	"""
				878	lexer = PLYLexer()
				879	untouched = lexer.add(s, True)
				880	tokens = []
				881	while 1:
				882	token = lexer.token()
				883	if token is None:
				884	break
				885	tokens.append(token)
				886
				887	tokens = [(t.value, t.type) for t in tokens]
				888	return tokens, untouched