blob: 25938d6586eff8900bea91e473cb658f73fadff7 [file] [log] [blame]
Patrick Williamsc124f4f2015-09-15 14:41:29 -05001import ast
Patrick Williamsc0f7c042017-02-23 20:41:17 -06002import sys
Patrick Williamsc124f4f2015-09-15 14:41:29 -05003import codegen
4import logging
Patrick Williamsc0f7c042017-02-23 20:41:17 -06005import pickle
6import bb.pysh as pysh
Patrick Williamsc124f4f2015-09-15 14:41:29 -05007import os.path
8import bb.utils, bb.data
Patrick Williamsc0f7c042017-02-23 20:41:17 -06009import hashlib
Patrick Williamsc124f4f2015-09-15 14:41:29 -050010from itertools import chain
Patrick Williamsc0f7c042017-02-23 20:41:17 -060011from bb.pysh import pyshyacc, pyshlex, sherrors
Patrick Williamsc124f4f2015-09-15 14:41:29 -050012from bb.cache import MultiProcessCache
13
Patrick Williamsc124f4f2015-09-15 14:41:29 -050014logger = logging.getLogger('BitBake.CodeParser')
15
def bbhash(s):
    """Return the hexadecimal MD5 digest of the string *s*.

    The string is encoded as UTF-8 before hashing. The digest is used in
    this module as a cache key for parsed code, not for security.
    """
    digest = hashlib.md5()
    digest.update(s.encode("utf-8"))
    return digest.hexdigest()
Patrick Williamsc124f4f2015-09-15 14:41:29 -050018
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation.

    The leading run of newlines, tabs and spaces is examined. When that run
    ends in a tab or a space (i.e. the first real line is indented) the code
    is wrapped in an ``if 1:`` block so that it compiles as a top level
    suite. Otherwise the string is returned unchanged.

    Empty or whitespace-only input is returned unchanged: there is no code
    to de-indent, and scanning past the end of the string would otherwise
    raise IndexError.
    """
    body = codestr.lstrip("\n\t ")
    if not body:
        # Nothing but whitespace (or nothing at all): leave it alone.
        return codestr

    lead = len(codestr) - len(body)
    if lead == 0:
        return codestr

    if codestr[lead - 1] in ("\t", " "):
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any
            # empty padding to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr
37
38
# Basically pickle, in python 2.7.3 at least, does badly with data duplication
# upon pickling and unpickling. Combine this with duplicate objects and things
# are a mess.
#
# When the sets are originally created, python calls intern() on the set keys
# which significantly improves memory usage. Sadly the pickle/unpickle process
# doesn't call intern() on the keys and results in the same strings being duplicated
# in memory. This also means pickle will save the same string multiple times in
# the cache file.
#
# By having shell and python cacheline objects with setstate/getstate, we force
# the object creation through our own routine where we can call intern (via internSet).
#
# We also use hashable frozensets and ensure we use references to these so that
# duplicates can be removed, both in memory and in the resulting pickled data.
#
# By playing these games, the size of the cache file shrinks dramatically
# meaning faster load times and the reloaded cache files also consume much less
# memory. Smaller cache files, faster load times and lower memory usage are good.
#
# A custom getstate/setstate using tuples is actually worth 15% cachesize by
# avoiding duplication of the attribute names!
61
class SetCache(object):
    """Pool of frozensets of interned strings.

    Equal sets requested through internSet() are collapsed onto a single
    shared frozenset object, so duplicates are stored only once.
    """

    def __init__(self):
        # Maps each pooled frozenset to itself, so that an equal set can be
        # exchanged for the canonical object already held in the pool.
        self.setcache = {}

    def internSet(self, items):
        """Return the pooled frozenset holding the strings in *items*.

        Every string is passed through sys.intern(). If an equal frozenset
        is already pooled that object is returned, otherwise the new
        frozenset is added to the pool and returned.
        """
        s = frozenset(sys.intern(i) for i in items)
        # Key on the set itself rather than on hash(s): two different sets
        # may share a hash value, and a lookup by bare hash would then hand
        # back a set with the wrong contents.
        return self.setcache.setdefault(s, s)


# Module-wide pool shared by the cache line classes.
codecache = SetCache()
79
class pythonCacheLine(object):
    """Cached result of parsing one piece of python code.

    Holds three attributes: ``refs`` and ``execs`` (frozensets) and
    ``contains`` (a dict mapping a name to a frozenset). All sets are routed
    through codecache.internSet() so equal sets are shared.
    """

    def __init__(self, refs, execs, contains):
        intern_set = codecache.internSet
        self.refs = intern_set(refs)
        self.execs = intern_set(execs)
        self.contains = {key: intern_set(contains[key]) for key in contains}

    def __getstate__(self):
        # Pickle as a plain tuple so attribute names are not stored per object.
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        # Rebuild through __init__ so that unpickled sets are interned again.
        refs, execs, contains = state
        self.__init__(refs, execs, contains)

    def __hash__(self):
        # Hash over refs, execs and the contains entries in sorted key order.
        parts = [hash(self.refs), hash(self.execs)]
        for key in sorted(self.contains):
            parts.extend((key, hash(self.contains[key])))
        return hash(tuple(parts))

    def __repr__(self):
        return " ".join(map(str, (self.refs, self.execs, self.contains)))
101
102
class shellCacheLine(object):
    """Cached result of parsing one piece of shell code.

    Holds a single attribute, ``execs``, a frozenset obtained from
    codecache.internSet().
    """

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # The pickled state is the bare frozenset itself, not a 1-tuple.
        return self.execs

    def __setstate__(self, state):
        # Rebuild through __init__ so that the unpickled set is interned again.
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)
117
class CodeParserCache(MultiProcessCache):
    """Cache of parsed python and shell code built on MultiProcessCache.

    Slot 0 of ``cachedata`` / ``cachedata_extras`` holds python entries and
    slot 1 holds shell entries.
    """

    cache_file_name = "bb_codeparser.dat"
    CACHE_VERSION = 8

    def __init__(self):
        super().__init__()
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a pythonCacheLine for the given data.

        A line with the same hash that was created earlier is returned
        instead of the new one.
        """
        candidate = pythonCacheLine(refs, execs, contains)
        return self.pythoncachelines.setdefault(hash(candidate), candidate)

    def newShellCacheLine(self, execs):
        """Return a shellCacheLine for the given data.

        A line with the same hash that was created earlier is returned
        instead of the new one.
        """
        candidate = shellCacheLine(execs)
        return self.shellcachelines.setdefault(hash(candidate), candidate)

    def init_cache(self, d):
        """Initialise the cache via the parent class unless already populated."""
        # Check if we already have the caches
        if self.pythoncache:
            return

        super().init_cache(d)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        """Return a fresh, empty [python, shell] pair of dictionaries."""
        return [{}, {}]


# Module-wide cache instance used by the parsers and helper functions.
codeparsercache = CodeParserCache()
166
def parser_cache_init(d):
    """Initialise the module-wide code parser cache, passing *d* through."""
    codeparsercache.init_cache(d)
169
def parser_cache_save():
    """Call save_extras() on the module-wide code parser cache."""
    codeparsercache.save_extras()
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500172
def parser_cache_savemerge():
    """Call save_merge() on the module-wide code parser cache."""
    codeparsercache.save_merge()
Patrick Williamsc124f4f2015-09-15 14:41:29 -0500175
Logger = logging.getLoggerClass()


class BufferedLogger(Logger):
    """Logger which queues records instead of emitting them.

    Records passed to handle() are kept in ``buffer`` until flush() is
    called, at which point each one is handed to ``target.handle()``.
    """

    def __init__(self, name, level=0, target=None):
        super().__init__(name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        """Queue *record* rather than dispatching it to any handlers."""
        self.buffer.append(record)

    def flush(self):
        """Pass every queued record to the target, then empty the queue."""
        for pending in self.buffer:
            self.target.handle(pending)
        self.buffer = []
191
class PythonParser():
    """Scan python code for variable references and executed functions.

    After parse_python() the results are available as:
      references - set of variable names (or 'name[flag]') passed as string
                   literals to the getVar/getVarFlag style calls
      execs      - set of called function names, plus the names passed as
                   string literals to bb.build.exec_func/exec_task
      contains   - dict mapping a variable name to the set of literal values
                   it was checked against via the contains functions
    """

    getvars = (".getVar", ".appendVar", ".prependVar")
    getvarflags = (".getVarFlag", ".appendVarFlag", ".prependVarFlag")
    containsfuncs = ("bb.utils.contains", "base_contains", "bb.utils.contains_any")
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    @staticmethod
    def _literal_str(node):
        """Return the value of a string literal AST node, else None.

        String literals are ast.Constant nodes from Python 3.8 onwards;
        ast.Str is only consulted on older interpreters because it is
        deprecated since 3.8 and no longer exists in recent releases.
        """
        constant_cls = getattr(ast, "Constant", None)
        if constant_cls is not None and isinstance(node, constant_cls):
            return node.value if isinstance(node.value, str) else None
        if sys.version_info < (3, 8) and isinstance(node, ast.Str):
            return node.s
        return None

    @staticmethod
    def _positional_arg(node, index):
        """Return positional argument *index* of the call, or None if absent."""
        args = node.args
        return args[index] if index < len(args) else None

    def visit_Call(self, node):
        """Record what a single ast.Call node references or executes.

        Calls lacking the positional argument that would carry the name
        (for example ones using only keyword arguments) are skipped, since
        there is nothing to record and nothing to warn about.
        """
        name = self.called_node_name(node.func)
        first = self._positional_arg(node, 0)

        if name and (name.endswith(self.getvars) or name.endswith(self.getvarflags) or name in self.containsfuncs):
            if first is None:
                return
            varname = self._literal_str(first)
            if varname is None:
                self.warn(node.func, first)
                return

            second = self._positional_arg(node, 1)
            second_str = self._literal_str(second)
            if name in self.containsfuncs and second_str is not None:
                self.contains.setdefault(varname, set()).add(second_str)
            elif name.endswith(self.getvarflags):
                if second_str is not None:
                    self.references.add('%s[%s]' % (varname, second_str))
                elif second is not None:
                    self.warn(node.func, second)
            else:
                self.references.add(varname)
        elif name and name.endswith(".expand"):
            value = self._literal_str(first)
            if value is not None:
                # Expand the literal against an empty datastore purely to
                # collect the references it contains.
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if first is None:
                return
            funcname = self._literal_str(first)
            if funcname is not None:
                self.var_execs.add(funcname)
            else:
                self.warn(node.func, first)
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form.

        Returns the dotted name for a chain of attribute accesses ending in
        a plain name, and None for anything else.
        """
        components = []
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def _load_cacheline(self, cacheline):
        """Replace the current results with mutable copies of a cached line."""
        self.references = set(cacheline.refs)
        self.execs = set(cacheline.execs)
        self.contains = {var: set(cacheline.contains[var]) for var in cacheline.contains}

    def parse_python(self, node, lineno=0, filename="<string>"):
        """Parse the python source string *node* and collect its references.

        Results come from the code parser cache when the source has been
        seen before; otherwise the code is compiled to an AST, every call is
        inspected, and the outcome is stored in the cache extras. *lineno*
        and *filename* only influence the positions reported by compile().
        """
        if not node or not node.strip():
            return

        h = bbhash(str(node))

        for cache in (codeparsercache.pythoncache, codeparsercache.pythoncacheextras):
            if h in cache:
                self._load_cacheline(cache[h])
                return

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
                       ast.PyCF_ONLY_AST)

        for n in ast.walk(code):
            if isinstance(n, ast.Call):
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
307
class ShellParser():
    """Collect the external commands executed by a piece of shell code.

    ``allexecs`` accumulates every command name seen, ``funcdefs`` the names
    of shell functions defined in the code, and ``execs`` is the set of
    commands that are not locally defined functions.
    """

    def __init__(self, name, log):
        self.funcdefs = set()
        self.allexecs = set()
        self.execs = set()
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = bbhash(str(value))

        # Serve the answer from either cache when this code was seen before.
        for cache in (codeparsercache.shellcache, codeparsercache.shellcacheextras):
            if h in cache:
                self.execs = set(cache[h].execs)
                return self.execs

        self._parse_shell(value)
        # Commands which are shell functions defined in the code itself are
        # not external commands.
        self.execs = self.allexecs - self.funcdefs

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run the shell grammar over *value* and process each resulting token list."""
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        for token in tokens:
            self.process_tokens(token)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        # Each handler returns a (more_tokens, words) pair: further syntax
        # tree nodes to recurse into, and words to scan for commands.

        def function_definition(value):
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            patterns = chain(*[item[0] for item in value.items])
            commands = chain(*[item[1] for item in value.items])
            return commands, patterns

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                return chain(main, if_clause(rest[1]))
            return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        token_handlers = {
            "and_or": lambda v: ((v.left, v.right), None),
            "async": lambda v: ([v], None),
            "brace_group": lambda v: (v.cmds, None),
            "for_clause": lambda v: (v.cmds, v.items),
            "function_definition": function_definition,
            "if_clause": lambda v: (if_clause(v), None),
            "pipeline": lambda v: (v.commands, None),
            "redirect_list": lambda v: ([v.cmd], None),
            "subshell": lambda v: (v.cmds, None),
            "while_clause": lambda v: (chain(v.condition, v.cmds), None),
            "until_clause": lambda v: (chain(v.condition, v.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        for token in tokens:
            name, value = token
            try:
                more_tokens, words = token_handlers[name](value)
            except KeyError:
                raise NotImplementedError("Unsupported token type " + name)

            if more_tokens:
                self.process_tokens(more_tokens)

            if words:
                self.process_words(words)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)

        # First pass: parse the contents of `...` and $(...) substitutions.
        # A command word built from a substitution is dropped so the second
        # pass does not treat it as a command name.
        for entry in list(words):
            for part in pyshlex.make_wordtree(entry[1]):
                if not isinstance(part, list) or part[0] not in ('`', '$('):
                    continue

                command = pyshlex.wordtree_as_string(part[1:-1])
                self._parse_shell(command)

                if entry[0] in ("cmd_name", "cmd_word") and entry in words:
                    words.remove(entry)

        # Second pass: find the command name, stepping over leading words
        # which contain '=' (after one of those a TOKEN may be the command).
        usetoken = False
        for entry in words:
            is_command = entry[0] in ("cmd_name", "cmd_word")
            if not (is_command or (usetoken and entry[0] == "TOKEN")):
                continue

            if "=" in entry[1]:
                usetoken = True
                continue

            cmd = entry[1]
            if cmd.startswith("$"):
                self.log.debug(1, self.unhandled_template % cmd)
            elif cmd == "eval":
                command = " ".join(text for _, text in words[1:])
                self._parse_shell(command)
            else:
                self.allexecs.add(cmd)
            break
442 break