| Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 1 | import ast | 
|  | 2 | import codegen | 
|  | 3 | import logging | 
|  | 4 | import os.path | 
|  | 5 | import bb.utils, bb.data | 
|  | 6 | from itertools import chain | 
|  | 7 | from pysh import pyshyacc, pyshlex, sherrors | 
|  | 8 | from bb.cache import MultiProcessCache | 
|  | 9 |  | 
|  | 10 |  | 
|  | 11 | logger = logging.getLogger('BitBake.CodeParser') | 
|  | 12 |  | 
|  | 13 | try: | 
|  | 14 | import cPickle as pickle | 
|  | 15 | except ImportError: | 
|  | 16 | import pickle | 
|  | 17 | logger.info('Importing cPickle failed.  Falling back to a very slow implementation.') | 
|  | 18 |  | 
|  | 19 |  | 
def check_indent(codestr):
    """If the code is indented, add a top level piece of code to 'remove' the indentation"""

    i = 0
    # Bounds check: an empty or all-whitespace string must not run off the
    # end of codestr (the unguarded loop raised IndexError for such input)
    while i < len(codestr) and codestr[i] in ["\n", "\t", " "]:
        i = i + 1

    if i == 0:
        return codestr

    # All-whitespace code: nothing meaningful to re-indent, return as-is
    if i == len(codestr):
        return codestr

    if codestr[i - 1] == "\t" or codestr[i - 1] == " ":
        if codestr[0] == "\n":
            # Since we're adding a line, we need to remove one line of any empty padding
            # to ensure line numbers are correct
            codestr = codestr[1:]
        return "if 1:\n" + codestr

    return codestr
|  | 38 |  | 
|  | 39 |  | 
|  | 40 | # Basically pickle, in python 2.7.3 at least, does badly with data duplication | 
|  | 41 | # upon pickling and unpickling. Combine this with duplicate objects and things | 
|  | 42 | # are a mess. | 
|  | 43 | # | 
|  | 44 | # When the sets are originally created, python calls intern() on the set keys | 
|  | 45 | # which significantly improves memory usage. Sadly the pickle/unpickle process | 
|  | 46 | # doesn't call intern() on the keys and results in the same strings being duplicated | 
|  | 47 | # in memory. This also means pickle will save the same string multiple times in | 
|  | 48 | # the cache file. | 
|  | 49 | # | 
|  | 50 | # By having shell and python cacheline objects with setstate/getstate, we force | 
|  | 51 | # the object creation through our own routine where we can call intern (via internSet). | 
|  | 52 | # | 
|  | 53 | # We also use hashable frozensets and ensure we use references to these so that | 
|  | 54 | # duplicates can be removed, both in memory and in the resulting pickled data. | 
|  | 55 | # | 
|  | 56 | # By playing these games, the size of the cache file shrinks dramatically | 
|  | 57 | # meaning faster load times and the reloaded cache files also consume much less | 
|  | 58 | # memory. Smaller cache files, faster load times and lower memory usage is good. | 
|  | 59 | # | 
|  | 60 | # A custom getstate/setstate using tuples is actually worth 15% cachesize by | 
|  | 61 | # avoiding duplication of the attribute names! | 
|  | 62 |  | 
class SetCache(object):
    """Cache of interned frozensets so that equal sets share one object.

    Sharing a single object per distinct set keeps both in-memory usage and
    the pickled cache file small (see the comment block above).
    """
    def __init__(self):
        # Maps frozenset -> the canonical instance of that frozenset
        self.setcache = {}

    def internSet(self, items):
        """Return the canonical interned frozenset equal to frozenset(items).

        Each item string is intern()ed so duplicate strings share storage.
        """
        new = []
        for i in items:
            new.append(intern(i))
        s = frozenset(new)
        # Key on the set itself rather than hash(s): dict lookup resolves
        # hash collisions via equality, whereas a raw-hash key would silently
        # return the wrong set if two distinct sets ever shared a hash.
        return self.setcache.setdefault(s, s)
|  | 77 |  | 
# Module-level shared cache of interned sets, used by all cacheline objects
codecache = SetCache()
|  | 79 |  | 
class pythonCacheLine(object):
    """One cached python parse result: the variable references, executed
    function names and contains-checks found in a piece of python code.

    All sets are interned via codecache so duplicates share one object,
    and getstate/setstate use a bare tuple to keep pickles small (see the
    comment block above).
    """

    def __init__(self, refs, execs, contains):
        self.refs = codecache.internSet(refs)
        self.execs = codecache.internSet(execs)
        # Re-intern each per-variable value set as well
        self.contains = dict((var, codecache.internSet(vals))
                             for var, vals in contains.items())

    def __getstate__(self):
        # Plain tuple: avoids pickling attribute names per instance
        return (self.refs, self.execs, self.contains)

    def __setstate__(self, state):
        (refs, execs, contains) = state
        # Route unpickling through __init__ so everything is re-interned
        self.__init__(refs, execs, contains)

    def __hash__(self):
        # Fold refs, execs and the sorted contains entries into one tuple
        parts = [hash(self.refs), hash(self.execs)]
        for var in sorted(self.contains.keys()):
            parts.extend((var, hash(self.contains[var])))
        return hash(tuple(parts))

    def __repr__(self):
        return "%s %s %s" % (str(self.refs), str(self.execs), str(self.contains))
|  | 102 |  | 
class shellCacheLine(object):
    """One cached shell parse result: the set of commands a fragment executes.

    The execs set is interned via codecache; getstate/setstate route through
    __init__ so unpickled instances are re-interned too.
    """

    def __init__(self, execs):
        self.execs = codecache.internSet(execs)

    def __getstate__(self):
        # Pickle the execs set directly (note: deliberately not a tuple)
        return self.execs

    def __setstate__(self, state):
        self.__init__(state)

    def __hash__(self):
        return hash(self.execs)

    def __repr__(self):
        return str(self.execs)
|  | 117 |  | 
class CodeParserCache(MultiProcessCache):
    """Multi-process persistent cache of python and shell parse results.

    Slot 0 of the cache data holds python results, slot 1 shell results;
    '*extras' are the entries newly added by this process.
    """
    cache_file_name = "bb_codeparser.dat"
    CACHE_VERSION = 7

    def __init__(self):
        MultiProcessCache.__init__(self)
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]
        self.pythoncacheextras = self.cachedata_extras[0]
        self.shellcacheextras = self.cachedata_extras[1]

        # To avoid duplication in the codeparser cache, keep
        # a lookup of hashes of objects we already have
        self.pythoncachelines = {}
        self.shellcachelines = {}

    def newPythonCacheLine(self, refs, execs, contains):
        """Return a deduplicated pythonCacheLine for the given data."""
        candidate = pythonCacheLine(refs, execs, contains)
        # Reuse an existing equal cacheline if one was already created
        return self.pythoncachelines.setdefault(hash(candidate), candidate)

    def newShellCacheLine(self, execs):
        """Return a deduplicated shellCacheLine for the given execs set."""
        candidate = shellCacheLine(execs)
        return self.shellcachelines.setdefault(hash(candidate), candidate)

    def init_cache(self, d):
        # Nothing to do if the caches were already loaded
        if self.pythoncache:
            return

        MultiProcessCache.init_cache(self, d)

        # cachedata gets re-assigned in the parent
        self.pythoncache = self.cachedata[0]
        self.shellcache = self.cachedata[1]

    def create_cachedata(self):
        # One dict for python results, one for shell results
        return [{}, {}]
|  | 164 |  | 
# Module-level singleton cache instance shared by all parsers in this process
codeparsercache = CodeParserCache()

def parser_cache_init(d):
    """Load the on-disk codeparser cache (thin wrapper over the singleton)."""
    codeparsercache.init_cache(d)

def parser_cache_save():
    """Write out the cache entries newly added by this process."""
    codeparsercache.save_extras()

def parser_cache_savemerge():
    """Merge the per-process extras files back into the main cache file."""
    codeparsercache.save_merge()
| Patrick Williams | c124f4f | 2015-09-15 14:41:29 -0500 | [diff] [blame] | 175 |  | 
# Subclass whatever logger class is currently registered with logging
Logger = logging.getLoggerClass()

class BufferedLogger(Logger):
    """A logger that queues records in memory and only forwards them to a
    target logger when flush() is called."""

    def __init__(self, name, level=0, target=None):
        Logger.__init__(self, name)
        self.setLevel(level)
        self.buffer = []
        self.target = target

    def handle(self, record):
        # Queue the record instead of emitting it immediately
        self.buffer.append(record)

    def flush(self):
        # Forward everything queued so far to the target, then reset
        for queued in self.buffer:
            self.target.handle(queued)
        self.buffer = []
|  | 191 |  | 
class PythonParser():
    """Walk the AST of a python code fragment and record the datastore
    variables it references (self.references), the functions it executes
    (self.execs) and the bb.utils.contains()-style checks it performs
    (self.contains)."""

    # Datastore accessor method suffixes whose first argument names a variable
    getvars = (".getVar", ".appendVar", ".prependVar")
    # Functions whose (variable, value) string arguments express a 'contains' check
    containsfuncs = ("bb.utils.contains", "base_contains", "bb.utils.contains_any")
    # Functions which execute another bitbake function/task named by their argument
    execfuncs = ("bb.build.exec_func", "bb.build.exec_task")

    def warn(self, func, arg):
        """Warn about calls of bitbake APIs which pass a non-literal
        argument for the variable name, as we're not able to track such
        a reference.
        """

        try:
            funcstr = codegen.to_source(func)
            argstr = codegen.to_source(arg)
        except TypeError:
            self.log.debug(2, 'Failed to convert function and argument to source form')
        else:
            self.log.debug(1, self.unhandled_message % (funcstr, argstr))

    def visit_Call(self, node):
        """Record the effect of a single ast.Call node on this parser's state."""
        name = self.called_node_name(node.func)
        # Precedence note: this parses as
        # (name and name.endswith(getvars)) or (name in containsfuncs)
        if name and name.endswith(self.getvars) or name in self.containsfuncs:
            if isinstance(node.args[0], ast.Str):
                varname = node.args[0].s
                if name in self.containsfuncs and isinstance(node.args[1], ast.Str):
                    # contains-style call: record variable -> checked value
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname].add(node.args[1].s)
                else:
                    self.references.add(node.args[0].s)
            else:
                # Variable name isn't a string literal - can't track it
                self.warn(node.func, node.args[0])
        elif name and name.endswith(".expand"):
            if isinstance(node.args[0], ast.Str):
                # Expand the literal argument and absorb whatever
                # references/execs/contains the expansion itself performs
                value = node.args[0].s
                d = bb.data.init()
                parser = d.expandWithRefs(value, self.name)
                self.references |= parser.references
                self.execs |= parser.execs
                for varname in parser.contains:
                    if varname not in self.contains:
                        self.contains[varname] = set()
                    self.contains[varname] |= parser.contains[varname]
        elif name in self.execfuncs:
            if isinstance(node.args[0], ast.Str):
                # Function/task executed indirectly by name
                self.var_execs.add(node.args[0].s)
            else:
                self.warn(node.func, node.args[0])
        elif name and isinstance(node.func, (ast.Name, ast.Attribute)):
            # Any other direct call: record the dotted called name itself
            self.execs.add(name)

    def called_node_name(self, node):
        """Given a called node, return its original string form"""
        components = []
        # Walk Attribute chains down to the base Name, collecting parts;
        # returns None (implicitly) for call targets that aren't plain
        # dotted names (e.g. subscripts or call results)
        while node:
            if isinstance(node, ast.Attribute):
                components.append(node.attr)
                node = node.value
            elif isinstance(node, ast.Name):
                components.append(node.id)
                return '.'.join(reversed(components))
            else:
                break

    def __init__(self, name, log):
        self.name = name
        self.var_execs = set()
        self.contains = {}
        self.execs = set()
        self.references = set()
        # Buffer log messages; the caller decides when/whether to flush them
        self.log = BufferedLogger('BitBake.Data.PythonParser', logging.DEBUG, log)

        self.unhandled_message = "in call of %s, argument '%s' is not a string literal"
        self.unhandled_message = "while parsing %s, %s" % (name, self.unhandled_message)

    def parse_python(self, node, lineno=0, filename="<string>"):
        """Parse the python code in 'node', populating self.references,
        self.execs and self.contains. Results are memoized in the
        codeparser cache, keyed on a hash of the code string."""
        if not node or not node.strip():
            return

        h = hash(str(node))

        # Previously-seen code: reuse the cached result (copy the sets so
        # callers can mutate them without corrupting the cache)
        if h in codeparsercache.pythoncache:
            self.references = set(codeparsercache.pythoncache[h].refs)
            self.execs = set(codeparsercache.pythoncache[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncache[h].contains:
                self.contains[i] = set(codeparsercache.pythoncache[h].contains[i])
            return

        if h in codeparsercache.pythoncacheextras:
            self.references = set(codeparsercache.pythoncacheextras[h].refs)
            self.execs = set(codeparsercache.pythoncacheextras[h].execs)
            self.contains = {}
            for i in codeparsercache.pythoncacheextras[h].contains:
                self.contains[i] = set(codeparsercache.pythoncacheextras[h].contains[i])
            return

        # We can't add to the linenumbers for compile, we can pad to the correct number of blank lines though
        node = "\n" * int(lineno) + node
        code = compile(check_indent(str(node)), filename, "exec",
            ast.PyCF_ONLY_AST)

        # Only Call nodes matter for reference/exec extraction
        for n in ast.walk(code):
            if n.__class__.__name__ == "Call":
                self.visit_Call(n)

        self.execs.update(self.var_execs)

        # Record the fresh result in this process's extras cache
        codeparsercache.pythoncacheextras[h] = codeparsercache.newPythonCacheLine(self.references, self.execs, self.contains)
|  | 301 |  | 
class ShellParser():
    """Parse shell code (via the pysh parser) and collect the names of the
    external commands it executes."""

    def __init__(self, name, log):
        # Names of shell functions defined in the code (calls to these are
        # not external commands)
        self.funcdefs = set()
        # Every command name seen, including calls to locally defined functions
        self.allexecs = set()
        # allexecs minus funcdefs: the externally executed commands
        self.execs = set()
        # Buffer log messages; the caller decides when/whether to flush them
        self.log = BufferedLogger('BitBake.Data.%s' % name, logging.DEBUG, log)
        self.unhandled_template = "unable to handle non-literal command '%s'"
        self.unhandled_template = "while parsing %s, %s" % (name, self.unhandled_template)

    def parse_shell(self, value):
        """Parse the supplied shell code in a string, returning the external
        commands it executes.
        """

        h = hash(str(value))

        # Memoized results from the main cache or this process's extras
        if h in codeparsercache.shellcache:
            self.execs = set(codeparsercache.shellcache[h].execs)
            return self.execs

        if h in codeparsercache.shellcacheextras:
            self.execs = set(codeparsercache.shellcacheextras[h].execs)
            return self.execs

        self._parse_shell(value)
        self.execs = set(cmd for cmd in self.allexecs if cmd not in self.funcdefs)

        codeparsercache.shellcacheextras[h] = codeparsercache.newShellCacheLine(self.execs)

        return self.execs

    def _parse_shell(self, value):
        """Run the pysh parser over value and process the resulting tokens.

        Raises sherrors.ShellSyntaxError on an incomplete (EOF-truncated)
        shell fragment."""
        try:
            tokens, _ = pyshyacc.parse(value, eof=True, debug=False)
        except pyshlex.NeedMore:
            raise sherrors.ShellSyntaxError("Unexpected EOF")

        for token in tokens:
            self.process_tokens(token)

    def process_tokens(self, tokens):
        """Process a supplied portion of the syntax tree as returned by
        pyshyacc.parse.
        """

        def function_definition(value):
            # Remember the function name so later calls to it aren't
            # counted as external commands
            self.funcdefs.add(value.name)
            return [value.body], None

        def case_clause(value):
            # Element 0 of each item in the case is the list of patterns, and
            # Element 1 of each item in the case is the list of commands to be
            # executed when that pattern matches.
            words = chain(*[item[0] for item in value.items])
            cmds  = chain(*[item[1] for item in value.items])
            return cmds, words

        def if_clause(value):
            main = chain(value.cond, value.if_cmds)
            rest = value.else_cmds
            if isinstance(rest, tuple) and rest[0] == "elif":
                # elif chains nest: recurse into the nested if_clause
                return chain(main, if_clause(rest[1]))
            else:
                return chain(main, rest)

        def simple_command(value):
            return None, chain(value.words, (assign[1] for assign in value.assigns))

        # Each handler maps a token to (more_tokens, words): sub-trees to
        # recurse into, and word lists to scan for command names
        token_handlers = {
            "and_or": lambda x: ((x.left, x.right), None),
            "async": lambda x: ([x], None),
            "brace_group": lambda x: (x.cmds, None),
            "for_clause": lambda x: (x.cmds, x.items),
            "function_definition": function_definition,
            "if_clause": lambda x: (if_clause(x), None),
            "pipeline": lambda x: (x.commands, None),
            "redirect_list": lambda x: ([x.cmd], None),
            "subshell": lambda x: (x.cmds, None),
            "while_clause": lambda x: (chain(x.condition, x.cmds), None),
            "until_clause": lambda x: (chain(x.condition, x.cmds), None),
            "simple_command": simple_command,
            "case_clause": case_clause,
        }

        for token in tokens:
            name, value = token
            try:
                more_tokens, words = token_handlers[name](value)
            except KeyError:
                raise NotImplementedError("Unsupported token type " + name)

            if more_tokens:
                self.process_tokens(more_tokens)

            if words:
                self.process_words(words)

    def process_words(self, words):
        """Process a set of 'words' in pyshyacc parlance, which includes
        extraction of executed commands from $() blocks, as well as grabbing
        the command name argument.
        """

        words = list(words)
        # Iterate a copy, since words may be mutated (removals) below
        for word in list(words):
            wtree = pyshlex.make_wordtree(word[1])
            for part in wtree:
                if not isinstance(part, list):
                    continue

                if part[0] in ('`', '$('):
                    # Command substitution: parse its contents as shell too
                    command = pyshlex.wordtree_as_string(part[1:-1])
                    self._parse_shell(command)

                    # A command name built from a substitution can't be
                    # resolved statically - drop it from the scan below
                    if word[0] in ("cmd_name", "cmd_word"):
                        if word in words:
                            words.remove(word)

        # Leading VAR=value assignments mean the real command name arrives
        # as a later TOKEN word; usetoken tracks that state
        usetoken = False
        for word in words:
            if word[0] in ("cmd_name", "cmd_word") or \
               (usetoken and word[0] == "TOKEN"):
                if "=" in word[1]:
                    usetoken = True
                    continue

                cmd = word[1]
                if cmd.startswith("$"):
                    # Command name comes from a variable - can't resolve it
                    self.log.debug(1, self.unhandled_template % cmd)
                elif cmd == "eval":
                    # eval: re-parse the remaining words as shell code
                    command = " ".join(word for _, word in words[1:])
                    self._parse_shell(command)
                else:
                    self.allexecs.add(cmd)
                break