# -----------------------------------------------------------------------------
# ply: lex.py
#
# Copyright (C) 2001-2009,
# David M. Beazley (Dabeaz LLC)
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# * Redistributions of source code must retain the above copyright notice,
#   this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright notice,
#   this list of conditions and the following disclaimer in the documentation
#   and/or other materials provided with the distribution.
# * Neither the name of the David Beazley or Dabeaz LLC may be used to
#   endorse or promote products derived from this software without
#   specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# -----------------------------------------------------------------------------

__version__    = "3.3"
__tabversion__ = "3.2"       # Version of table file used

import re, sys, types, copy, os

# This tuple contains known string types
try:
    # Python 2.6
    StringTypes = (types.StringType, types.UnicodeType)
except AttributeError:
    # Python 3.0
    StringTypes = (str, bytes)

# Extract the code attribute of a function. Different implementations
# are for Python 2/3 compatibility.

if sys.version_info[0] < 3:
    def func_code(f):
        return f.func_code
else:
    def func_code(f):
        return f.__code__

# This regular expression is used to match valid token names
_is_identifier = re.compile(r'^[a-zA-Z0-9_]+$')

# Exception thrown when an invalid token is encountered and no default error
# handler is defined.

class LexError(Exception):
    def __init__(self,message,s):
        self.args = (message,)
        self.text = s

# Token class.  This class is used to represent the tokens produced.
class LexToken(object):
    def __str__(self):
        return "LexToken(%s,%r,%d,%d)" % (self.type,self.value,self.lineno,self.lexpos)
    def __repr__(self):
        return str(self)

# This object is a stand-in for a logging object created by the
# logging module.

class PlyLogger(object):
    def __init__(self,f):
        self.f = f
    def critical(self,msg,*args,**kwargs):
        self.f.write((msg % args) + "\n")

    def warning(self,msg,*args,**kwargs):
        self.f.write("WARNING: "+ (msg % args) + "\n")

    def error(self,msg,*args,**kwargs):
        self.f.write("ERROR: " + (msg % args) + "\n")

    info = critical
    debug = critical

# Null logger is used when no output is generated. Does nothing.
class NullLogger(object):
    def __getattribute__(self,name):
        return self
    def __call__(self,*args,**kwargs):
        return self

# -----------------------------------------------------------------------------
#                        === Lexing Engine ===
#
# The following Lexer class implements the lexer runtime.  There are only
# a few public methods and attributes:
#
#    input()          -  Store a new string in the lexer
#    token()          -  Get the next token
#    clone()          -  Clone the lexer
#
#    lineno           -  Current line number
#    lexpos           -  Current position in the input string
# -----------------------------------------------------------------------------
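#
# A minimal sketch of how a built lexer is typically driven (the lexer itself
# is normally constructed by the lex() function defined later in this file;
# the input string below is illustrative only):
#
#     lexer = lex()
#     lexer.input("3 + 4 * 10")
#     while True:
#         tok = lexer.token()          # returns None when input is exhausted
#         if not tok:
#             break
#         print(tok.type, tok.value, tok.lineno, tok.lexpos)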

class Lexer:
    def __init__(self):
        self.lexre = None             # Master regular expression. This is a list of
                                      # tuples (re,findex) where re is a compiled
                                      # regular expression and findex is a list
                                      # mapping regex group numbers to rules
        self.lexretext = None         # Current regular expression strings
        self.lexstatere = {}          # Dictionary mapping lexer states to master regexes
        self.lexstateretext = {}      # Dictionary mapping lexer states to regex strings
        self.lexstaterenames = {}     # Dictionary mapping lexer states to symbol names
        self.lexstate = "INITIAL"     # Current lexer state
        self.lexstatestack = []       # Stack of lexer states
        self.lexstateinfo = None      # State information
        self.lexstateignore = {}      # Dictionary of ignored characters for each state
        self.lexstateerrorf = {}      # Dictionary of error functions for each state
        self.lexreflags = 0           # Optional re compile flags
        self.lexdata = None           # Actual input data (as a string)
        self.lexpos = 0               # Current position in input text
        self.lexlen = 0               # Length of the input text
        self.lexerrorf = None         # Error rule (if any)
        self.lextokens = None         # List of valid tokens
        self.lexignore = ""           # Ignored characters
        self.lexliterals = ""         # Literal characters that can be passed through
        self.lexmodule = None         # Module
        self.lineno = 1               # Current line number
        self.lexoptimize = 0          # Optimized mode

    def clone(self,object=None):
        c = copy.copy(self)

        # If the object parameter has been supplied, it means we are attaching the
        # lexer to a new object.  In this case, we have to rebind all methods in
        # the lexstatere and lexstateerrorf tables.

        if object:
            newtab = { }
            for key, ritem in self.lexstatere.items():
                newre = []
                for cre, findex in ritem:
                    newfindex = []
                    for f in findex:
                        if not f or not f[0]:
                            newfindex.append(f)
                            continue
                        newfindex.append((getattr(object,f[0].__name__),f[1]))
                    newre.append((cre,newfindex))
                newtab[key] = newre
            c.lexstatere = newtab
            c.lexstateerrorf = { }
            for key, ef in self.lexstateerrorf.items():
                c.lexstateerrorf[key] = getattr(object,ef.__name__)
            c.lexmodule = object
        return c

    # ------------------------------------------------------------
    # writetab() - Write lexer information to a table file
    # ------------------------------------------------------------
    def writetab(self,tabfile,outputdir=""):
        if isinstance(tabfile,types.ModuleType):
            return
        basetabfilename = tabfile.split(".")[-1]
        filename = os.path.join(outputdir,basetabfilename)+".py"
        tf = open(filename,"w")
        tf.write("# %s.py. This file automatically created by PLY (version %s). Don't edit!\n" % (tabfile,__version__))
        tf.write("_tabversion   = %s\n" % repr(__tabversion__))
        tf.write("_lextokens    = %s\n" % repr(self.lextokens))
        tf.write("_lexreflags   = %s\n" % repr(self.lexreflags))
        tf.write("_lexliterals  = %s\n" % repr(self.lexliterals))
        tf.write("_lexstateinfo = %s\n" % repr(self.lexstateinfo))

        tabre = { }
        # Collect all functions in the initial state
        initial = self.lexstatere["INITIAL"]
        initialfuncs = []
        for part in initial:
            for f in part[1]:
                if f and f[0]:
                    initialfuncs.append(f)

        for key, lre in self.lexstatere.items():
            titem = []
            for i in range(len(lre)):
                titem.append((self.lexstateretext[key][i],_funcs_to_names(lre[i][1],self.lexstaterenames[key][i])))
            tabre[key] = titem

        tf.write("_lexstatere   = %s\n" % repr(tabre))
        tf.write("_lexstateignore = %s\n" % repr(self.lexstateignore))

        taberr = { }
        for key, ef in self.lexstateerrorf.items():
            if ef:
                taberr[key] = ef.__name__
            else:
                taberr[key] = None
        tf.write("_lexstateerrorf = %s\n" % repr(taberr))
        tf.close()

    # ------------------------------------------------------------
    # readtab() - Read lexer information from a tab file
    # ------------------------------------------------------------
    def readtab(self,tabfile,fdict):
        if isinstance(tabfile,types.ModuleType):
            lextab = tabfile
        else:
            if sys.version_info[0] < 3:
                exec("import %s as lextab" % tabfile)
            else:
                env = { }
                exec("import %s as lextab" % tabfile, env,env)
                lextab = env['lextab']

        if getattr(lextab,"_tabversion","0.0") != __tabversion__:
            raise ImportError("Inconsistent PLY version")

        self.lextokens      = lextab._lextokens
        self.lexreflags     = lextab._lexreflags
        self.lexliterals    = lextab._lexliterals
        self.lexstateinfo   = lextab._lexstateinfo
        self.lexstateignore = lextab._lexstateignore
        self.lexstatere     = { }
        self.lexstateretext = { }
        for key,lre in lextab._lexstatere.items():
            titem = []
            txtitem = []
            for i in range(len(lre)):
                titem.append((re.compile(lre[i][0],lextab._lexreflags | re.VERBOSE),_names_to_funcs(lre[i][1],fdict)))
                txtitem.append(lre[i][0])
            self.lexstatere[key] = titem
            self.lexstateretext[key] = txtitem
        self.lexstateerrorf = { }
        for key,ef in lextab._lexstateerrorf.items():
            self.lexstateerrorf[key] = fdict[ef]
        self.begin('INITIAL')

    # ------------------------------------------------------------
    # input() - Push a new string into the lexer
    # ------------------------------------------------------------
    def input(self,s):
        # Pull off the first character to see if s looks like a string
        c = s[:1]
        if not isinstance(c,StringTypes):
            raise ValueError("Expected a string")
        self.lexdata = s
        self.lexpos = 0
        self.lexlen = len(s)

    # ------------------------------------------------------------
    # begin() - Changes the lexing state
    # ------------------------------------------------------------
    def begin(self,state):
        if not state in self.lexstatere:
            raise ValueError("Undefined state")
        self.lexre = self.lexstatere[state]
        self.lexretext = self.lexstateretext[state]
        self.lexignore = self.lexstateignore.get(state,"")
        self.lexerrorf = self.lexstateerrorf.get(state,None)
        self.lexstate = state

    # ------------------------------------------------------------
    # push_state() - Changes the lexing state and saves old on stack
    # ------------------------------------------------------------
    def push_state(self,state):
        self.lexstatestack.append(self.lexstate)
        self.begin(state)

    # ------------------------------------------------------------
    # pop_state() - Restores the previous state
    # ------------------------------------------------------------
    def pop_state(self):
        self.begin(self.lexstatestack.pop())

    # ------------------------------------------------------------
    # current_state() - Returns the current lexing state
    # ------------------------------------------------------------
    def current_state(self):
        return self.lexstate

    # ------------------------------------------------------------
    # skip() - Skip ahead n characters
    # ------------------------------------------------------------
    def skip(self,n):
        self.lexpos += n

    # ------------------------------------------------------------
    # token() - Return the next token from the Lexer
    #
    # Note: This function has been carefully implemented to be as fast
    # as possible.  Don't make changes unless you really know what
    # you are doing
    # ------------------------------------------------------------
    def token(self):
        # Make local copies of frequently referenced attributes
        lexpos    = self.lexpos
        lexlen    = self.lexlen
        lexignore = self.lexignore
        lexdata   = self.lexdata

        while lexpos < lexlen:
            # This code provides some short-circuit code for whitespace, tabs, and other ignored characters
            if lexdata[lexpos] in lexignore:
                lexpos += 1
                continue

            # Look for a regular expression match
            for lexre,lexindexfunc in self.lexre:
                m = lexre.match(lexdata,lexpos)
                if not m: continue

                # Create a token for return
                tok = LexToken()
                tok.value = m.group()
                tok.lineno = self.lineno
                tok.lexpos = lexpos

                i = m.lastindex
                func,tok.type = lexindexfunc[i]

                if not func:
                    # If no token type was set, it's an ignored token
                    if tok.type:
                        self.lexpos = m.end()
                        return tok
                    else:
                        lexpos = m.end()
                        break

                lexpos = m.end()

                # If token is processed by a function, call it

                tok.lexer = self      # Set additional attributes useful in token rules
                self.lexmatch = m
                self.lexpos = lexpos

                newtok = func(tok)

                # Every rule function must return a token. If it returns nothing,
                # the token is discarded and we move on to the next one.
                if not newtok:
                    lexpos    = self.lexpos         # This is here in case user has updated lexpos.
                    lexignore = self.lexignore      # This is here in case there was a state change
                    break

                # Verify type of the token.  If not in the token map, raise an error
                if not self.lexoptimize:
                    if not newtok.type in self.lextokens:
                        raise LexError("%s:%d: Rule '%s' returned an unknown token type '%s'" % (
                            func_code(func).co_filename, func_code(func).co_firstlineno,
                            func.__name__, newtok.type),lexdata[lexpos:])

                return newtok
            else:
                # No match, see if in literals
                if lexdata[lexpos] in self.lexliterals:
                    tok = LexToken()
                    tok.value = lexdata[lexpos]
                    tok.lineno = self.lineno
                    tok.type = tok.value
                    tok.lexpos = lexpos
                    self.lexpos = lexpos + 1
                    return tok

                # No match. Call t_error() if defined.
                if self.lexerrorf:
                    tok = LexToken()
                    tok.value = self.lexdata[lexpos:]
                    tok.lineno = self.lineno
                    tok.type = "error"
                    tok.lexer = self
                    tok.lexpos = lexpos
                    self.lexpos = lexpos
                    newtok = self.lexerrorf(tok)
                    if lexpos == self.lexpos:
                        # Error method didn't change text position at all. This is an error.
                        raise LexError("Scanning error. Illegal character '%s'" % (lexdata[lexpos]), lexdata[lexpos:])
                    lexpos = self.lexpos
                    if not newtok: continue
                    return newtok

                self.lexpos = lexpos
                raise LexError("Illegal character '%s' at index %d" % (lexdata[lexpos],lexpos), lexdata[lexpos:])

        self.lexpos = lexpos + 1
        if self.lexdata is None:
            raise RuntimeError("No input string given with input()")
        return None

    # Iterator interface
    def __iter__(self):
        return self

    def next(self):
        t = self.token()
        if t is None:
            raise StopIteration
        return t

    __next__ = next

# -----------------------------------------------------------------------------
#                           === Lex Builder ===
#
# The functions and classes below are used to collect lexing information
# and build a Lexer object from it.
# -----------------------------------------------------------------------------

# -----------------------------------------------------------------------------
# get_caller_module_dict()
#
# This function returns a dictionary containing all of the symbols defined within
# a caller further down the call stack.  This is used to get the environment
# associated with the lex() call if none was provided.
# -----------------------------------------------------------------------------

def get_caller_module_dict(levels):
    try:
        raise RuntimeError
    except RuntimeError:
        e,b,t = sys.exc_info()
        f = t.tb_frame
        while levels > 0:
            f = f.f_back
            levels -= 1
        ldict = f.f_globals.copy()
        if f.f_globals != f.f_locals:
            ldict.update(f.f_locals)

        return ldict

# -----------------------------------------------------------------------------
# _funcs_to_names()
#
# Given a list of regular expression functions, this converts it to a list
# suitable for output to a table file
# -----------------------------------------------------------------------------

def _funcs_to_names(funclist,namelist):
    result = []
    for f,name in zip(funclist,namelist):
        if f and f[0]:
            result.append((name, f[1]))
        else:
            result.append(f)
    return result

# -----------------------------------------------------------------------------
# _names_to_funcs()
#
# Given a list of regular expression function names, this converts it back to
# functions.
# -----------------------------------------------------------------------------

def _names_to_funcs(namelist,fdict):
    result = []
    for n in namelist:
        if n and n[0]:
            result.append((fdict[n[0]],n[1]))
        else:
            result.append(n)
    return result

# -----------------------------------------------------------------------------
# _form_master_re()
#
# This function takes a list of all of the regex components and attempts to
# form the master regular expression.  Given limitations in the Python re
# module, it may be necessary to break the master regex into separate expressions.
# -----------------------------------------------------------------------------

def _form_master_re(relist,reflags,ldict,toknames):
    if not relist: return []
    regex = "|".join(relist)
    try:
        lexre = re.compile(regex,re.VERBOSE | reflags)

        # Build the index to function map for the matching engine
        lexindexfunc = [ None ] * (max(lexre.groupindex.values())+1)
        lexindexnames = lexindexfunc[:]

        for f,i in lexre.groupindex.items():
            handle = ldict.get(f,None)
            if type(handle) in (types.FunctionType, types.MethodType):
                lexindexfunc[i] = (handle,toknames[f])
                lexindexnames[i] = f
            elif handle is not None:
                lexindexnames[i] = f
                if f.find("ignore_") > 0:
                    lexindexfunc[i] = (None,None)
                else:
                    lexindexfunc[i] = (None, toknames[f])

        return [(lexre,lexindexfunc)],[regex],[lexindexnames]
    except Exception:
        m = int(len(relist)/2)
        if m == 0: m = 1
        llist, lre, lnames = _form_master_re(relist[:m],reflags,ldict,toknames)
        rlist, rre, rnames = _form_master_re(relist[m:],reflags,ldict,toknames)
        return llist+rlist, lre+rre, lnames+rnames

# -----------------------------------------------------------------------------
# def _statetoken(s,names)
#
# Given a declaration name s of the form "t_..." and a dictionary whose keys are
# state names, this function returns a tuple (states,tokenname) where states
# is a tuple of state names and tokenname is the name of the token.  For example,
# calling this with s = "t_foo_bar_SPAM" might return (('foo','bar'),'SPAM')
# -----------------------------------------------------------------------------

def _statetoken(s,names):
    nonstate = 1
    parts = s.split("_")
    for i in range(1,len(parts)):
        if not parts[i] in names and parts[i] != 'ANY': break
    if i > 1:
        states = tuple(parts[1:i])
    else:
        states = ('INITIAL',)

    if 'ANY' in states:
        states = tuple(names)

    tokenname = "_".join(parts[i:])
    return (states,tokenname)


# -----------------------------------------------------------------------------
# LexerReflect()
#
# This class represents information needed to build a lexer as extracted from a
# user's input file.
# -----------------------------------------------------------------------------
class LexerReflect(object):
    def __init__(self,ldict,log=None,reflags=0):
        self.ldict      = ldict
        self.error_func = None
        self.tokens     = []
        self.reflags    = reflags
        self.stateinfo  = { 'INITIAL' : 'inclusive'}
        self.files      = {}
        self.error      = 0

        if log is None:
            self.log = PlyLogger(sys.stderr)
        else:
            self.log = log

    # Get all of the basic information
    def get_all(self):
        self.get_tokens()
        self.get_literals()
        self.get_states()
        self.get_rules()

    # Validate all of the information
    def validate_all(self):
        self.validate_tokens()
        self.validate_literals()
        self.validate_rules()
        return self.error

    # Get the tokens map
    def get_tokens(self):
        tokens = self.ldict.get("tokens",None)
        if not tokens:
            self.log.error("No token list is defined")
            self.error = 1
            return

        if not isinstance(tokens,(list, tuple)):
            self.log.error("tokens must be a list or tuple")
            self.error = 1
            return

        if not tokens:
            self.log.error("tokens is empty")
            self.error = 1
            return

        self.tokens = tokens

    # Validate the tokens
    def validate_tokens(self):
        terminals = {}
        for n in self.tokens:
            if not _is_identifier.match(n):
                self.log.error("Bad token name '%s'",n)
                self.error = 1
            if n in terminals:
                self.log.warning("Token '%s' multiply defined", n)
            terminals[n] = 1

    # Get the literals specifier
    def get_literals(self):
        self.literals = self.ldict.get("literals","")

    # Validate literals
    def validate_literals(self):
        try:
            for c in self.literals:
                if not isinstance(c,StringTypes) or len(c) > 1:
                    self.log.error("Invalid literal %s. Must be a single character", repr(c))
                    self.error = 1
                    continue

        except TypeError:
            self.log.error("Invalid literals specification. literals must be a sequence of characters")
            self.error = 1

    def get_states(self):
        self.states = self.ldict.get("states",None)
        # Build statemap
        if self.states:
            if not isinstance(self.states,(tuple,list)):
                self.log.error("states must be defined as a tuple or list")
                self.error = 1
            else:
                for s in self.states:
                    if not isinstance(s,tuple) or len(s) != 2:
                        self.log.error("Invalid state specifier %s. Must be a tuple (statename,'exclusive|inclusive')",repr(s))
                        self.error = 1
                        continue
                    name, statetype = s
                    if not isinstance(name,StringTypes):
                        self.log.error("State name %s must be a string", repr(name))
                        self.error = 1
                        continue
                    if not (statetype == 'inclusive' or statetype == 'exclusive'):
                        self.log.error("State type for state %s must be 'inclusive' or 'exclusive'",name)
                        self.error = 1
                        continue
                    if name in self.stateinfo:
                        self.log.error("State '%s' already defined",name)
                        self.error = 1
                        continue
                    self.stateinfo[name] = statetype
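
    # A sketch of the 'states' declaration that get_states() expects to find
    # in the user's module (the state names here are illustrative only):
    #
    #     states = (
    #         ('comment', 'exclusive'),    # only t_comment_* rules apply here
    #         ('verbose', 'inclusive'),    # INITIAL-state rules also apply here
    #     )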

    # Get all of the symbols with a t_ prefix and sort them into various
    # categories (functions, strings, error functions, and ignore characters)

    def get_rules(self):
        tsymbols = [f for f in self.ldict if f[:2] == 't_' ]

        # Now build up a list of functions and a list of strings

        self.toknames = { }        # Mapping of symbols to token names
        self.funcsym =  { }        # Symbols defined as functions
        self.strsym =   { }        # Symbols defined as strings
        self.ignore   = { }        # Ignore strings by state
        self.errorf   = { }        # Error functions by state

        for s in self.stateinfo:
            self.funcsym[s] = []
            self.strsym[s] = []

        if len(tsymbols) == 0:
            self.log.error("No rules of the form t_rulename are defined")
            self.error = 1
            return

        for f in tsymbols:
            t = self.ldict[f]
            states, tokname = _statetoken(f,self.stateinfo)
            self.toknames[f] = tokname

            if hasattr(t,"__call__"):
                if tokname == 'error':
                    for s in states:
                        self.errorf[s] = t
                elif tokname == 'ignore':
                    line = func_code(t).co_firstlineno
                    file = func_code(t).co_filename
                    self.log.error("%s:%d: Rule '%s' must be defined as a string",file,line,t.__name__)
                    self.error = 1
                else:
                    for s in states:
                        self.funcsym[s].append((f,t))
            elif isinstance(t, StringTypes):
                if tokname == 'ignore':
                    for s in states:
                        self.ignore[s] = t
                    if "\\" in t:
                        self.log.warning("%s contains a literal backslash '\\'",f)

                elif tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", f)
                    self.error = 1
                else:
                    for s in states:
                        self.strsym[s].append((f,t))
            else:
                self.log.error("%s not defined as a function or string", f)
                self.error = 1

        # Sort the functions by line number
        for f in self.funcsym.values():
            if sys.version_info[0] < 3:
                f.sort(lambda x,y: cmp(func_code(x[1]).co_firstlineno,func_code(y[1]).co_firstlineno))
            else:
                # Python 3.0
                f.sort(key=lambda x: func_code(x[1]).co_firstlineno)

        # Sort the strings by regular expression length
        for s in self.strsym.values():
            if sys.version_info[0] < 3:
                s.sort(lambda x,y: (len(x[1]) < len(y[1])) - (len(x[1]) > len(y[1])))
            else:
                # Python 3.0
                s.sort(key=lambda x: len(x[1]),reverse=True)

    # Validate all of the t_rules collected
    def validate_rules(self):
        for state in self.stateinfo:
            # Validate all rules defined by functions

            for fname, f in self.funcsym[state]:
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                tokname = self.toknames[fname]
                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1
                    continue

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1
                    continue

                if not f.__doc__:
                    self.log.error("%s:%d: No regular expression defined for rule '%s'",file,line,f.__name__)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (fname,f.__doc__), re.VERBOSE | self.reflags)
                    if c.match(""):
                        self.log.error("%s:%d: Regular expression for rule '%s' matches empty string", file,line,f.__name__)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("%s:%d: Invalid regular expression for rule '%s'. %s", file,line,f.__name__,e)
                    if '#' in f.__doc__:
                        self.log.error("%s:%d. Make sure '#' in rule '%s' is escaped with '\\#'",file,line, f.__name__)
                    self.error = 1

            # Validate all rules defined by strings
            for name,r in self.strsym[state]:
                tokname = self.toknames[name]
                if tokname == 'error':
                    self.log.error("Rule '%s' must be defined as a function", name)
                    self.error = 1
                    continue

                if not tokname in self.tokens and tokname.find("ignore_") < 0:
                    self.log.error("Rule '%s' defined for an unspecified token %s",name,tokname)
                    self.error = 1
                    continue

                try:
                    c = re.compile("(?P<%s>%s)" % (name,r),re.VERBOSE | self.reflags)
                    if (c.match("")):
                        self.log.error("Regular expression for rule '%s' matches empty string",name)
                        self.error = 1
                except re.error:
                    _etype, e, _etrace = sys.exc_info()
                    self.log.error("Invalid regular expression for rule '%s'. %s",name,e)
                    if '#' in r:
                        self.log.error("Make sure '#' in rule '%s' is escaped with '\\#'",name)
                    self.error = 1

            if not self.funcsym[state] and not self.strsym[state]:
                self.log.error("No rules defined for state '%s'",state)
                self.error = 1

            # Validate the error function
            efunc = self.errorf.get(state,None)
            if efunc:
                f = efunc
                line = func_code(f).co_firstlineno
                file = func_code(f).co_filename
                self.files[file] = 1

                if isinstance(f, types.MethodType):
                    reqargs = 2
                else:
                    reqargs = 1
                nargs = func_code(f).co_argcount
                if nargs > reqargs:
                    self.log.error("%s:%d: Rule '%s' has too many arguments",file,line,f.__name__)
                    self.error = 1

                if nargs < reqargs:
                    self.log.error("%s:%d: Rule '%s' requires an argument", file,line,f.__name__)
                    self.error = 1

        for f in self.files:
            self.validate_file(f)

    # -----------------------------------------------------------------------------
    # validate_file()
    #
    # This checks to see if there are duplicated t_rulename() functions or strings
    # in the lexer input file.  This is done using a simple regular expression
    # match on each line in the given file.
    # -----------------------------------------------------------------------------

    def validate_file(self,filename):
        import os.path
        base,ext = os.path.splitext(filename)
        if ext != '.py': return         # No idea what the file is. Return OK

        try:
            f = open(filename)
            lines = f.readlines()
            f.close()
        except IOError:
            return                      # Couldn't find the file.  Don't worry about it

        fre = re.compile(r'\s*def\s+(t_[a-zA-Z_0-9]*)\(')
        sre = re.compile(r'\s*(t_[a-zA-Z_0-9]*)\s*=')

        counthash = { }
        linen = 1
        for l in lines:
            m = fre.match(l)
            if not m:
                m = sre.match(l)
            if m:
                name = m.group(1)
                prev = counthash.get(name)
                if not prev:
                    counthash[name] = linen
                else:
                    self.log.error("%s:%d: Rule %s redefined. Previously defined on line %d",filename,linen,name,prev)
                    self.error = 1
            linen += 1

# -----------------------------------------------------------------------------
# lex(module)
#
# Build all of the regular expression rules from definitions in the supplied module
# -----------------------------------------------------------------------------
def lex(module=None,object=None,debug=0,optimize=0,lextab="lextab",reflags=0,nowarn=0,outputdir="", debuglog=None, errorlog=None):
    global lexer
    ldict = None
    stateinfo  = { 'INITIAL' : 'inclusive'}
    lexobj = Lexer()
    lexobj.lexoptimize = optimize
    global token,input

    if errorlog is None:
        errorlog = PlyLogger(sys.stderr)

    if debug:
        if debuglog is None:
            debuglog = PlyLogger(sys.stderr)

    # Get the module dictionary used for the lexer
    if object: module = object

    if module:
        _items = [(k,getattr(module,k)) for k in dir(module)]
        ldict = dict(_items)
    else:
        ldict = get_caller_module_dict(2)

    # Collect lexer information from the dictionary
    linfo = LexerReflect(ldict,log=errorlog,reflags=reflags)
    linfo.get_all()
    if not optimize:
        if linfo.validate_all():
            raise SyntaxError("Can't build lexer")

    if optimize and lextab:
        try:
            lexobj.readtab(lextab,ldict)
            token = lexobj.token
            input = lexobj.input
            lexer = lexobj
            return lexobj

        except ImportError:
            pass

    # Dump some basic debugging information
    if debug:
        debuglog.info("lex: tokens   = %r", linfo.tokens)
        debuglog.info("lex: literals = %r", linfo.literals)
        debuglog.info("lex: states   = %r", linfo.stateinfo)

    # Build a dictionary of valid token names
    lexobj.lextokens = { }
    for n in linfo.tokens:
        lexobj.lextokens[n] = 1

    # Get literals specification
    if isinstance(linfo.literals,(list,tuple)):
        lexobj.lexliterals = type(linfo.literals[0])().join(linfo.literals)
    else:
        lexobj.lexliterals = linfo.literals

    # Get the stateinfo dictionary
    stateinfo = linfo.stateinfo

    regexs = { }
    # Build the master regular expressions
    for state in stateinfo:
        regex_list = []

        # Add rules defined by functions first
        for fname, f in linfo.funcsym[state]:
            line = func_code(f).co_firstlineno
            file = func_code(f).co_filename
            regex_list.append("(?P<%s>%s)" % (fname,f.__doc__))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",fname,f.__doc__, state)

        # Now add all of the simple rules
        for name,r in linfo.strsym[state]:
            regex_list.append("(?P<%s>%s)" % (name,r))
            if debug:
                debuglog.info("lex: Adding rule %s -> '%s' (state '%s')",name,r, state)

        regexs[state] = regex_list

    # Build the master regular expressions

    if debug:
        debuglog.info("lex: ==== MASTER REGEXS FOLLOW ====")

    for state in regexs:
        lexre, re_text, re_names = _form_master_re(regexs[state],reflags,ldict,linfo.toknames)
        lexobj.lexstatere[state] = lexre
        lexobj.lexstateretext[state] = re_text
        lexobj.lexstaterenames[state] = re_names
        if debug:
            for i in range(len(re_text)):
                debuglog.info("lex: state '%s' : regex[%d] = '%s'",state, i, re_text[i])

    # For inclusive states, we need to add the regular expressions from the INITIAL state
    for state,stype in stateinfo.items():
        if state != "INITIAL" and stype == 'inclusive':
            lexobj.lexstatere[state].extend(lexobj.lexstatere['INITIAL'])
            lexobj.lexstateretext[state].extend(lexobj.lexstateretext['INITIAL'])
            lexobj.lexstaterenames[state].extend(lexobj.lexstaterenames['INITIAL'])

    lexobj.lexstateinfo = stateinfo
    lexobj.lexre = lexobj.lexstatere["INITIAL"]
    lexobj.lexretext = lexobj.lexstateretext["INITIAL"]
    lexobj.lexreflags = reflags

    # Set up ignore variables
    lexobj.lexstateignore = linfo.ignore
    lexobj.lexignore = lexobj.lexstateignore.get("INITIAL","")

    # Set up error functions
    lexobj.lexstateerrorf = linfo.errorf
    lexobj.lexerrorf = linfo.errorf.get("INITIAL",None)
    if not lexobj.lexerrorf:
        errorlog.warning("No t_error rule is defined")

    # Check state information for ignore and error rules
    for s,stype in stateinfo.items():
        if stype == 'exclusive':
            if not s in linfo.errorf:
                errorlog.warning("No error rule is defined for exclusive state '%s'", s)
            if not s in linfo.ignore and lexobj.lexignore:
                errorlog.warning("No ignore rule is defined for exclusive state '%s'", s)
        elif stype == 'inclusive':
            if not s in linfo.errorf:
                linfo.errorf[s] = linfo.errorf.get("INITIAL",None)
            if not s in linfo.ignore:
                linfo.ignore[s] = linfo.ignore.get("INITIAL","")

    # Create global versions of the token() and input() functions
    token = lexobj.token
    input = lexobj.input
    lexer = lexobj

    # If in optimize mode, we write the lextab
    if lextab and optimize:
        lexobj.writetab(lextab,outputdir)

    return lexobj
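
# -----------------------------------------------------------------------------
# A minimal sketch of a module that lex() can build a lexer from.  Everything
# below (token names, rules, and patterns) is illustrative, not part of PLY:
#
#     import ply.lex as lex
#
#     tokens = ('NUMBER', 'PLUS')
#
#     t_PLUS   = r'\+'                  # rules defined as plain strings
#     t_ignore = ' \t'                  # characters skipped between tokens
#
#     def t_NUMBER(t):                  # rules defined as functions take their
#         r'\d+'                        # regular expression from the docstring
#         t.value = int(t.value)
#         return t
#
#     def t_newline(t):
#         r'\n+'
#         t.lexer.lineno += len(t.value)
#
#     def t_error(t):                   # called for illegal characters; skip(1)
#         t.lexer.skip(1)               # steps past the offending character
#
#     lexer = lex.lex()
# -----------------------------------------------------------------------------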

# -----------------------------------------------------------------------------
# runmain()
#
# This runs the lexer as a main program
# -----------------------------------------------------------------------------

def runmain(lexer=None,data=None):
    if not data:
        try:
            filename = sys.argv[1]
            f = open(filename)
            data = f.read()
            f.close()
        except IndexError:
            sys.stdout.write("Reading from standard input (type EOF to end):\n")
            data = sys.stdin.read()

    if lexer:
        _input = lexer.input
    else:
        _input = input
    _input(data)
    if lexer:
        _token = lexer.token
    else:
        _token = token

    while 1:
        tok = _token()
        if not tok: break
        sys.stdout.write("(%s,%r,%d,%d)\n" % (tok.type, tok.value, tok.lineno,tok.lexpos))

# -----------------------------------------------------------------------------
# @TOKEN(regex)
#
# This decorator can be used to set the regular expression of a token function
# when its docstring needs to be set in some other way (e.g., computed at runtime)
# -----------------------------------------------------------------------------

def TOKEN(r):
    def set_doc(f):
        if hasattr(r,"__call__"):
            f.__doc__ = r.__doc__
        else:
            f.__doc__ = r
        return f
    return set_doc

# Alternative spelling of the TOKEN decorator
Token = TOKEN
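
# A sketch of typical @TOKEN use: the regular expression is built at runtime,
# so it cannot be written as a literal docstring (the names are illustrative):
#
#     digit      = r'[0-9]'
#     identifier = r'[a-zA-Z_][a-zA-Z0-9_]*'
#
#     @TOKEN(identifier)
#     def t_ID(t):
#         return t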