# ----------------------------------------------------------------------------- # sly: lex.py # # Copyright (C) 2016 - 2018 # David M. Beazley (Dabeaz LLC) # All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # * Redistributions of source code must retain the above copyright notice, # this list of conditions and the following disclaimer. # * Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # * Neither the name of the David Beazley or Dabeaz LLC may be used to # endorse or promote products derived from this software without # specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ----------------------------------------------------------------------------- __all__ = ["Lexer", "LexerStateChange"] import re import copy class LexError(Exception): """ Exception raised if an invalid character is encountered and no default error handler function is defined. The .text attribute of the exception contains all remaining untokenized text. The .error_index is the index location of the error. """ def __init__(self, message, text, error_index): self.args = (message,) self.text = text self.error_index = error_index class PatternError(Exception): """ Exception raised if there's some kind of problem with the specified regex patterns in the lexer. """ pass class LexerBuildError(Exception): """ Exception raised if there's some sort of problem building the lexer. """ pass class LexerStateChange(Exception): """ Exception raised to force a lexing state change """ def __init__(self, newstate, tok=None): self.newstate = newstate self.tok = tok class Token(object): """ Representation of a single token. """ __slots__ = ("type", "value", "lineno", "index") def __repr__(self): return f"Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})" class TokenStr(str): @staticmethod def __new__(cls, value, key=None, remap=None): self = super().__new__(cls, value) self.key = key self.remap = remap return self # Implementation of TOKEN[value] = NEWTOKEN def __setitem__(self, key, value): if self.remap is not None: self.remap[self.key, key] = value # Implementation of del TOKEN[value] def __delitem__(self, key): if self.remap is not None: self.remap[self.key, key] = self.key class _Before: def __init__(self, tok, pattern): self.tok = tok self.pattern = pattern class LexerMetaDict(dict): """ Special dictionary that prohibits duplicate definitions in lexer specifications. """ def __init__(self): self.before = {} self.delete = [] self.remap = {} def __setitem__(self, key, value): if isinstance(value, str): value = TokenStr(value, key, self.remap) if isinstance(value, _Before): self.before[key] = value.tok value = TokenStr(value.pattern, key, self.remap) if key in self and not isinstance(value, property): prior = self[key] if isinstance(prior, str): if callable(value): value.pattern = prior else: raise AttributeError(f"Name {key} redefined") super().__setitem__(key, value) def __delitem__(self, key): self.delete.append(key) if key not in self and key.isupper(): pass else: return super().__delitem__(key) def __getitem__(self, key): if key not in self and key.split("ignore_")[-1].isupper() and key[:1] != "_": return TokenStr(key, key, self.remap) else: return super().__getitem__(key) class LexerMeta(type): """ Metaclass for collecting lexing rules """ @classmethod def __prepare__(meta, name, bases): d = LexerMetaDict() def _(pattern, *extra): patterns = [pattern, *extra] def decorate(func): pattern = "|".join(f"({pat})" for pat in patterns) if hasattr(func, "pattern"): func.pattern = pattern + "|" + func.pattern else: func.pattern = pattern return func return decorate d["_"] = _ d["before"] = _Before return d def __new__(meta, clsname, bases, attributes): del attributes["_"] del attributes["before"] # Create attributes for use in the actual class body cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val for key, val in attributes.items() } cls = super().__new__(meta, clsname, bases, cls_attributes) # Attach various metadata to the class cls._attributes = dict(attributes) cls._remap = attributes.remap cls._before = attributes.before cls._delete = attributes.delete cls._build() return cls class Lexer(metaclass=LexerMeta): # These attributes may be defined in subclasses tokens = set() literals = set() ignore = "" reflags = 0 regex_module = re _token_names = set() _token_funcs = {} _ignored_tokens = set() _remapping = {} _delete = {} _remap = {} # Internal attributes __state_stack = None __set_state = None @classmethod def _collect_rules(cls): # Collect all of the rules from class definitions that look like token # information. There are a few things that govern this: # # 1. Any definition of the form NAME = str is a token if NAME is # is defined in the tokens set. # # 2. Any definition of the form ignore_NAME = str is a rule for an ignored # token. # # 3. Any function defined with a 'pattern' attribute is treated as a rule. # Such functions can be created with the @_ decorator or by defining # function with the same name as a previously defined string. # # This function is responsible for keeping rules in order. # Collect all previous rules from base classes rules = [] for base in cls.__bases__: if isinstance(base, LexerMeta): rules.extend(base._rules) # Dictionary of previous rules existing = dict(rules) for key, value in cls._attributes.items(): if ( (key in cls._token_names) or key.startswith("ignore_") or hasattr(value, "pattern") ): if callable(value) and not hasattr(value, "pattern"): raise LexerBuildError( f"function {value} doesn't have a regex pattern" ) if key in existing: # The definition matches something that already existed in the base class. # We replace it, but keep the original ordering n = rules.index((key, existing[key])) rules[n] = (key, value) existing[key] = value elif isinstance(value, TokenStr) and key in cls._before: before = cls._before[key] if before in existing: # Position the token before another specified token n = rules.index((before, existing[before])) rules.insert(n, (key, value)) else: # Put at the end of the rule list rules.append((key, value)) existing[key] = value else: rules.append((key, value)) existing[key] = value elif ( isinstance(value, str) and not key.startswith("_") and key not in {"ignore", "literals"} ): raise LexerBuildError(f"{key} does not match a name in tokens") # Apply deletion rules rules = [(key, value) for key, value in rules if key not in cls._delete] cls._rules = rules @classmethod def _build(cls): """ Build the lexer object from the collected tokens and regular expressions. Validate the rules to make sure they look sane. """ if "tokens" not in vars(cls): raise LexerBuildError( f"{cls.__qualname__} class does not define a tokens attribute" ) # Pull definitions created for any parent classes cls._token_names = cls._token_names | set(cls.tokens) cls._ignored_tokens = set(cls._ignored_tokens) cls._token_funcs = dict(cls._token_funcs) cls._remapping = dict(cls._remapping) for (key, val), newtok in cls._remap.items(): if key not in cls._remapping: cls._remapping[key] = {} cls._remapping[key][val] = newtok remapped_toks = set() for d in cls._remapping.values(): remapped_toks.update(d.values()) undefined = remapped_toks - set(cls._token_names) if undefined: missing = ", ".join(undefined) raise LexerBuildError(f"{missing} not included in token(s)") cls._collect_rules() parts = [] for tokname, value in cls._rules: if tokname.startswith("ignore_"): tokname = tokname[7:] cls._ignored_tokens.add(tokname) if isinstance(value, str): pattern = value elif callable(value): cls._token_funcs[tokname] = value pattern = getattr(value, "pattern") # Form the regular expression component part = f"(?P<{tokname}>{pattern})" # Make sure the individual regex compiles properly try: cpat = cls.regex_module.compile(part, cls.reflags) except Exception as e: raise PatternError(f"Invalid regex for token {tokname}") from e # Verify that the pattern doesn't match the empty string if cpat.match(""): raise PatternError(f"Regex for token {tokname} matches empty input") parts.append(part) if not parts: return # Form the master regular expression # previous = ('|' + cls._master_re.pattern) if cls._master_re else '' # cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags) cls._master_re = cls.regex_module.compile("|".join(parts), cls.reflags) # Verify that that ignore and literals specifiers match the input type if not isinstance(cls.ignore, str): raise LexerBuildError("ignore specifier must be a string") if not all(isinstance(lit, str) for lit in cls.literals): raise LexerBuildError("literals must be specified as strings") def begin(self, cls): """ Begin a new lexer state """ assert isinstance(cls, LexerMeta), "state must be a subclass of Lexer" if self.__set_state: self.__set_state(cls) self.__class__ = cls def push_state(self, cls): """ Push a new lexer state onto the stack """ if self.__state_stack is None: self.__state_stack = [] self.__state_stack.append(type(self)) self.begin(cls) def pop_state(self): """ Pop a lexer state from the stack """ self.begin(self.__state_stack.pop()) def tokenize(self, text, lineno=1, index=0): _ignored_tokens = ( _master_re ) = _ignore = _token_funcs = _literals = _remapping = None def _set_state(cls): nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping _ignored_tokens = cls._ignored_tokens _master_re = cls._master_re _ignore = cls.ignore _token_funcs = cls._token_funcs _literals = cls.literals _remapping = cls._remapping self.__set_state = _set_state _set_state(type(self)) self.text = text try: while True: try: if text[index] in _ignore: index += 1 continue except IndexError: return tok = Token() tok.lineno = lineno tok.index = index m = _master_re.match(text, index) if m: index = m.end() tok.value = m.group() tok.type = m.lastgroup if tok.type in _remapping: tok.type = _remapping[tok.type].get(tok.value, tok.type) if tok.type in _token_funcs: self.index = index self.lineno = lineno tok = _token_funcs[tok.type](self, tok) index = self.index lineno = self.lineno if not tok: continue if tok.type in _ignored_tokens: continue yield tok else: # No match, see if the character is in literals if text[index] in _literals: tok.value = text[index] tok.type = tok.value index += 1 yield tok else: # A lexing error self.index = index self.lineno = lineno tok.type = "ERROR" tok.value = text[index:] tok = self.error(tok) if tok is not None: yield tok index = self.index lineno = self.lineno # Set the final state of the lexer before exiting (even if exception) finally: self.text = text self.index = index self.lineno = lineno # Default implementations of the error handler. May be changed in subclasses def error(self, t): raise LexError( f"Illegal character {t.value[0]!r} at index {self.index}", t.value, self.index, )