Diffstat (limited to 'lib/sly/lex.py')
-rw-r--r-- | lib/sly/lex.py | 178
1 file changed, 109 insertions, 69 deletions
diff --git a/lib/sly/lex.py b/lib/sly/lex.py
index 246dd9e..0ab0160 100644
--- a/lib/sly/lex.py
+++ b/lib/sly/lex.py
@@ -31,51 +31,63 @@
 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 # -----------------------------------------------------------------------------

-__all__ = ['Lexer', 'LexerStateChange']
+__all__ = ["Lexer", "LexerStateChange"]

 import re
 import copy

+
 class LexError(Exception):
-    '''
+    """
     Exception raised if an invalid character is encountered and no default
     error handler function is defined. The .text attribute of the exception
     contains all remaining untokenized text. The .error_index is the index
     location of the error.
-    '''
+    """
+
     def __init__(self, message, text, error_index):
         self.args = (message,)
         self.text = text
         self.error_index = error_index

+
 class PatternError(Exception):
-    '''
+    """
     Exception raised if there's some kind of problem with the specified
     regex patterns in the lexer.
-    '''
+    """
+
     pass

+
 class LexerBuildError(Exception):
-    '''
+    """
     Exception raised if there's some sort of problem building the lexer.
-    '''
+    """
+
     pass

+
 class LexerStateChange(Exception):
-    '''
+    """
     Exception raised to force a lexing state change
-    '''
+    """
+
     def __init__(self, newstate, tok=None):
         self.newstate = newstate
         self.tok = tok

+
 class Token(object):
-    '''
+    """
     Representation of a single token.
-    '''
-    __slots__ = ('type', 'value', 'lineno', 'index')
+    """
+
+    __slots__ = ("type", "value", "lineno", "index")
+
     def __repr__(self):
-        return f'Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})'
+        return f"Token(type={self.type!r}, value={self.value!r}, lineno={self.lineno}, index={self.index})"
+

 class TokenStr(str):
     @staticmethod
@@ -95,35 +107,38 @@ class TokenStr(str):
         if self.remap is not None:
             self.remap[self.key, key] = self.key

+
 class _Before:
     def __init__(self, tok, pattern):
         self.tok = tok
         self.pattern = pattern

+
 class LexerMetaDict(dict):
-    '''
+    """
     Special dictionary that prohibits duplicate definitions in lexer specifications.
-    '''
+    """
+
     def __init__(self):
-        self.before = { }
-        self.delete = [ ]
-        self.remap = { }
+        self.before = {}
+        self.delete = []
+        self.remap = {}

     def __setitem__(self, key, value):
         if isinstance(value, str):
             value = TokenStr(value, key, self.remap)
-
+
         if isinstance(value, _Before):
             self.before[key] = value.tok
             value = TokenStr(value.pattern, key, self.remap)
-
+
         if key in self and not isinstance(value, property):
             prior = self[key]
             if isinstance(prior, str):
                 if callable(value):
                     value.pattern = prior
                 else:
-                    raise AttributeError(f'Name {key} redefined')
+                    raise AttributeError(f"Name {key} redefined")

         super().__setitem__(key, value)

@@ -135,41 +150,47 @@ class LexerMetaDict(dict):
             return super().__delitem__(key)

     def __getitem__(self, key):
-        if key not in self and key.split('ignore_')[-1].isupper() and key[:1] != '_':
+        if key not in self and key.split("ignore_")[-1].isupper() and key[:1] != "_":
             return TokenStr(key, key, self.remap)
         else:
             return super().__getitem__(key)

+
 class LexerMeta(type):
-    '''
+    """
     Metaclass for collecting lexing rules
-    '''
+    """
+
     @classmethod
     def __prepare__(meta, name, bases):
         d = LexerMetaDict()

         def _(pattern, *extra):
             patterns = [pattern, *extra]
+
             def decorate(func):
-                pattern = '|'.join(f'({pat})' for pat in patterns )
-                if hasattr(func, 'pattern'):
-                    func.pattern = pattern + '|' + func.pattern
+                pattern = "|".join(f"({pat})" for pat in patterns)
+                if hasattr(func, "pattern"):
+                    func.pattern = pattern + "|" + func.pattern
                 else:
                     func.pattern = pattern
                 return func
+
             return decorate

-        d['_'] = _
-        d['before'] = _Before
+        d["_"] = _
+        d["before"] = _Before
         return d

     def __new__(meta, clsname, bases, attributes):
-        del attributes['_']
-        del attributes['before']
+        del attributes["_"]
+        del attributes["before"]

         # Create attributes for use in the actual class body
-        cls_attributes = { str(key): str(val) if isinstance(val, TokenStr) else val
-                           for key, val in attributes.items() }
+        cls_attributes = {
+            str(key): str(val) if isinstance(val, TokenStr) else val
+            for key, val in attributes.items()
+        }
         cls = super().__new__(meta, clsname, bases, cls_attributes)

         # Attach various metadata to the class
@@ -180,11 +201,12 @@ class LexerMeta(type):
         cls._build()
         return cls

+
 class Lexer(metaclass=LexerMeta):
     # These attributes may be defined in subclasses
     tokens = set()
     literals = set()
-    ignore = ''
+    ignore = ""
     reflags = 0
     regex_module = re

@@ -214,7 +236,7 @@ class Lexer(metaclass=LexerMeta):
         # Such functions can be created with the @_ decorator or by defining
         # function with the same name as a previously defined string.
         #
-        # This function is responsible for keeping rules in order. 
+        # This function is responsible for keeping rules in order.

         # Collect all previous rules from base classes
         rules = []
@@ -222,15 +244,21 @@
         for base in cls.__bases__:
             if isinstance(base, LexerMeta):
                 rules.extend(base._rules)
-
+
         # Dictionary of previous rules
         existing = dict(rules)

         for key, value in cls._attributes.items():
-            if (key in cls._token_names) or key.startswith('ignore_') or hasattr(value, 'pattern'):
-                if callable(value) and not hasattr(value, 'pattern'):
-                    raise LexerBuildError(f"function {value} doesn't have a regex pattern")
-
+            if (
+                (key in cls._token_names)
+                or key.startswith("ignore_")
+                or hasattr(value, "pattern")
+            ):
+                if callable(value) and not hasattr(value, "pattern"):
+                    raise LexerBuildError(
+                        f"function {value} doesn't have a regex pattern"
+                    )
+
                 if key in existing:
                     # The definition matches something that already existed in the base class.
                     # We replace it, but keep the original ordering
@@ -252,21 +280,27 @@
                     rules.append((key, value))
                     existing[key] = value

-            elif isinstance(value, str) and not key.startswith('_') and key not in {'ignore', 'literals'}:
-                raise LexerBuildError(f'{key} does not match a name in tokens')
+            elif (
+                isinstance(value, str)
+                and not key.startswith("_")
+                and key not in {"ignore", "literals"}
+            ):
+                raise LexerBuildError(f"{key} does not match a name in tokens")

         # Apply deletion rules
-        rules = [ (key, value) for key, value in rules if key not in cls._delete ]
+        rules = [(key, value) for key, value in rules if key not in cls._delete]
         cls._rules = rules

     @classmethod
     def _build(cls):
-        '''
+        """
         Build the lexer object from the collected tokens and regular expressions.
         Validate the rules to make sure they look sane.
-        '''
-        if 'tokens' not in vars(cls):
-            raise LexerBuildError(f'{cls.__qualname__} class does not define a tokens attribute')
+        """
+        if "tokens" not in vars(cls):
+            raise LexerBuildError(
+                f"{cls.__qualname__} class does not define a tokens attribute"
+            )

         # Pull definitions created for any parent classes
         cls._token_names = cls._token_names | set(cls.tokens)
@@ -282,17 +316,17 @@
         remapped_toks = set()
         for d in cls._remapping.values():
             remapped_toks.update(d.values())
-
+
         undefined = remapped_toks - set(cls._token_names)
         if undefined:
-            missing = ', '.join(undefined)
-            raise LexerBuildError(f'{missing} not included in token(s)')
+            missing = ", ".join(undefined)
+            raise LexerBuildError(f"{missing} not included in token(s)")

         cls._collect_rules()

         parts = []
         for tokname, value in cls._rules:
-            if tokname.startswith('ignore_'):
+            if tokname.startswith("ignore_"):
                 tokname = tokname[7:]
                 cls._ignored_tokens.add(tokname)

@@ -301,20 +335,20 @@

             elif callable(value):
                 cls._token_funcs[tokname] = value
-                pattern = getattr(value, 'pattern')
+                pattern = getattr(value, "pattern")

             # Form the regular expression component
-            part = f'(?P<{tokname}>{pattern})'
+            part = f"(?P<{tokname}>{pattern})"

             # Make sure the individual regex compiles properly
             try:
                 cpat = cls.regex_module.compile(part, cls.reflags)
             except Exception as e:
-                raise PatternError(f'Invalid regex for token {tokname}') from e
+                raise PatternError(f"Invalid regex for token {tokname}") from e

             # Verify that the pattern doesn't match the empty string
-            if cpat.match(''):
-                raise PatternError(f'Regex for token {tokname} matches empty input')
+            if cpat.match(""):
+                raise PatternError(f"Regex for token {tokname} matches empty input")

             parts.append(part)

@@ -322,43 +356,45 @@
             return

         # Form the master regular expression
-        #previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
+        # previous = ('|' + cls._master_re.pattern) if cls._master_re else ''
         # cls._master_re = cls.regex_module.compile('|'.join(parts) + previous, cls.reflags)
-        cls._master_re = cls.regex_module.compile('|'.join(parts), cls.reflags)
+        cls._master_re = cls.regex_module.compile("|".join(parts), cls.reflags)

         # Verify that that ignore and literals specifiers match the input type
         if not isinstance(cls.ignore, str):
-            raise LexerBuildError('ignore specifier must be a string')
+            raise LexerBuildError("ignore specifier must be a string")

         if not all(isinstance(lit, str) for lit in cls.literals):
-            raise LexerBuildError('literals must be specified as strings')
+            raise LexerBuildError("literals must be specified as strings")

     def begin(self, cls):
-        '''
+        """
         Begin a new lexer state
-        '''
+        """
         assert isinstance(cls, LexerMeta), "state must be a subclass of Lexer"
         if self.__set_state:
             self.__set_state(cls)
         self.__class__ = cls

     def push_state(self, cls):
-        '''
+        """
         Push a new lexer state onto the stack
-        '''
+        """
         if self.__state_stack is None:
             self.__state_stack = []
         self.__state_stack.append(type(self))
         self.begin(cls)

     def pop_state(self):
-        '''
+        """
         Pop a lexer state from the stack
-        '''
+        """
         self.begin(self.__state_stack.pop())

     def tokenize(self, text, lineno=1, index=0):
-        _ignored_tokens = _master_re = _ignore = _token_funcs = _literals = _remapping = None
+        _ignored_tokens = (
+            _master_re
+        ) = _ignore = _token_funcs = _literals = _remapping = None

         def _set_state(cls):
             nonlocal _ignored_tokens, _master_re, _ignore, _token_funcs, _literals, _remapping
@@ -419,7 +455,7 @@ class Lexer(metaclass=LexerMeta):
                     # A lexing error
                     self.index = index
                     self.lineno = lineno
-                    tok.type = 'ERROR'
+                    tok.type = "ERROR"
                     tok.value = text[index:]
                     tok = self.error(tok)
                     if tok is not None:
@@ -436,4 +472,8 @@ class Lexer(metaclass=LexerMeta):

     # Default implementations of the error handler. May be changed in subclasses
     def error(self, t):
-        raise LexError(f'Illegal character {t.value[0]!r} at index {self.index}', t.value, self.index)
+        raise LexError(
+            f"Illegal character {t.value[0]!r} at index {self.index}",
+            t.value,
+            self.index,
+        )
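For orientation, the file being reformatted above implements SLY's class-based lexer machinery (the LexerMeta metaclass collects token rules, Lexer._build() compiles them into one master regex, and Lexer.tokenize() yields Token objects). The following is a minimal sketch of how that machinery is used; the CalcLexer class, its token names, and the input string are invented for illustration, and it assumes this vendored copy is importable as the sly package:

    from sly import Lexer

    class CalcLexer(Lexer):
        # Token names; LexerMetaDict turns the matching uppercase attribute
        # and function definitions below into regex rules.
        tokens = { NAME, NUMBER, PLUS }

        # Characters skipped between tokens (validated in Lexer._build()).
        ignore = ' \t'

        # Simple tokens defined as plain regex strings.
        PLUS = r'\+'
        NAME = r'[a-zA-Z_][a-zA-Z0-9_]*'

        # A token defined as a function via the @_ decorator injected by
        # LexerMeta.__prepare__(); the pattern is attached to the function
        # and the body can post-process the matched value.
        @_(r'\d+')
        def NUMBER(self, t):
            t.value = int(t.value)
            return t

        # Overrides the default Lexer.error(), which raises LexError.
        def error(self, t):
            print(f"Illegal character {t.value[0]!r}")
            self.index += 1

    if __name__ == '__main__':
        lexer = CalcLexer()
        for tok in lexer.tokenize('x + 42'):
            print(tok)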