Source code for regparser.grammar.utils

import re
from collections import namedtuple

import pyparsing as pp
from six.moves import reduce

Position = namedtuple('Position', ['start', 'end'])


[docs]def keep_pos(expr): """Transform a pyparsing grammar by inserting an attribute, "pos", on the match which describes position information""" loc_marker = pp.Empty().setParseAction(lambda s, loc, t: loc) end_loc_marker = loc_marker.copy() end_loc_marker.callPreparse = False # don't allow the cursor to move return ( loc_marker.setResultsName("pos_start") + expr + end_loc_marker.setResultsName("pos_end") ).setParseAction(parse_position)
[docs]def parse_position(source, location, tokens): """A pyparsing parse action which pulls out (and removes) the position information and replaces it with a Position object""" start, end = tokens['pos_start'], tokens['pos_end'] del tokens[0] del tokens[-1] del tokens['pos_start'] del tokens['pos_end'] tokens['pos'] = Position(start, end) return tokens
[docs]class DocLiteral(pp.Literal): """Setting an objects name to a unicode string causes Sphinx to freak out. Instead, we'll replace with the provided (ascii) text.""" def __init__(self, literal, ascii_text): super(DocLiteral, self).__init__(literal) self.name = ascii_text
[docs]def WordBoundaries(grammar): # noqa - we treat this like a pyparsing class return (pp.WordStart(pp.alphanums) + grammar + pp.WordEnd(pp.alphanums))
[docs]def Marker(txt): # noqa - we treat this like a pyparsing class return pp.Suppress(WordBoundaries(pp.CaselessLiteral(txt)))
[docs]def SuffixMarker(txt): # noqa - we treat this like a pyparsing class return pp.Suppress(pp.CaselessLiteral(txt) + pp.WordEnd(pp.alphanums))
[docs]class QuickSearchable(pp.ParseElementEnhance): """Pyparsing's `scanString` (i.e. searching for a grammar over a string) tests each index within its search string. While that offers maximum flexibility, it is rather slow for our needs. This enhanced grammar type wraps other grammars, deriving from them a first regular expression to use when `scanString`ing. This cuts search time considerably.""" cases = [] def __init__(self, expr, force_regex_str=None): super(QuickSearchable, self).__init__(expr) regex_strs = [] if force_regex_str is not None: regex_strs.append(force_regex_str) else: for regex_str in QuickSearchable.initial_regex(expr): if '|' in regex_str: # If the regex includes an "or", we need to wrap it in # parens regex_str = '(' + regex_str + ')' regex_strs.append(regex_str) # Combine all potential initial_regexes with an "or". Match # Pyparsing's naming convention self.reString = '|'.join(regex_strs) self.re = re.compile( self.reString, # Be as forgiving as possible with flags; false negatives aren't # acceptable but false positives are fine re.IGNORECASE | re.UNICODE | re.MULTILINE | re.DOTALL) self.parseImpl = expr.parseImpl
[docs] def scanString(self, instring, maxMatches=None, overlap=False): # noqa """Override `scanString` to attempt parsing only where there's a regex search match (as opposed to every index). Does not implement the full scanString interface.""" if maxMatches is not None or overlap: raise ValueError("QuickScannable does not implement the full " "scanString interface") search_idx = 0 while search_idx < len(instring): match = self.re.search(instring, search_idx) if match: try: pre_loc = self.expr.preParse(instring, match.start()) next_loc, tokens = self.expr._parse( instring, match.start(), callPreParse=False) if next_loc > match.start(): yield tokens, pre_loc, next_loc search_idx = next_loc else: search_idx += 1 except pp.ParseException: search_idx = match.start() + 1 else: search_idx = len(instring)
@classmethod
[docs] def initial_regex(cls, grammar): """Given a Pyparsing grammar, derive a set of suitable initial regular expressions to aid our search. As grammars may `Or` together multiple sub-expressions, this always returns a `set` of possible regular expression strings. This is _not_ a complete conversion to regexes nor does it account for every Pyparsing element; add as needed""" for case in cls.cases: if case.matches(grammar): return case(grammar) # Grammar type that we've not accounted for. Fail fast raise Exception("Unknown grammar type: {0}".format(grammar.__class__))
@classmethod
[docs] def case(cls, *match_classes): """Add a "case" which will match grammars based on the provided class types. If there's a match, we'll execute the function""" def inner(process_fn): process_fn.matches = lambda g: isinstance(g, match_classes) cls.cases.append(process_fn) return process_fn return inner
@classmethod
[docs] def and_case(cls, *first_classes): """"And" grammars are relatively common; while we generally just want to look at their first terms, this decorator lets us describe special cases based on the class type of the first component of the clause""" def inner(process_fn): process_fn.matches = (lambda g: isinstance(g, pp.And) and isinstance(g.exprs[0], first_classes)) cls.cases.append(process_fn) return process_fn return inner
@QuickSearchable.and_case(pp.WordStart)
[docs]def wordstart(grammar): """Optimization: WordStart is generally followed by a more specific identifier. Rather than searching for the start of a word alone, search for that identifier as well""" boundry, next_expr = grammar.exprs[:2] word_chars = ''.join(re.escape(char) for char in boundry.wordChars) return {'(?<![{0}])'.format(word_chars) + regex_str for regex_str in QuickSearchable.initial_regex(next_expr)}
@QuickSearchable.and_case(pp.Optional)
[docs]def optional(grammar): with_grammar = QuickSearchable.initial_regex(grammar.exprs[0].expr) without_grammar = QuickSearchable.initial_regex(grammar.exprs[1]) return with_grammar | without_grammar
@QuickSearchable.and_case(pp.Empty)
[docs]def empty(grammar): return QuickSearchable.initial_regex(grammar.exprs[1])
@QuickSearchable.case(pp.And)
[docs]def match_and(grammar): return QuickSearchable.initial_regex(grammar.exprs[0])
@QuickSearchable.case(pp.MatchFirst, pp.Or)
[docs]def match_or(grammar): return reduce( lambda so_far, expr: so_far | QuickSearchable.initial_regex(expr), grammar.exprs, set() )
@QuickSearchable.case(pp.Suppress)
[docs]def suppress(grammar): return QuickSearchable.initial_regex(grammar.expr)
@QuickSearchable.case(pp.Regex, pp.Word, QuickSearchable)
[docs]def has_re_string(grammar): return {grammar.reString}
@QuickSearchable.case(pp.LineStart)
[docs]def line_start(grammar): return {'^'}
@QuickSearchable.case(pp.Literal)
[docs]def literal(grammar): return {re.escape(grammar.match)}