import re
from collections import namedtuple
import pyparsing as pp
from six.moves import reduce
# (start, end) pair; `parse_position` stores one on a match under "pos"
Position = namedtuple('Position', ['start', 'end'])
def keep_pos(expr):
    """Wrap a pyparsing grammar so that its matches carry a "pos" attribute
    describing position information.

    Two `Empty` markers, each of which reports the current parse location,
    are placed on either side of `expr` under the names "pos_start" and
    "pos_end". `parse_position` is attached as the parse action of the
    combined grammar to turn those two locations into a single "pos"."""
    def report_location(s, loc, t):
        return loc

    start_marker = pp.Empty().setParseAction(report_location)
    end_marker = start_marker.copy()
    # Disable pre-parsing on the closing marker: don't allow the cursor to
    # move before the end location is recorded
    end_marker.callPreparse = False

    bracketed = (
        start_marker.setResultsName("pos_start") +
        expr +
        end_marker.setResultsName("pos_end")
    )
    return bracketed.setParseAction(parse_position)
def parse_position(source, location, tokens):
    """Pyparsing parse action which collapses the "pos_start" and "pos_end"
    entries of `tokens` into a single "pos" entry.

    The first and last tokens, as well as both named results, are removed
    from `tokens`; a `Position(start, end)` built from their values is then
    stored under "pos". The same (mutated) `tokens` object is returned.
    `source` and `location` are unused."""
    pos_start = tokens['pos_start']
    pos_end = tokens['pos_end']
    # Remove the marker tokens by index first, then their named results
    del tokens[0]
    del tokens[-1]
    for marker_name in ('pos_start', 'pos_end'):
        del tokens[marker_name]
    tokens['pos'] = Position(pos_start, pos_end)
    return tokens
class DocLiteral(pp.Literal):
    """A `Literal` whose `name` is the provided (ascii) text. Setting an
    object's name to a unicode string causes Sphinx to freak out, so we
    substitute the ascii stand-in instead."""

    def __init__(self, literal, ascii_text):
        """Build the literal from `literal`, then use `ascii_text` as its
        name."""
        super(DocLiteral, self).__init__(literal)
        self.name = ascii_text
def WordBoundaries(grammar):    # noqa - we treat this like a pyparsing class
    """Surround `grammar` with a `WordStart` and a `WordEnd`, both built
    from pyparsing's `alphanums`."""
    leading_edge = pp.WordStart(pp.alphanums)
    trailing_edge = pp.WordEnd(pp.alphanums)
    return leading_edge + grammar + trailing_edge
def Marker(txt):    # noqa - we treat this like a pyparsing class
    """Build a suppressed, caseless literal for `txt` which is wrapped in
    `WordBoundaries`."""
    caseless = pp.CaselessLiteral(txt)
    return pp.Suppress(WordBoundaries(caseless))
def SuffixMarker(txt):  # noqa - we treat this like a pyparsing class
    """Build a suppressed, caseless literal for `txt` which must be followed
    by a `WordEnd` (based on `alphanums`). Unlike `Marker`, nothing is
    required in front of the literal."""
    suffix = pp.CaselessLiteral(txt) + pp.WordEnd(pp.alphanums)
    return pp.Suppress(suffix)
class QuickSearchable(pp.ParseElementEnhance):
    """Pyparsing's `scanString` (i.e. searching for a grammar over a string)
    tests each index within its search string. While that offers maximum
    flexibility, it is rather slow for our needs. This enhanced grammar type
    wraps other grammars, deriving from them a first regular expression to use
    when `scanString`ing. This cuts search time considerably."""

    # Registry of case functions, filled in by the `case` and `and_case`
    # decorators and consulted in registration order by `initial_regex`
    cases = []

    def __init__(self, expr, force_regex_str=None):
        """Wrap `expr`, compiling the regex used to locate candidate match
        positions.

        :param expr: the pyparsing grammar to wrap
        :param force_regex_str: if provided, use this string as the search
            regex rather than deriving one from `expr`
        """
        super(QuickSearchable, self).__init__(expr)
        if force_regex_str is not None:
            regex_strs = [force_regex_str]
        else:
            # If a regex includes an "or", we need to wrap it in parens
            # before it is joined with the others
            regex_strs = [
                '(' + regex_str + ')' if '|' in regex_str else regex_str
                for regex_str in QuickSearchable.initial_regex(expr)
            ]

        # Combine all potential initial_regexes with an "or". Match
        # Pyparsing's naming convention
        self.reString = '|'.join(regex_strs)
        self.re = re.compile(
            self.reString,
            # Be as forgiving as possible with flags; false negatives aren't
            # acceptable but false positives are fine
            re.IGNORECASE | re.UNICODE | re.MULTILINE | re.DOTALL)
        # Parsing itself is delegated to the wrapped expression
        self.parseImpl = expr.parseImpl

    def scanString(self, instring, maxMatches=None, overlap=False):  # noqa
        """Override `scanString` to attempt parsing only where there's a regex
        search match (as opposed to every index). Does not implement the full
        scanString interface.

        Yields `(tokens, start, end)` triples for each non-empty parse.

        :raises ValueError: if `maxMatches` or `overlap` is requested. As
            this is a generator, the error surfaces when iteration begins
        """
        if maxMatches is not None or overlap:
            raise ValueError("QuickSearchable does not implement the full "
                             "scanString interface")

        end_idx = len(instring)
        search_idx = 0
        while search_idx < end_idx:
            match = self.re.search(instring, search_idx)
            if match is None:
                # No further candidates anywhere in the string
                break

            match_start = match.start()
            try:
                pre_loc = self.expr.preParse(instring, match_start)
                next_loc, tokens = self.expr._parse(
                    instring, match_start, callPreParse=False)
            except pp.ParseException:
                # Regex false positive; resume just past this candidate
                search_idx = match_start + 1
                continue

            if next_loc > match_start:
                yield tokens, pre_loc, next_loc
                search_idx = next_loc
            else:
                # Zero-width parse: step past the candidate so we don't
                # re-find (and re-parse) the very same match
                search_idx = match_start + 1

    @classmethod
    def initial_regex(cls, grammar):
        """Given a Pyparsing grammar, derive a set of suitable initial regular
        expressions to aid our search. As grammars may `Or` together multiple
        sub-expressions, this always returns a `set` of possible regular
        expression strings. This is _not_ a complete conversion to regexes nor
        does it account for every Pyparsing element; add as needed

        :raises Exception: if no registered case matches `grammar`
        """
        for case in cls.cases:
            if case.matches(grammar):
                return case(grammar)
        # Grammar type that we've not accounted for. Fail fast
        raise Exception("Unknown grammar type: {0}".format(grammar.__class__))

    @classmethod
    def case(cls, *match_classes):
        """Add a "case" which will match grammars based on the provided
        class types. If there's a match, we'll execute the function"""
        def inner(process_fn):
            process_fn.matches = lambda g: isinstance(g, match_classes)
            cls.cases.append(process_fn)
            return process_fn
        return inner

    @classmethod
    def and_case(cls, *first_classes):
        """"And" grammars are relatively common; while we generally just want
        to look at their first terms, this decorator lets us describe special
        cases based on the class type of the first component of the clause"""
        def inner(process_fn):
            process_fn.matches = (
                lambda g: isinstance(g, pp.And) and
                isinstance(g.exprs[0], first_classes))
            cls.cases.append(process_fn)
            return process_fn
        return inner
@QuickSearchable.and_case(pp.WordStart)
def wordstart(grammar):
    """Optimization: WordStart is generally followed by a more specific
    identifier. Rather than searching for the start of a word alone, prefix
    each of that identifier's initial regexes with a negative lookbehind
    built from the WordStart's word characters"""
    boundary, following = grammar.exprs[:2]
    escaped_chars = ''.join(map(re.escape, boundary.wordChars))
    not_after_word_char = '(?<![{0}])'.format(escaped_chars)
    return {not_after_word_char + regex_str
            for regex_str in QuickSearchable.initial_regex(following)}
@QuickSearchable.and_case(pp.Optional)
def optional(grammar):
    """An "And" which leads with an Optional may start with either the
    optional's inner expression or with the second term of the clause, so
    the initial regexes of both are combined"""
    leading_optional = grammar.exprs[0]
    if_present = QuickSearchable.initial_regex(leading_optional.expr)
    if_absent = QuickSearchable.initial_regex(grammar.exprs[1])
    return if_present.union(if_absent)
@QuickSearchable.and_case(pp.Empty)
def empty(grammar):
    """For an "And" which leads with an Empty, use the initial regexes of
    the clause's second term instead"""
    following = grammar.exprs[1]
    return QuickSearchable.initial_regex(following)
@QuickSearchable.case(pp.And)
def match_and(grammar):
    """General "And" case: use the initial regexes of the clause's first
    term"""
    leading = grammar.exprs[0]
    return QuickSearchable.initial_regex(leading)
@QuickSearchable.case(pp.MatchFirst, pp.Or)
def match_or(grammar):
    """Any of the alternatives may begin a match, so gather the initial
    regexes of every sub-expression into a single set"""
    def include(collected, alternative):
        return collected | QuickSearchable.initial_regex(alternative)

    return reduce(include, grammar.exprs, set())
@QuickSearchable.case(pp.Suppress)
def suppress(grammar):
    """Look through a Suppress to the initial regexes of the expression it
    wraps"""
    wrapped = grammar.expr
    return QuickSearchable.initial_regex(wrapped)
@QuickSearchable.case(pp.Regex, pp.Word, QuickSearchable)
def has_re_string(grammar):
    """These grammar types expose their regex via `reString`; use it
    directly as the only initial regex"""
    pattern = grammar.reString
    return {pattern}
@QuickSearchable.case(pp.LineStart)
def line_start(grammar):
    """A LineStart is searched for with the "^" anchor"""
    anchor = '^'
    return {anchor}
@QuickSearchable.case(pp.Literal)
def literal(grammar):
    """Search for a Literal's text verbatim, escaping any characters which
    are special within a regex"""
    escaped_text = re.escape(grammar.match)
    return {escaped_text}