# Source code for regparser.layer.layer

import abc
from collections import defaultdict, namedtuple

import six

# Record describing one distinct piece of matched text in the
# "search-replace" encoding:
#   text           -- the matched substring itself
#   locations      -- which occurrences of `text` (0-based, counted in order
#                     of appearance in the searched string) were matched
#   representative -- one of the original match objects for this text
SearchReplace = namedtuple('SearchReplace', 'text locations representative')


class Layer(six.with_metaclass(abc.ABCMeta)):
    """Base class for all of the Layer generators. Defines the interface they
    must implement: a `shorthand` identifier and a `process` method. `build`
    walks the tree and collects the per-node results into `self.layer`."""
    def __init__(self, tree, **context):
        """Different layers may need different contextual information, such as
        which version of a regulation is being processed, which CFR title is
        under inspection, etc. We'd like to call the constructor of each
        different layer in the same way (so we can just iterate over all
        layers), so we silently eat all kwargs"""
        self.tree = tree
        # Maps node.label_id() -> whatever `process` returned for that node
        self.layer = {}

    def pre_process(self):
        """ Take the whole tree and do any pre-processing. The default
        implementation does nothing; `build` calls this once before walking
        the tree. """

    # NOTE(review): abc.abstractproperty is deprecated on Python 3, but this
    # module uses six, so it presumably still targets Python 2 -- confirm
    # before switching to stacked @property/@abc.abstractmethod.
    @abc.abstractproperty
    def shorthand(self):
        """Unique identifier for this layer"""
        raise NotImplementedError()

    @abc.abstractmethod
    def process(self, node):
        """ Construct the element of the layer relevant to processing the given
        node, so it returns (paragraph_id, layer_content) or None if there is
        no relevant information. """
        raise NotImplementedError()

    def builder(self, node, cache=None):
        """Recursively process `node` and all of its descendants, recording
        each truthy result in `self.layer` under the node's `label_id()`.

        If a truthy `cache` is given, results come from
        `cache.fetch_or_process(self, node)`; otherwise `self.process(node)`
        is called directly. The same `cache` is passed down to children."""
        if cache:
            layer_element = cache.fetch_or_process(self, node)
        else:
            layer_element = self.process(node)
        # Falsy results (None, empty containers, ...) are not recorded
        if layer_element:
            self.layer[node.label_id()] = layer_element

        for child in node.children:
            self.builder(child, cache)

    def build(self, cache=None):
        """Run `pre_process`, then walk the whole tree from `self.tree` with
        `builder`. Returns `self.layer`, the dict of collected results."""
        self.pre_process()
        self.builder(self.tree, cache)
        return self.layer

    @staticmethod
    def convert_to_search_replace(matches, text, start_fn, end_fn):
        """We'll often have a bunch of text matches based on offsets. To use
        the "search-replace" encoding (which is a bit more resilient to minor
        variations in text), we need to convert these offsets into "locations"
        -- i.e. of all of the instances of a string in this text, which should
        be matched. Yields `SearchReplace` tuples, ordered by matched text.

        `start_fn` and `end_fn` are called with a single match and return its
        start and end offsets within `text`."""
        # Group the matches by the exact substring they cover
        text_to_matches = defaultdict(list)
        for match in matches:
            text_to_matches[text[start_fn(match):end_fn(match)]].append(match)

        for match_text, group in sorted(text_to_matches.items()):
            # Compute the start offsets once per group rather than rescanning
            # every match for each occurrence of the text
            starts = {start_fn(match) for match in group}
            locations = []
            location = 0
            idx = text.find(match_text)
            while idx != -1:
                if idx in starts:
                    locations.append(location)
                location += 1
                # Advance by one character only, so overlapping occurrences
                # are counted as separate locations
                idx = text.find(match_text, idx + 1)

            yield SearchReplace(match_text, locations,
                                representative=group[0])