Source code for regparser.tree.xml_parser.us_code

import re

import six

from regparser.tree.depth import markers as mtypes
from regparser.tree.depth import optional_rules
from regparser.tree.struct import Node
from regparser.tree.xml_parser import paragraph_processor, tree_utils


[docs]class USCodeParagraphMatcher(paragraph_processor.BaseMatcher):
    """Convert a paragraph found in the US Code into appropriate Nodes"""
    _MARKER_RE = re.compile(r'\((?P<marker>[a-z]+|[A-Z]+|[0-9]+)\)')

[docs]    def matches(self, xml):
        return xml.tag == 'P'

[docs]    def paragraph_markers(self, text):
        """We can't use tree_utils.get_paragraph_markers as that makes
        assumptions about the order of paragraph markers (specifically
        that the markers will match the order found in regulations). This is
        simpler, looking only at multiple markers at the beginning of the
        paragraph"""
        markers = []
        match = self._MARKER_RE.match(text)
        while match:
            markers.append(match.group('marker'))
            text = text[match.end():].strip()
            match = self._MARKER_RE.match(text)
        return markers

[docs]    def derive_nodes(self, xml, processor=None):
        nodes = []
        text = tree_utils.get_node_text(xml).strip()
        tagged_text = tree_utils.get_node_text_tags_preserved(xml).strip()
        markers_list = self.paragraph_markers(text)
        with_parens = ['({0})'.format(m) for m in markers_list]
        triplets = zip(markers_list,
                       tree_utils.split_text(text, with_parens),
                       tree_utils.split_text(tagged_text, with_parens))
        for m, text, tagged_text in triplets:
            nodes.append(Node(
                text=text.strip(), label=[m], source_xml=xml,
                tagged_text=six.text_type(tagged_text.strip())
            ))
        return nodes


[docs]class USCodeProcessor(paragraph_processor.ParagraphProcessor):
    """ParagraphProcessor which converts a chunk of XML into Nodes. Only
    processes P nodes and limits the type of paragraph markers to those found
    in US Code"""
    MATCHERS = [USCodeParagraphMatcher()]

[docs]    def additional_constraints(self):
        return [optional_rules.limit_sequence_gap(),
                optional_rules.limit_paragraph_types(
                    mtypes.lower, mtypes.ints, mtypes.upper, mtypes.roman,
                    mtypes.upper_roman)]


[docs]class USCodeMatcher(paragraph_processor.BaseMatcher):
    """Matches a custom `USCODE` tag and parses it's contents with the
    USCodeProcessor. Does not use a custom node type at the moment"""
[docs]    def matches(self, xml):
        return xml.tag == 'USCODE'

[docs]    def derive_nodes(self, xml, processor=None):
        processor = USCodeProcessor()
        node = Node(label=[mtypes.MARKERLESS], source_xml=xml)
        return [processor.process(xml, node)]