# Source code for regparser.tree.xml_parser.simple_hierarchy_processor

import re

import six

from regparser.tree.depth import markers as mtypes
from regparser.tree.depth import optional_rules
from regparser.tree.struct import Node
from regparser.tree.xml_parser import paragraph_processor, tree_utils


class DepthParagraphMatcher(paragraph_processor.BaseMatcher):
    """Convert a paragraph with an optional prefixing paragraph marker into an
    appropriate node. Does not know about collapsed markers nor most types of
    nodes."""
    # A marker is one lower-case letter, one to five characters drawn from
    # i/v/x, or one or two digits.
    _MARKER_STR = r'(?P<marker>[a-z]|[ivx]{1,5}|\d{1,2})'
    # Marker wrapped in parentheses, e.g. "(a)" or "(iv)".
    _PAREN_REGEX = re.compile(r'\({0}\)'.format(_MARKER_STR))
    # Marker followed by a period, e.g. "a." or "12.".
    _PERIOD_REGEX = re.compile(r'{0}\.'.format(_MARKER_STR))

    def matches(self, xml):
        """Return True when the element's tag is exactly 'P'."""
        return xml.tag == 'P'

    def derive_nodes(self, xml, processor=None):
        """Build a single Node from the paragraph element `xml`.

        The node's text is the stripped result of tree_utils.get_node_text;
        its tagged_text is the stripped, tag-preserving variant converted
        with six.text_type. The label is the marker found at the very start
        of the text, or mtypes.MARKERLESS when no marker is found there.
        `processor` is accepted but not used by this method.

        Returns a one-element list containing the node."""
        text = tree_utils.get_node_text(xml).strip()
        node = Node(text=text, source_xml=xml)
        node.tagged_text = six.text_type(
            tree_utils.get_node_text_tags_preserved(xml).strip())

        # Text opening with a parenthesis is checked for "(marker)"; anything
        # else is checked for "marker.".
        if text.startswith('('):
            regex = self._PAREN_REGEX
        else:
            regex = self._PERIOD_REGEX

        # `match` anchors at the beginning of the text, so only a leading
        # marker counts.
        match = regex.match(text)
        if match:
            node.label = [match.group('marker')]
        else:
            node.label = [mtypes.MARKERLESS]

        return [node]


class SimpleHierarchyProcessor(paragraph_processor.ParagraphProcessor):
    """ParagraphProcessor which attempts to pull out whatever paragraph marker
    is available and derive a hierarchy from that."""
    # The only matcher used by this processor.
    MATCHERS = [DepthParagraphMatcher()]

    def additional_constraints(self):
        """Return a one-element list holding the constraint produced by
        optional_rules.limit_paragraph_types for the lower-case, integer,
        roman and markerless marker types."""
        allowed_types = (
            mtypes.lower, mtypes.ints, mtypes.roman, mtypes.markerless)
        return [optional_rules.limit_paragraph_types(*allowed_types)]


class SimpleHierarchyMatcher(paragraph_processor.BaseMatcher):
    """Detects tags passed to it on init and converts the contents of any
    matches into a hierarchy based on the SimpleHierarchyProcessor. Sets the
    node_type of the subtree's root"""
    def __init__(self, tags, node_type):
        """Store the tag names to match (copied into a list) and the
        node_type to give each derived root node."""
        self.tags = list(tags)
        self.node_type = node_type

    def matches(self, xml):
        """Return True when the element's tag is one of the configured
        tags."""
        return xml.tag in self.tags

    def derive_nodes(self, xml, processor=None):
        """Process `xml` with a new SimpleHierarchyProcessor, using a
        markerless root node of this matcher's node_type.

        The `processor` argument is ignored: a fresh SimpleHierarchyProcessor
        is always created for the work.

        Returns a one-element list containing the result of processing."""
        hierarchy_processor = SimpleHierarchyProcessor()
        root = Node(label=[mtypes.MARKERLESS], source_xml=xml,
                    node_type=self.node_type)
        return [hierarchy_processor.process(xml, root)]