Source code for regparser.tree.xml_parser.import_category

import logging
import re
from copy import deepcopy

from regparser.tree.depth import markers as mtypes
from regparser.tree.paragraph import hash_for_paragraph
from regparser.tree.struct import Node
from regparser.tree.xml_parser import paragraph_processor, tree_utils


class ImportCategoryMatcher(paragraph_processor.BaseMatcher):
    """The IMPORTCATEGORY gets converted into a subtree with an appropriate
    title and unique paragraph marker"""
    # Applied with ``.match``, so the header text must *start* with
    # "category"/"categories" (any letter case), then a single space and a
    # run of the characters i/v/x. Only that first run is captured as
    # ``category``; whatever follows on the line is ignored.
    CATEGORY_RE = re.compile(r'categor(y|ies) (?P<category>[ivx]+).*',
                             re.IGNORECASE)

    def matches(self, xml):
        """Return True when `xml` is an ``IMPORTCATEGORY`` element."""
        return xml.tag == 'IMPORTCATEGORY'

    def derive_nodes(self, xml, processor=None):
        """Finds and deletes the category header before recursing. Adds this
        header as a title.

        :param xml: the ``IMPORTCATEGORY`` element. It is deep-copied first,
            so the caller's tree is left untouched. The first ``./HD`` child
            is looked up with ``[0]``, so an element without one fails on
            that lookup.
        :param processor: object whose ``process(xml, node)`` builds the
            subtree. Despite the ``None`` default it is dereferenced
            unconditionally, so callers must supply it.
        :returns: a one-element list holding the result of
            ``processor.process``.
        """
        xml = deepcopy(xml)     # we'll be modifying this
        header = xml.xpath('./HD')[0]
        # Detach the header so it is not processed again as body content;
        # its text lives on as the node's title instead.
        xml.remove(header)
        header_text = tree_utils.get_node_text(header)

        node = Node(title=header_text, label=[self.marker(header_text)])
        return [processor.process(xml, node)]

    @classmethod
    def marker(cls, header_text):
        """Derive a unique, repeatable identifier for this subtree. This
        allows the same category to be reordered (e.g. if a note has been
        added), or a header with multiple reserved categories to be split
        (which would also re-order the categories that followed)

        :param header_text: text of the category header.
        :returns: ``'p'`` followed by ``hash_for_paragraph`` of the numeral
            captured by ``CATEGORY_RE``; when the header does not match, a
            warning is logged and ``mtypes.MARKERLESS`` is returned.
        """
        match = cls.CATEGORY_RE.match(header_text)
        if match:
            return 'p{0}'.format(hash_for_paragraph(match.group('category')))
        logging.warning("Couldn't derive category: %s", header_text)
        return mtypes.MARKERLESS