Source code for regparser.tree.xml_parser.import_category

import logging
import re
from copy import deepcopy

from regparser.tree.depth import markers as mtypes
from regparser.tree.paragraph import hash_for_paragraph
from regparser.tree.struct import Node
from regparser.tree.xml_parser import paragraph_processor, tree_utils


class ImportCategoryMatcher(paragraph_processor.BaseMatcher):
    """Matcher that converts an IMPORTCATEGORY element into a subtree.

    The root of the subtree is titled with the category header text and is
    labelled with a unique, repeatable paragraph marker derived from that
    header (see :meth:`marker`).
    """

    # Anchored at the start of the header (used with ``.match``); captures the
    # first run of roman-numeral characters following "category"/"categories".
    CATEGORY_RE = re.compile(
        r'categor(y|ies) (?P<category>[ivx]+).*', re.IGNORECASE)

    def matches(self, xml):
        """Return True when ``xml`` is an IMPORTCATEGORY element."""
        return xml.tag == 'IMPORTCATEGORY'

    def derive_nodes(self, xml, processor=None):
        """Strip the category header, then recurse into what remains.

        The first ``HD`` child is removed from a copy of ``xml`` and its text
        becomes the title of the new root node; the remaining content is
        handed to ``processor.process`` along with that root.

        :param xml: the IMPORTCATEGORY element; it is not modified.
        :param processor: object whose ``process(xml, node)`` builds the
            subtree. Although it defaults to None, it is dereferenced
            unconditionally, so a real processor must be supplied.
        :returns: a single-element list holding the processed subtree.
        :raises IndexError: if ``xml`` has no direct ``HD`` child.
        """
        # Work on a copy: the header is about to be removed and the caller's
        # tree should stay intact.
        working_copy = deepcopy(xml)
        heading = working_copy.xpath('./HD')[0]
        working_copy.remove(heading)

        title = tree_utils.get_node_text(heading)
        root = Node(title=title, label=[self.marker(title)])
        return [processor.process(working_copy, root)]

    @classmethod
    def marker(cls, header_text):
        """Build a unique, repeatable identifier for a category subtree.

        Deriving the label from the category numeral rather than from its
        position lets the same category be reordered (e.g. when a note is
        added), and lets a header covering multiple reserved categories be
        split (which would also re-order the categories that follow).

        :param header_text: text of the category header.
        :returns: ``'p'`` followed by ``hash_for_paragraph`` of the captured
            category numeral when the header matches ``CATEGORY_RE``;
            otherwise ``mtypes.MARKERLESS`` (a warning is logged).
        """
        found = cls.CATEGORY_RE.match(header_text)
        if not found:
            logging.warning("Couldn't derive category: %s", header_text)
            return mtypes.MARKERLESS

        category = found.group('category')
        return 'p{0}'.format(hash_for_paragraph(category))