Source code for regparser.tree.xml_parser.tree_utils

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from copy import deepcopy
from functools import wraps
from itertools import chain

from lxml import etree
from six.moves.html_parser import HTMLParser

from regparser.tree.priority_stack import PriorityStack


def prepend_parts(parts_prefix, n):
    """Recursively prepend ``parts_prefix`` to the label of node ``n`` and
    of every node below it.

    A label is a list of markers that indicates where you are in the
    regulation text. The nodes are modified in place.

    :param parts_prefix: list of markers to put in front of each label
    :param n: node exposing ``label`` (a list) and ``children`` (an
        iterable of nodes of the same kind)
    :returns: ``n`` itself, so the call can be used inside an expression
    """
    n.label = parts_prefix + n.label

    for child in n.children:
        prepend_parts(parts_prefix, child)
    return n


class NodeStack(PriorityStack):
    """ The NodeStack aids our construction of a struct.Node tree. We process
    xml one paragraph at a time; using a priority stack allows us to insert
    items at their proper depth and unwind the stack (collecting children) as
    necessary.

    Every stack entry is indexed with ``[1]`` to reach the node, so entries
    are expected to be pairs whose second item is the node.
    NOTE(review): the first item is presumably the depth/marker used by
    PriorityStack -- confirm against PriorityStack."""

    def unwind(self):
        """ Unwind the stack, collapsing sub-paragraphs that are on the stack
        into the children of the previous level.

        The popped nodes get the parent's label prepended (recursively) and
        then replace the parent's ``children`` list. """
        popped = self.pop()
        # Look the parent up only after the pop: it is whatever entry is
        # last on the stack once the deepest level has been removed.
        parent = self.peek_last()[1]
        parent.children = [
            prepend_parts(parent.label, entry[1]) for entry in popped]

    def collapse(self):
        """After all of the nodes have been inserted at their proper levels,
        collapse them into a single root node.

        :returns: the node of the one entry left on the stack"""
        while self.size() > 1:
            self.unwind()
        return self.peek_last()[1]


def split_text(text, tokens):
    """
        Given a body of text that contains tokens,
        splice the text along those tokens.

        Tokens are located in order, each search starting where the previous
        token was found, so an earlier stray occurrence of a later token
        cannot produce out-of-order slices. A token which cannot be found
        yields an empty segment, which keeps the result aligned with
        ``tokens``.

        :param text: the string to split
        :param tokens: substrings, in the order they are expected to appear
        :returns: list of consecutive slices of ``text`` that concatenate
            back to ``text``. If the first token does not start the text,
            the leading text is returned as an extra first segment.
    """
    text_end = len(text)

    # Forward pass: locate each token at or after the previous match.
    starts = []
    cursor = 0
    for token in tokens:
        position = text.find(token, cursor)
        if position == -1:
            starts.append(None)     # resolved in the backward pass
        else:
            starts.append(position)
            cursor = position

    # Backward pass: a missing token starts where the next located token
    # starts (or at the end of the text), i.e. it gets an empty segment.
    following = text_end
    for idx in range(len(starts) - 1, -1, -1):
        if starts[idx] is None:
            starts[idx] = following
        else:
            following = starts[idx]

    if not starts or starts[0] != 0:
        starts.insert(0, 0)

    ends = starts[1:] + [text_end]
    return [text[begin:end] for begin, end in zip(starts, ends)]


def _combine_with_space(prev_text, next_text, add_space_if_needed):
    """Join two fragments of XML text, inserting a single space between
    them when requested and when the characters at the seam call for one.

    ``None`` fragments are treated as empty strings. No space is added when
    either side already has whitespace at the seam, when ``next_text`` is
    empty, when ``prev_text`` is empty or ends with an opening/joining
    character, or when ``next_text`` starts with closing punctuation or a
    joining character."""
    left = prev_text or ""
    right = next_text or ""
    joined = left + right

    if not add_space_if_needed:
        return joined

    seam_left = left[-1:]
    seam_right = right[:1]

    # Nothing follows, so there is nothing to separate.
    if not seam_right:
        return joined
    # Whitespace is already present on one side of the seam.
    if seam_left.isspace() or seam_right.isspace():
        return joined
    # An empty seam_left is "in" any string, so an empty left side also
    # takes this branch.
    if seam_left in u'([/<—-':
        return joined
    if seam_right in u').;,]>/—-':
        return joined

    return left + " " + right


def replace_xml_node_with_text(node, text):
    """Remove ``node`` from its parent and leave ``text`` in its place.

    There are some complications w/ lxml when determining where to add the
    replacement text: text that follows an element is stored on that
    element's ``tail``, while text that precedes the first child is stored
    on the parent's ``text``. Account for all of that here.

    The node's own ``tail`` is not carried over; callers that need it must
    include it in ``text``.

    :param node: element to remove; it must have a parent
    :param text: replacement string
    """
    parent = node.getparent()
    previous = node.getprevious()
    if previous is None:
        # First child: the replacement continues the parent's leading text.
        parent.text = (parent.text or '') + text
    else:
        # Otherwise it continues whatever followed the previous sibling.
        previous.tail = (previous.tail or '') + text
    parent.remove(node)


def replace_xpath(xpath):
    """Decorator to convert all elements matching the provided xpath in to
    plain text. This'll convert the wrapped function into a new function which
    will search for the provided xpath and replace all matches.

    The decorated function takes a single matching element and returns its
    plain-text rendering. The resulting function has the signature
    ``(node, add_spaces)``, modifies ``node`` in place and returns ``None``.

    :param xpath: XPath expression evaluated against the node passed to the
        resulting function
    """
    def decorator(fn):
        @wraps(fn)
        def wrapped(node, add_spaces):
            for element in node.xpath(xpath):
                replacement = fn(element)
                # Removing the element drops its tail, so fold the tail
                # into the replacement text first.
                replacement = _combine_with_space(
                    replacement, element.tail, add_spaces)
                replace_xml_node_with_text(element, replacement)
        return wrapped
    return decorator


@replace_xpath(".//E[@T='52' or @T='54']")
def subscript_to_plaintext(element):
    """Render a subscript element (``E`` with ``T`` of 52 or 54) as
    ``_{text}``. An element without text renders as ``_{}`` rather than
    the literal string "None"."""
    return "_{{{0}}}".format(element.text or '')


@replace_xpath(".//E[@T='51' or @T='53']|.//SU[not(@footnote)]")
def superscript_to_plaintext(element):
    """Render a superscript element (``E`` with ``T`` of 51 or 53, or an
    ``SU`` without a ``footnote`` attribute) as ``^{text}``. An element
    without text renders as ``^{}`` rather than the literal string
    "None"."""
    return "^{{{0}}}".format(element.text or '')


@replace_xpath(".//SU[@footnote]")
def footnotes_to_plaintext(element):
    """Render an ``SU`` element carrying a ``footnote`` attribute as
    ``[^reference](footnote text)``.

    Parentheses inside the footnote text are backslash-escaped so they do
    not clash with the surrounding pair. An element without text yields an
    empty reference rather than the literal string "None"."""
    footnote = element.attrib['footnote']
    footnote = footnote.replace('(', r'\(').replace(')', r'\)')
    return u"[^{0}]({1})".format(element.text or '', footnote)


def get_node_text(node, add_spaces=False):
    """ Extract all the text from an XML node (including the text of its
    children).

    Works on a copy, so the node passed in is left untouched. Subscripts,
    superscripts and footnotes are first converted to their plain-text
    forms; the node's own text is then joined with the ``text`` and ``tail``
    of each direct child.

    :param node: the XML element to read
    :param add_spaces: when true, insert a space between adjacent fragments
        wherever ``_combine_with_space`` decides one is needed
    :returns: the combined text with surrounding whitespace stripped
    """
    node = deepcopy(node)
    subscript_to_plaintext(node, add_spaces)
    superscript_to_plaintext(node, add_spaces)
    footnotes_to_plaintext(node, add_spaces)

    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()``.
    parts = [node.text]
    parts.extend(
        chain.from_iterable((child.text, child.tail) for child in node))

    final_text = ''
    for part in parts:
        if part:    # skip None and empty fragments
            final_text = _combine_with_space(final_text, part, add_spaces)
    return final_text.strip()


# Tag names handed to etree.strip_tags by get_node_text_tags_preserved, so
# these tags never appear in the markup that function returns.
_tag_black_list = ('PRTPAGE', )


def get_node_text_tags_preserved(xml_node):
    """Get the body of an XML node as a string, avoiding a specific blacklist
    of bad tags.

    Works on a copy, so the node passed in is left untouched. The wrapping
    tag itself is not part of the result; child elements are serialized with
    their markup, and character/entity references in the result are then
    unescaped.

    :param xml_node: the XML element whose contents are wanted
    :returns: the node's inner markup as a unicode string
    """
    # ``HTMLParser.unescape`` was removed in Python 3.9; prefer the
    # module-level function and only fall back where ``html`` is missing
    # (Python 2).
    try:
        from html import unescape
    except ImportError:
        unescape = HTMLParser().unescape

    xml_node = deepcopy(xml_node)
    etree.strip_tags(xml_node, *_tag_black_list)

    # Remove the wrapping tag
    pieces = [xml_node.text or '']
    pieces.extend(etree.tounicode(child) for child in xml_node)

    return unescape(''.join(pieces))