# Source code for regparser.layer.formatting

"""Finds and abstracts formatting information from the regulation tree. In
many ways, this is like a markdown parser."""
import abc
import re
from collections import OrderedDict

import six
from lxml import etree

from regparser.layer.layer import Layer
from regparser.tree import struct
from regparser.tree.priority_stack import PriorityStack
from regparser.tree.xml_parser import tree_utils


class HeaderStack(PriorityStack):
    """Used to determine Table Headers -- indeed, they are complicated
    enough to warrant their own stack"""

    def unwind(self):
        """Take the entries returned by ``pop()`` and store their nodes as
        the ``children`` of the node returned by ``peek_last()``."""
        # Entries are pairs; the header node is the second element of each
        popped_nodes = [entry[1] for entry in self.pop()]
        self.peek_last()[1].children = popped_nodes


class TableHeaderNode(object):
    """Represents a cell in a table's header"""

    def __init__(self, text, level):
        """Store the cell's ``text`` and ``level``; the node starts out
        without children."""
        self.text = text
        self.level = level
        self.children = []

    def height(self):
        """Number of levels in the subtree rooted at this node (1 when the
        node has no children)."""
        # The leading 0 keeps max() well-defined for a childless node
        tallest_child = max([0] + [child.height() for child in self.children])
        return tallest_child + 1

    def width(self):
        """Number of leaf cells below this node; a childless node counts as
        a single cell."""
        if self.children:
            return sum(child.width() for child in self.children)
        return 1


def build_header(xml_nodes):
    """Builds a TableHeaderNode tree, with an empty root. Each node in the tree
    includes its colspan/rowspan

    :param xml_nodes: iterable of XML elements; each must carry an ``H``
        attribute, which is parsed as the integer level of the header cell
    :returns: the root TableHeaderNode (text ``None``, level 0) with
        ``colspan`` and ``rowspan`` set on the tree
    :raises KeyError: if an element has no ``H`` attribute
    :raises ValueError: if an ``H`` attribute is not an integer
    """
    stack = HeaderStack()
    stack.add(0, TableHeaderNode(None, 0))  # Root

    for xml_node in xml_nodes:
        level = int(xml_node.attrib['H'])
        text = tree_utils.get_node_text(xml_node, add_spaces=True).strip()
        stack.add(level, TableHeaderNode(text, level))

    # Collapse the stack until only the root's level remains
    while stack.size() > 1:
        stack.unwind()
    root = stack.m_stack[0][0][1]

    # Must be measured on the complete tree, before rowspans are derived
    max_height = root.height()

    def set_colspan(header_node):
        header_node.colspan = header_node.width()
    struct.walk(root, set_colspan)

    return build_header_rowspans(root, max_height)


def build_header_rowspans(tree_root, max_height):
    """
    The following table is an example of why we need a relatively complicated
    approach to setting rowspan:

    |R1C1     |R1C2               |
    |R2C1|R2C2|R2C3     |R2C4     |
    |    |    |R3C1|R3C2|R3C3|R3C4|

    If we set the rowspan of each node to::

        max_height - node.height() - node.level + 1

    R1C1 will end up with a rowspan of 2 instead of 1, because of difficulties
    handling the implicit rowspans for R2C1 and R2C2.

    Instead, we generate a list of the paths to each leaf and then set
    rowspan based on that.

    Rowspan for leaves is ``max_height - node.height() - node.level + 1``, and
    for root is simply 1. Other nodes' rowspans are set to the level of the
    node after them minus their own level.

    :param tree_root: root of the header tree; every node needs ``children``,
        ``level`` and a ``height()`` method
    :param max_height: height of the whole tree, used for the leaf formula
    :returns: ``tree_root``, after ``rowspan`` has been set on every node
    """
    leaf_paths = []

    def collect_paths(node, ancestors):
        # A new list per node, so sibling branches never share a path object
        path_here = ancestors + [node]
        if node.children:
            for child in node.children:
                collect_paths(child, path_here)
        else:
            leaf_paths.append(path_here)
    collect_paths(tree_root, [])

    for path in leaf_paths:
        leaf_index = len(path) - 1
        for idx, node in enumerate(path):
            if idx == 0:  # root (checked first, so a lone root gets 1)
                node.rowspan = 1
            elif idx == leaf_index:  # leaves
                node.rowspan = max_height - node.height() - node.level + 1
            else:  # intermediate nodes
                node.rowspan = path[idx + 1].level - node.level

    return tree_root


def table_xml_to_plaintext(xml_node):
    """Markdown representation of a table. Note that this doesn't account
    for all the options needed to display the table properly, but works fine
    for simple tables. This gets included in the reg plain text

    :param xml_node: table element; ``./BOXHD/CHED`` and ``./TTITLE`` supply
        the header cells, ``./ROW`` elements (with ``./ENT`` cells) the rows
    :returns: one ``|``-delimited line per row, joined with newlines; the
        header line is followed by a ``---`` divider line
    """
    def cell_text(element):
        return tree_utils.get_node_text(element, add_spaces=True).strip()

    header = [cell_text(hd) for hd in xml_node.xpath('./BOXHD/CHED|./TTITLE')]
    divider = ['---'] * len(header)
    rows = [[cell_text(td) for td in tr.xpath('./ENT')]
            for tr in xml_node.xpath('./ROW')]

    lines = ['|' + '|'.join(row) + '|' for row in [header, divider] + rows]
    return '\n'.join(lines)


def table_xml_to_data(xml_node):
    """Construct a data structure of the table data. We provide a different
    structure than the native XML as the XML encodes too much logic. This
    structure can be used to generate semi-complex tables which could not be
    generated from the markdown above

    :param xml_node: table element; header cells come from ``./BOXHD/CHED``,
        rows from ``./ROW`` (cells ``./ENT``), the caption from ``./TTITLE``
    :returns: dict with ``'header'`` (one list per header level, each holding
        ``{'text', 'colspan', 'rowspan'}`` dicts), ``'rows'`` (a list of
        lists of cell text) and, only when a ``TTITLE`` is present,
        ``'caption'``
    """
    header_root = build_header(xml_node.xpath('./BOXHD/CHED'))
    # One bucket per tree level; index 0 is the synthetic root's level
    header = [[] for _ in range(header_root.height())]

    def per_node(node):
        header[node.level].append({'text': node.text,
                                   'colspan': node.colspan,
                                   'rowspan': node.rowspan})
    struct.walk(header_root, per_node)
    header = header[1:]     # skip the root

    rows = [[tree_utils.get_node_text(td, add_spaces=True).strip()
             for td in row.xpath('./ENT')]
            for row in xml_node.xpath('./ROW')]

    table_data = {'header': header, 'rows': rows}

    caption_nodes = xml_node.xpath('./TTITLE')
    if caption_nodes:
        # Only the first TTITLE is used
        table_data["caption"] = tree_utils.get_node_text(
            caption_nodes[0]).strip()

    return table_data


class PlaintextFormatData(six.with_metaclass(abc.ABCMeta)):
    """Base class for formatting information which can be derived from the
    plaintext of a regulation node"""
    @abc.abstractproperty
    def REGEX(self):    # noqa - this is a property
        """Regular expression used to find matches in the plain text"""
        raise NotImplementedError()

    @abc.abstractmethod
    def match_data(self, match):
        """Derive data structure (as a dict) from the regex match"""
        raise NotImplementedError()

    def process(self, text):
        """Find all matches of self.REGEX, transform them into the appropriate
        data structure, and yield them one at a time (this is a generator).

        Matches are grouped by their full matched text. Each yielded dict has
        ``'text'`` (the matched text), ``'locations'`` (``0..count-1`` for the
        number of times that text matched) and whatever ``match_data``
        returns for the last match with that text."""
        # matched text -> (most recent match object, count); ordered so that
        # results come out in order of first appearance
        seen = OrderedDict()
        for match in self.REGEX.finditer(text):
            matched_text = match.group(0)
            _, count = seen.get(matched_text, (None, 0))
            seen[matched_text] = (match, count + 1)

        for match, count in seen.values():
            data = {'text': match.group(0),
                    'locations': list(range(count))}
            data.update(self.match_data(match))
            yield data


class FencedData(PlaintextFormatData):
    """E.g.
        ```note
        Line 1
        Line 2
        ```
    """
    # NOTE(review): the ``\w*`` after the type group looks like it may have
    # been meant as ``\s*`` -- confirm before changing; it is left as-is here
    REGEX = re.compile(r"```(?P<type>[a-zA-Z0-9 ]+)\w*\n"
                       r"(?P<lines>([^\n]*\n)+)"
                       r"```")

    def match_data(self, match):
        """Return the fence type and its non-empty lines under
        ``'fence_data'``."""
        fenced_lines = [line for line in match.group('lines').split("\n")
                        if line]
        return {'fence_data': {
            'type': match.group('type'),
            'lines': fenced_lines
        }}


class Subscript(PlaintextFormatData):
    """E.g.     a_{0}"""
    REGEX = re.compile(r"_\{(?P<subscript>[^\}]+)\}")

    def match_data(self, match):
        """Return the text between the braces under ``'subscript_data'``."""
        subscript = match.group('subscript')
        return {'subscript_data': {'subscript': subscript}}


class Superscript(PlaintextFormatData):
    """E.g.     x^{2}"""
    REGEX = re.compile(r"\^\{(?P<superscript>[^\}]+)\}")

    def match_data(self, match):
        """Return the text between the braces under
        ``'superscript_data'``."""
        superscript = match.group('superscript')
        return {'superscript_data': {'superscript': superscript}}


class Dashes(PlaintextFormatData):
    """E.g.     Some text some text_____"""
    # The leading ``.*`` is greedy, so when more than five underscores end
    # the text only the final five land in ``dashes``; any extra underscores
    # are captured as part of ``text``.
    REGEX = re.compile(r"(?P<text>.*)(?P<dashes>_{5,})$")

    def match_data(self, match):
        """Return the text preceding the trailing dashes under
        ``'dash_data'``."""
        leading_text = match.group('text')
        return {'dash_data': {'text': leading_text}}


class Footnotes(PlaintextFormatData):
    """E.g.     [^4](Contents of footnote)
       The footnote may also contain parens if they are escaped with a
       backslash"""
    # Note: we don't want to use \(\) in the example in the docstring as we'd
    # need to double-escape or mark the docstring as raw.

    _ref_regex = r"\[\^(?P<ref>[^\]]*)\]"   # [^\]]* = take until hitting a ]
    _begin_note_regex = r"\((?P<note>.*?)"
    _close_paren = r"(?<!\\)\)"     # neg lookbehind for skipping escaped \)
    REGEX = re.compile(_ref_regex + _begin_note_regex + _close_paren)

    def match_data(self, match):
        """Return the footnote's reference and its note text (with escaped
        parens un-escaped) under ``'footnote_data'``."""
        # Un-escape parens
        note = match.group('note')
        note = note.replace(r'\(', '(').replace(r'\)', ')')
        return {'footnote_data': {'ref': match.group('ref'), 'note': note}}


def node_to_table_xml_els(node):
    """Search in a few places for GPOTABLE xml elements

    :param node: object with a ``source_xml`` attribute; when that is
        ``None``, its ``tagged_text`` string is parsed instead
    :returns: the result of an xpath query that matches the root element
        itself (if it is a GPOTABLE) and any GPOTABLE descendants
    """
    root_xml_el = node.source_xml
    if root_xml_el is None:
        # tagged_text isn't quite XML -- it's often a fragment with unescaped
        # characters. Clean it up before searching it
        tagged_text = node.tagged_text.replace('&', '&amp;')
        # Wrap in a single root so that the fragment can be parsed
        wrapped = u'<ROOT>{0}</ROOT>'.format(tagged_text)
        root_xml_el = etree.fromstring(wrapped)

    return root_xml_el.xpath('self::GPOTABLE|.//GPOTABLE')


class Formatting(Layer):
    """Layer responsible for tables, subscripts, and other formatting-related
    information"""
    shorthand = 'formatting'

    def process(self, node):
        """Collect the formatting entries for ``node``.

        Table entries (one per GPOTABLE element found for the node) come
        first, followed by the entries produced by each direct subclass of
        PlaintextFormatData when run over ``node.text``.

        :returns: the list of entries, or ``None`` when there are none
        """
        layer_el = [{'text': table_xml_to_plaintext(table_el),
                     'locations': [0],
                     'table_data': table_xml_to_data(table_el)}
                    for table_el in node_to_table_xml_els(node)]

        # __subclasses__() lists direct subclasses only
        for finder_class in PlaintextFormatData.__subclasses__():
            layer_el.extend(finder_class().process(node.text))

        if layer_el:
            return layer_el
        return None