Source code for regparser.notice.changes

""" This module contains functions to help parse the changes in a notice.
Changes are the exact details of how the pargraphs, sections etc. in a
regulation have changed.  """

import copy
import logging
from collections import OrderedDict, namedtuple

from regparser.grammar import amdpar
from regparser.grammar.tokens import Verb
from regparser.layer.paragraph_markers import marker_of
from regparser.tree import struct
from regparser.tree.paragraph import p_levels

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# A single change: the label id it applies to, paired with its content (the
# functions in this module fill `content` with a dict describing the change).
Change = namedtuple('Change', ['label_id', 'content'])


def node_to_dict(node):
    """ Build a plain-dict view of `node`.

    Every instance attribute is carried over except `children` and
    `source_xml`. Children are represented by their label ids in a
    `child_labels` attribute, which is computed and stored on the node itself
    the first time it is found to be missing. """
    if not hasattr(node, 'child_labels'):
        node.child_labels = [kid.label_id() for kid in node.children]

    omitted = ('children', 'source_xml')
    return {attr: value for attr, value in vars(node).items()
            if attr not in omitted}


def bad_label(node):
    """ Return True when a regtext node's label is malformed.

    Only nodes whose type is `struct.Node.REGTEXT` are inspected; any other
    node type is never reported as bad. The first two label components must
    be digits, and each later component must appear in the `p_levels` entry
    for its depth. """
    if node.node_type != struct.Node.REGTEXT:
        return False

    for depth, part in enumerate(node.label):
        if depth < 2:
            # The two leading components must be numeric
            if not part.isdigit():
                return True
        elif part not in p_levels[depth - 2]:
            # Paragraph markers must be valid for their nesting depth
            return True
    return False


def impossible_label(n, amended_labels):
    """ Return True if n is not in the same family as amended_labels.

    A node is in the same family as an amended label when the node's label
    id starts with that label. The prefix test itself decides the outcome;
    the truthiness of the matching label plays no part, so an empty-string
    label that matches still counts as a match. """
    node_label = n.label_id()
    return not any(node_label.startswith(label) for label in amended_labels)


def find_candidate(root, label_last, amended_labels):
    """
        Search the tree under `root` for nodes whose final label component
        equals `label_last`; such a node might be mis-parsed. Because we're
        parsing partial sections in the notices, paragraph markers can't
        always be disambiguated.

        When several nodes match, three tie-breakers are tried in order:
        nodes with a malformed label, nodes whose label can't exist given
        `amended_labels`, and nodes without children. The first of these
        groups holding exactly one node is returned; otherwise every match is
        returned. The result is always a list.
    """
    def matches_marker(node):
        """ Keep nodes whose last label component is the one we want."""
        if node.label[-1] == label_last:
            return node

    matched = struct.walk(root, matches_marker)
    if len(matched) <= 1:
        return matched

    malformed = [n for n in matched if bad_label(n)]
    out_of_scope = [n for n in matched
                    if impossible_label(n, amended_labels)]
    childless = [n for n in matched if n.children == []]

    # The first group that singles out one node wins
    for group in (malformed, out_of_scope, childless):
        if len(group) == 1:
            return group
    return matched


def resolve_candidates(amend_map, warn=True):
    """ Settle every change flagged as a candidate in `amend_map`.

    If the candidate node's own label id is not a key of `amend_map`, the
    node is relabelled to the label it was matched against. Otherwise the
    node is already accounted for elsewhere, so the entry for the matched
    label is removed from `amend_map` (logging a warning when `warn` is
    set). `amend_map` and the candidate nodes are modified in place. """
    # Iterate over a snapshot as entries may be deleted along the way
    for label, changes in list(amend_map.items()):
        for change in changes:
            is_candidate = 'node' in change and change['candidate']
            if not is_candidate:
                continue

            candidate = change['node']
            if candidate.label_id() not in amend_map:
                candidate.label = label.split('-')
            elif label in amend_map:
                del amend_map[label]
                if warn:
                    mesg = 'Unable to match amendment to change for: %s'
                    logger.warning(mesg, label)


def find_misparsed_node(section_node, label, change, amended_labels):
    """ Try to locate a node that was mis-parsed, i.e. one that sits in the
    tree under a label other than `label`.

    Two strategies are used. First, if `label` is a markerless label (ex.
    123-44-p6 for paragraph 6) and the section contains exactly one
    markerless paragraph, that paragraph is taken to be the node. Failing
    that, `find_candidate` is asked for nodes ending in the same marker, and
    the result is used only when it is unambiguous.

    On success `change` gains the located node under 'node', is flagged as a
    'candidate', and is returned. Otherwise None is returned and `change` is
    left alone. """
    label_is_markerless = struct.Node.is_markerless_label(label)
    markerless = struct.filter_walk(
        section_node, struct.Node.is_markerless_label)

    if label_is_markerless and len(markerless) == 1:
        found = markerless[0]
    else:
        matches = find_candidate(section_node, label[-1], amended_labels)
        if len(matches) != 1:
            return None
        found = matches[0]

    change['node'] = found
    change['candidate'] = True
    return change


def match_labels_and_changes(amendments, section_node):
    """ Pair each amendment with the node it affects in the parsed section,
    so that only what's been flagged as changing is changed. This helps
    eliminate paragraphs that are just stars for positioning, for example.

    Returns an ordered mapping of label id to the list of change dicts for
    that label. MOVE and DELETE amendments need no node. All other actions
    are recorded only when `section_node` is given and a node (or an
    unambiguous mis-parsed candidate) can be found for them. """
    amended_labels = [a.label_id() for a in amendments]

    amend_map = OrderedDict()
    for amend in amendments:
        label_id = amend.label_id()
        change = {'action': amend.action, 'amdpar_xml': amend.amdpar_xml}
        if amend.field is not None:
            change['field'] = amend.field

        if amend.action == 'MOVE':
            change['destination'] = amend.destination
        elif amend.action != 'DELETE':
            if section_node is None:
                continue

            node = struct.find(section_node, label_id)
            if node is None:
                # No exact match; fall back to a mis-parsed candidate
                change = find_misparsed_node(
                    section_node, amend.label, change, amended_labels)
                if not change:
                    continue
            else:
                change['node'] = node
                change['candidate'] = False
                level2 = amend.tree_format_level2()
                if level2 and node.is_section():
                    change['parent_label'] = level2

        amend_map.setdefault(label_id, []).append(change)

    resolve_candidates(amend_map)
    return amend_map


def format_node(node, amendment, parent_label=None):
    """ Package a node and its amendment details as a `Change`.

    The change content always holds the dict form of the node and the
    amendment's action. Anything under the amendment's 'extras' is merged
    in, followed by its 'field' (when present) and `parent_label` (when
    truthy). The change is keyed by the node's label id. """
    content = {'node': node_to_dict(node), 'action': amendment['action']}

    if 'extras' in amendment:
        content.update(amendment['extras'])
    if 'field' in amendment:
        content['field'] = amendment['field']
    if parent_label:
        content['parent_label'] = parent_label

    return Change(label_id=node.label_id(), content=content)


def create_field_amendment(label, amendment):
    """ Build the changes for an amendment that touches just a field (text,
    title). Only nodes of the amendment's tree whose label id equals `label`
    are packaged; the remaining paragraphs get dealt with later, if
    appropriate. Returns a list of formatted changes. """
    flattened = []
    flatten_tree(flattened, amendment['node'])

    return [format_node(node, amendment)
            for node in flattened if node.label_id() == label]


def create_add_amendment(amendment, subpart_label=None):
    """ Split an amendment, which arrives carrying a whole tree, into one
    change per node. The tree is flattened and each node is converted to its
    dict form, so that every resulting change acts on a single node.

    The root node takes the amendment's 'parent_label' (if any); nodes with
    a two-component label take `subpart_label`; all others get no parent.

    For PUT changes the result is adjusted when stars are present: a node
    whose text (past its marker) is only '* * *' becomes a KEEP, and a root
    node whose text ends in a colon and is followed by a STARS tag is
    narrowed to a '[text]' field change. """
    root = amendment['node']
    flattened = []
    flatten_tree(flattened, root)

    changes = []
    for node in flattened:
        if node.label == root.label:
            parent_label = amendment.get('parent_label')
        elif len(node.label) == 2:
            parent_label = subpart_label
        else:
            parent_label = None
        changes.append(format_node(node, amendment, parent_label))

    for label_id, content in changes:
        if content['action'] != 'PUT':
            continue

        node = struct.find(root, label_id)
        stripped = node.text.strip()
        marker = marker_of(node)
        body = stripped[len(marker):].strip()

        # Nothing but stars: explicitly try to keep this node
        if body == '* * *':
            content['action'] = Verb.KEEP

        # Intro text (ends with a colon) followed by stars: assume only the
        # intro text is being modified
        is_root_intro = body.endswith(':') and node.label == root.label
        if is_root_intro and node.source_xml is not None:
            sibling = node.source_xml.getnext()
            if sibling is not None and sibling.tag == 'STARS':
                content['field'] = '[text]'

    return changes


def create_reserve_amendment(amendment):
    """ Build the change for a RESERVE related amendment: the amendment's
    own node, formatted with the amendment's details. """
    reserved_node = amendment['node']
    return format_node(reserved_node, amendment)


def create_subpart_amendment(subpart_node):
    """ Build the changes describing a subpart. The subpart is treated as a
    POST of its whole tree; the subpart's label is handed along so that, once
    the added nodes are flattened, each can specify which subpart it's part
    of. """
    amendment = dict(node=subpart_node, action='POST')
    return create_add_amendment(amendment, subpart_label=subpart_node.label)


def flatten_tree(node_list, node):
    """ Flatten a tree, removing all hierarchical information, making a
    list out of all the nodes.

    Copies are appended to `node_list` children-first, so a node's
    descendants come before the node itself. Each entry is an independent
    deep copy of the original node with an empty `children` list; the input
    tree is left untouched. """
    for child in node.children:
        flatten_tree(node_list, child)

    # Don't be destructive: work on a copy. Seeding deepcopy's memo with the
    # children list makes the copy use a fresh empty list in its place, so
    # the subtree isn't copied at every level only to be thrown away. `kids`
    # keeps the list alive (and its id stable) for the duration of the copy.
    kids = node.children
    no_kids = copy.deepcopy(node, {id(kids): []})
    # Explicit reset, in case a custom __deepcopy__ ignored the memo
    no_kids.children = []
    node_list.append(no_kids)


class NoticeChanges(object):
    """ Collects the changes found in a notice, grouped first by the amdpar
    XML they came from and then by label id. Insertion order is kept at both
    levels. """
    def __init__(self):
        self._changes_by_xml = OrderedDict()

    def add_change(self, amdpar_xml, change):
        """ Record `change` under `amdpar_xml`. A single label can accumulate
        more than one change, but identical content is only stored once (a
        duplicate may occur if both the parent and child are marked as
        added)"""
        by_label = self[amdpar_xml]
        recorded = by_label.setdefault(change.label_id, [])
        if change.content not in recorded:
            recorded.append(change.content)

    def __getitem__(self, key):
        """Fetch changes by XML, creating an empty entry on first access"""
        try:
            return self._changes_by_xml[key]
        except KeyError:
            fresh = self._changes_by_xml[key] = OrderedDict()
            return fresh


def fix_section_node(paragraphs, amdpar_xml):
    """ When notices are corrected, the XML for notices doesn't follow the
    normal syntax. Namely, paragraphs aren't inside section tags. This
    repairs that by locating the SECTION tag among the siblings preceding
    `amdpar_xml` and returning a copy of it with copies of `paragraphs`
    appended. Neither the original section nor the paragraphs are modified.

    Returns None unless exactly one preceding SECTION tag is found. """
    earlier = amdpar_xml.itersiblings(preceding=True)
    section_tags = [sib for sib in earlier if sib.tag == 'SECTION']

    # Only attempt the repair when the section is unambiguous
    if len(section_tags) != 1:
        return None

    (original,) = section_tags
    repaired = copy.deepcopy(original)
    for paragraph in paragraphs:
        repaired.append(copy.deepcopy(paragraph))
    return repaired


def find_subpart(amdpar_tag):
    """ Return the first SUBPART tag among the siblings yielded by
    `amdpar_tag.itersiblings()`, or None when there isn't one. """
    siblings = amdpar_tag.itersiblings()
    return next((sib for sib in siblings if sib.tag == 'SUBPART'), None)


def new_subpart_added(amendment):
    """ Return True if label indicates that a new subpart was added.

    That is the case when the amendment's action is 'POST' and its original
    label contains at least one match for the subpart label grammar. The
    result is always a real boolean (never the list of matches). """
    is_post = amendment.action == 'POST'
    label = amendment.original_label
    # One match is enough, so there's no need to collect them all
    matches = amdpar.subpart_label.scanString(label)
    has_subpart = any(True for _ in matches)
    return has_subpart and is_post