Source code for regparser.notice.sxs

from copy import deepcopy
from itertools import dropwhile, takewhile

from lxml import etree

from regparser.citations import Label, internal_citations
from regparser.notice.util import (body_to_string, spaces_then_remove,
                                   swap_emphasis_tags)


def remove_extract(xml_tree):
    """Return a copy of `xml_tree` in which every EXTRACT element has been
    replaced by its own children.

    Paragraphs and other useful nodes are occasionally wrapped in an EXTRACT
    tag. To normalize, each EXTRACT's children are hoisted into the EXTRACT's
    parent, in their original order and at the EXTRACT's position, and the
    (now empty) EXTRACT is dropped. The tree passed in is left untouched; all
    work happens on a deep copy."""
    flattened = deepcopy(xml_tree)
    for wrapper in flattened.xpath('//EXTRACT'):
        container = wrapper.getparent()
        position = container.index(wrapper)
        # Snapshot the children first; they are detached one by one below
        for offset, node in enumerate(list(wrapper)):
            wrapper.remove(node)
            container.insert(position + offset, node)
        container.remove(wrapper)
    return flattened


def find_section_by_section(xml_tree):
    """Find the section-by-section analysis of this notice.

    The direct children of SUPLINF (after EXTRACT tags have been flattened by
    remove_extract) are scanned for an HD element with SOURCE="HD1" whose
    text contains "section-by-section" (case-insensitive). That header and
    any nodes between it and the next HD are skipped; everything from that
    next HD up to, but not including, the following HD1 header is returned
    as a list. An empty list is returned when no matching header exists."""
    def is_sxs_header(el):
        # el.text is None when the header's content starts with a child
        # element (or is empty); treat that as "no text" rather than
        # raising AttributeError on None.lower()
        return (el.tag == 'HD' and
                el.get('SOURCE') == 'HD1' and
                'section-by-section' in (el.text or '').lower())

    xml_children = remove_extract(xml_tree).xpath('//SUPLINF/*')
    sxs = dropwhile(lambda el: not is_sxs_header(el), xml_children)

    try:
        # Ignore Header
        next(sxs)
    except StopIteration:
        # No section-by-section header in this notice
        return []

    # Remove any intro paragraphs
    sxs = dropwhile(lambda el: el.tag != 'HD', sxs)
    # Stop at the next top-level (HD1) header
    sxs = takewhile(
        lambda el: el.tag != 'HD' or el.get('SOURCE') != 'HD1', sxs)
    return list(sxs)


def find_page(xml, index_line, page_number):
    """Find the FR page that includes the indexed line.

    Walks the PRTPAGE elements returned by `xml.xpath('//PRTPAGE')` in order,
    stopping at the first one whose `sourceline` is not before `index_line`.
    Each marker seen before that point which carries a non-empty "P"
    attribute overrides the running page number. `page_number` is the value
    returned when no such marker is found."""
    for marker in xml.xpath('//PRTPAGE'):
        if not marker.sourceline < index_line:
            # Markers past the indexed line (and everything after them) are
            # not considered
            break
        declared = marker.get('P')
        if declared:
            page_number = int(declared)
    return page_number


def build_section_by_section(sxs, fr_start_page, previous_label):
    """Given a list of xml nodes in the section by section analysis, pull
    out hierarchical data into a structure. Previous label is carried along to
    merge analyses of the same section.

    `previous_label` is a dash-joined label string; its first component is
    used as the CFR part when parsing citations. Returns a list of dicts, one
    per header, with the keys 'page', 'title', 'paragraphs', 'children' (the
    same structure, built recursively from the header's sub-sections) and
    'footnote_refs'. A 'labels' key is added only when the header cites at
    least one label that is neither a repeat of, nor a backtrack from, the
    previous label."""
    structures = []
    # Explicit len() check kept from the original, which notes that bare
    # truthiness ("while sxs:") is deprecated -- presumably for lxml elements
    while len(sxs):
        cfr_part = previous_label.split('-')[0]
        title, text_els, sub_sections, sxs = split_into_ttsr(sxs, cfr_part)

        page = find_page(title, title.sourceline, fr_start_page)
        # Work on copies: the clean-up below mutates the paragraph nodes
        paragraph_xmls = [deepcopy(el) for el in text_els
                          if el.tag in ('P', 'FP')]
        footnotes = []
        for p_idx, paragraph_xml in enumerate(paragraph_xmls):
            spaces_then_remove(paragraph_xml, 'PRTPAGE')
            spaces_then_remove(paragraph_xml, 'FTREF')
            swap_emphasis_tags(paragraph_xml)
            # Anything inside a SU can also be ignored
            for su in paragraph_xml.xpath('./SU'):
                su_text = etree.tounicode(su)
                # Record where the footnote marker sat before removing it
                footnotes.append({
                    'paragraph': p_idx,
                    'reference': su.text,
                    'offset': body_to_string(paragraph_xml).find(su_text)})
                # Keep the text trailing the SU by re-attaching it to the
                # preceding sibling's tail, or to the parent's text when
                # the SU is the first child
                parent = su.getparent()
                previous_sibling = su.getprevious()
                if su.tail and previous_sibling is not None:
                    previous_sibling.tail = (
                        (previous_sibling.tail or '') + su.tail)
                elif su.tail:
                    parent.text = (parent.text or '') + su.tail
                parent.remove(su)

        paragraphs = [body_to_string(el) for el in paragraph_xmls]
        labels = parse_into_labels(title.text, cfr_part)
        label_for_children = labels[-1] if labels else previous_label

        # recursively build children. Be sure to give them the proper label
        children = build_section_by_section(sub_sections, page,
                                            label_for_children)

        next_structure = {
            'page': page,
            'title': add_spaces_to_title(title.text),
            'paragraphs': paragraphs,
            'children': children,
            'footnote_refs': footnotes
        }

        # Compare labels component-wise (as is_child_of does). Handing the
        # dash-joined strings to is_backtrack would compare characters, so
        # e.g. "1005-2" would wrongly count as a backtrack from "1005-22".
        previous_parts = previous_label.split('-')
        if (labels and  # No label => subheader
                # Concatenate if repeat label or backtrack
                not all(label == previous_label or
                        is_backtrack(previous_parts, label.split('-'))
                        for label in labels)):
            previous_label = labels[-1]
            next_structure['labels'] = labels
        structures.append(next_structure)

    return structures


def add_spaces_to_title(title):
    """Insert the space the Federal Register often omits after a citation in
    the title of an SxS section.

    For each citation found in `title`, the character immediately after the
    citation is inspected: if it is alphabetic and the character just before
    it is not a space, a single space is inserted between them. Only the
    first such spot is fixed (this assumes there is only one paragraph in a
    title); a title with nothing to fix is returned unchanged."""
    for cite in internal_citations(title, Label()):
        cut = cite.end
        if cut >= len(title):
            # Citation runs to the end of the title; nothing follows it
            continue
        # Next char is an alpha and last char isn't a space
        if title[cut].isalpha() and title[cut - 1] != ' ':
            return title[:cut] + ' ' + title[cut:]
    return title


def is_backtrack(previous_label, next_label):
    """If we've already processed a header with 22(c) in it, we can assume
    that any following headers with 1111.22 are *not* supposed to be an
    analysis of 1111.22.

    Returns True when `next_label` is non-empty, strictly shorter than
    `previous_label`, and equal to the leading portion of `previous_label`
    of the same length; returns False otherwise. None (or any other falsy
    value) for either argument is treated as an empty label. The result is
    always a bool."""
    previous_label = previous_label or []
    next_label = next_label or []
    # An empty label, or one at least as long as the previous label, cannot
    # be a backtrack to an ancestor
    if not next_label or len(previous_label) <= len(next_label):
        return False
    return previous_label[:len(next_label)] == next_label


def is_child_of(child_xml, header_xml, cfr_part, header_citations=None):
    """Decide whether `child_xml` belongs underneath `header_xml`.

    Any node that is not an HD is a child. An HD node is a child when one of
    the following holds:
    - its SOURCE attribute compares greater than the header's (a plain
      comparison of the attribute values)
    - the header has citations and the child has none
    - the header's last citation equals the child's first
    - the child's first citation is a backtrack from the header's last
      (see is_backtrack), i.e. the citation in the child is incorrect
    `header_citations` may be passed in to avoid re-parsing the header's
    text; when None it is computed from `header_xml.text`."""
    if child_xml.tag != 'HD':
        return True

    if header_citations is None:
        header_citations = parse_into_labels(header_xml.text, cfr_part)
    child_citations = parse_into_labels(child_xml.text, cfr_part)

    if child_xml.get('SOURCE') > header_xml.get('SOURCE'):
        return True
    if not header_citations:
        # Without header citations there is nothing to match against
        return False
    if not child_citations:
        return True

    header_label = header_citations[-1]
    child_label = child_citations[0]
    if header_label == child_label:
        return True
    return is_backtrack(header_label.split('-'), child_label.split('-'))


def split_into_ttsr(sxs, cfr_part):
    """Split the provided list of xml nodes into four pieces, returned as a
    tuple: the title node (the first node), the text nodes that directly
    follow it, the nodes making up the sub sections of this header, and the
    remaining xml nodes.

    The title's section is the longest run of following nodes for which
    is_child_of holds. Within that run, everything before the first HD is
    text and everything from that HD onwards is sub-section material."""
    title = sxs[0]
    title_citations = parse_into_labels(title.text, cfr_part)

    # Collect nodes until the first one which is not a child of the title
    section = []
    for node in sxs[1:]:
        if not is_child_of(node, title, cfr_part, title_citations):
            break
        section.append(node)

    # Index of the first nested header, or the section's end if none
    first_header = next(
        (idx for idx, node in enumerate(section) if node.tag == 'HD'),
        len(section))
    text_elements = section[:first_header]
    sub_sections = section[first_header:]
    remaining = sxs[1 + len(section):]
    return (title, text_elements, sub_sections, remaining)


def parse_into_labels(txt, part):
    """Find what part+section+(paragraph) (could be multiple) this text is
    related to.

    Returns a list of strings, one per citation found in `txt`, each built
    by joining the components of the citation's label with '-'. When any of
    the labels is an appendix label, only the appendix labels are kept."""
    found = [cite.label
             for cite in internal_citations(txt, Label(part=part))]
    # odd corner case: headers shouldn't include both an appendix and regtext
    appendix_only = [lbl for lbl in found if 'appendix' in lbl.settings]
    chosen = appendix_only or found
    return ['-'.join(lbl.to_list()) for lbl in chosen]