# Source code for regparser.grammar.amdpar

# -*- coding: utf-8 -*-
# @todo: this file is becoming too large; refactor
import logging
import string

import attr
from pyparsing import (CaselessLiteral, FollowedBy, LineEnd, Literal,
                       OneOrMore, Optional, QuotedString, Suppress, Word,
                       ZeroOrMore)
from six.moves import reduce

from regparser.grammar import atomic, tokens, unified
from regparser.grammar.utils import Marker, QuickSearchable, WordBoundaries
from regparser.tree.paragraph import hash_for_paragraph, p_levels
from regparser.tree.reg_text import subjgrp_label

logger = logging.getLogger(__name__)


# Matches "introductory text" or "subject heading". The second alternative's
# parse action replaces its tokens with the string "text" -- presumably to
# mirror the literal "text" matched by the first alternative, which the
# ``m[-1] == 'text'`` checks further down look for.
intro_text_marker = (
    (Marker("introductory") + WordBoundaries(CaselessLiteral("text"))) |
    (Marker("subject") + Marker("heading")).setParseAction(lambda _: "text")
)

# Connecting word between a component and its target: "of", "for" or "to"
of_connective = (Marker("of") | Marker("for") | Marker("to"))

# Word that precedes a passive verb. Only the "and" alternative carries a
# results name: it stores True under "and_prefix", which generate_verb reads
# back as ``m.and_prefix``.
passive_marker = (
    Marker("is") | Marker("are") | Marker("was") | Marker("were") |
    Marker("and").setResultsName("and_prefix").setParseAction(
        lambda _: True))


# A free-standing "and"; its parse action yields a tokens.AndToken
and_token = Marker("and").setParseAction(lambda _: tokens.AndToken())


# Verbs
def generate_verb(word_list, verb, active):
    """Short hand for making tokens.Verb from a list of trigger words

    Each entry of ``word_list`` becomes a case-insensitive literal. When
    ``active`` is false every literal must be preceded by ``passive_marker``.
    The alternatives are OR-ed together in list order and wrapped in
    ``WordBoundaries``.

    The returned grammar's parse action produces
    ``tokens.Verb(verb, active, and_prefix)``, where ``and_prefix`` is
    ``bool(m.and_prefix)`` for the match ``m``.

    ``word_list`` must not be empty: ``reduce`` is called without an initial
    value and raises ``TypeError`` on an empty sequence.
    """
    triggers = [CaselessLiteral(word) for word in word_list]
    if not active:
        triggers = [passive_marker + trigger for trigger in triggers]
    # Fold the alternatives left to right, preserving the caller's ordering
    grammar = WordBoundaries(reduce(lambda l, r: l | r, triggers))
    grammar = grammar.setParseAction(
        lambda m: tokens.Verb(verb, active, bool(m.and_prefix)))
    return grammar


# Active and passive trigger words for each kind of amendment verb. The
# passive forms are built with active=False, which makes generate_verb
# require a passive_marker in front of each trigger word.
put_active = generate_verb(
    ['revising', 'revise', 'correcting', 'correct'],
    tokens.Verb.PUT, active=True)

put_passive = generate_verb(
    ['revised', 'corrected'], tokens.Verb.PUT,
    active=False)

post_active = generate_verb(['adding', 'add'], tokens.Verb.POST, active=True)
post_passive = generate_verb(['added'], tokens.Verb.POST, active=False)

delete_active = generate_verb(
    ['removing', 'remove'], tokens.Verb.DELETE, active=True)
delete_passive = generate_verb(['removed'], tokens.Verb.DELETE, active=False)

move_active = generate_verb(
    ['redesignating', 'redesignate'], tokens.Verb.MOVE, active=True)
move_passive = generate_verb(['redesignated'], tokens.Verb.MOVE, active=False)

# DESIGNATE and RESERVE are only defined in the active voice here
designate_active = generate_verb(
    ['designate'],
    tokens.Verb.DESIGNATE, active=True)

reserve_active = generate_verb(['reserve', 'reserving'],
                               tokens.Verb.RESERVE, active=True)

# Explicit (case-sensitive) marker text rather than a natural-language verb.
# tokens.Verb is built here without the third positional argument that
# generate_verb supplies.
insert_in_order = Literal("[insert-in-order]").setParseAction(
    lambda m: tokens.Verb(tokens.Verb.INSERT, active=True))


#   Context
# Optional lead-in word ("in", "to", "of", "under" or "under subheading"),
# stored under the results name "certain". The context parse actions pass
# bool(m.certain) as the second argument to tokens.Context.
context_certainty = Optional(
    Marker("in") | Marker("to") | Marker("of") | (
        Marker("under") + Optional(
            Marker("subheading")))).setResultsName("certain")

# Comment marker followed by a part marker; yields a Context labelled
# [part, 'Interpretations']
interp = (
    context_certainty + atomic.comment_marker + unified.marker_part
).setParseAction(
    lambda m: tokens.Context([m.part, 'Interpretations'], bool(m.certain)))


# This may be a regtext paragraph or it may be an interpretation
# A section followed by paragraph markers, rejected when a "-" comes next
# (negative lookahead). The first two label slots are left as None and no
# second argument is passed to tokens.Context.
paragraph_context = (
    atomic.section +
    unified.depth1_p + ~
    FollowedBy("-")
).setParseAction(
    lambda m: tokens.Context([None, None, m.section, m.p1, m.p2, m.p3, m.p4,
                              m.plaintext_p5, m.plaintext_p6]))


def _paren_join(elements):
    """Wrap each truthy element in parentheses and concatenate the results.

    Falsy entries (``None``, empty strings) are dropped. If nothing remains
    the result is the bare pair ``'()'``.
    """
    kept = [element for element in elements if element]
    return '({0})'.format(')('.join(kept))


def _paren_from_match(match):
    """Build the parenthesized paragraph string for a parse match.

    Reads ``p1`` through ``p4`` plus ``plaintext_p5`` and ``plaintext_p6``
    from ``match`` (in that order) and hands them to ``_paren_join``.
    """
    level_names = ('p1', 'p2', 'p3', 'p4', 'plaintext_p5', 'plaintext_p6')
    return _paren_join([getattr(match, name) for name in level_names])


# Subpart context; label is [None, 'Subpart:<subpart>']
marker_subpart = (
    context_certainty +
    unified.marker_subpart
).setParseAction(
    lambda m: tokens.Context([None, 'Subpart:' + m.subpart], bool(m.certain)))
# "comment"/"paragraph" + section + paragraph markers, not followed by "-".
# Label is [None, 'Interpretations', section, '(p1)(p2)...'].
comment_context_with_section = (
    context_certainty +
    #   Confusingly, these are sometimes "comments", sometimes "paragraphs"
    (Marker("comment") | Marker("paragraph")) +
    atomic.section +
    unified.depth1_p + ~
    FollowedBy("-")
).setParseAction(lambda m: tokens.Context(
    [None, 'Interpretations', m.section, _paren_from_match(m)],
    bool(m.certain)))
# Mild modification of the above; catches "under 2(b)"
# Differences: "under" is required, there is no "-" lookahead, and the
# second Context argument is always True.
comment_context_under_with_section = (
    Marker("under") +
    atomic.section +
    unified.depth1_p
).setParseAction(lambda m: tokens.Context(
    [None, 'Interpretations', m.section, _paren_from_match(m)], True))
# Paragraph marker + paragraph levels with no section; the section slot is
# None and the parenthesized part is built from p2 onwards (p1 is not read).
comment_context_without_section = (
    context_certainty +
    atomic.paragraph_marker +
    unified.depth2_p
).setParseAction(
    lambda m: tokens.Context(
        [None, 'Interpretations', None,
         _paren_join([m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6])],
        bool(m.certain)))
# Appendix context, optionally followed by "to" + a part marker; label is
# [part, 'Appendix:<appendix>']
appendix = (
    context_certainty +
    unified.marker_appendix +
    Optional(Marker("to") + unified.marker_part)
).setParseAction(
    lambda m: tokens.Context([m.part, 'Appendix:' + m.appendix],
                             bool(m.certain)))
# Section context; label is [part, None, section]
section = (
    context_certainty +
    atomic.section_marker +
    unified.part_section
).setParseAction(
    lambda m: tokens.Context([m.part, None, m.section], bool(m.certain)))


#   Paragraph components (used when not replacing the whole paragraph)
# Both yield a tokens.Paragraph built from an empty label list; only the
# field (heading vs. text) is set.
section_heading = Marker("heading").setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.HEADING_FIELD))
# Works on a copy so the parse action set on intro_text_marker's second
# alternative (which other grammars rely on) is left untouched.
intro_text = intro_text_marker.copy().setParseAction(
    lambda _: tokens.Paragraph([], field=tokens.Paragraph.TEXT_FIELD))


#   Paragraphs
# Dotted comment paragraph: digits ("level2"), optionally "." + lowercase
# roman-numeral characters ("level3"), optionally "." + uppercase letters
# ("level4"). The dots themselves are suppressed.
comment_p = (
    Word(string.digits).setResultsName("level2") +
    Optional(
        Suppress(".") + Word("ivxlcdm").setResultsName('level3') +
        Optional(
            Suppress(".") +
            Word(string.ascii_uppercase).setResultsName("level4"))))

# "heading of/for/to" + part/section marker; Paragraph with HEADING_FIELD
section_heading_of = (
    Marker("heading") + of_connective +
    unified.marker_part_section
).setParseAction(
    lambda m: tokens.Paragraph.make(part=m.part, section=m.section,
                                    field=tokens.Paragraph.HEADING_FIELD))

# "heading of/for/to" + ("paragraph" marker | "comment") + section +
# paragraph levels; produces an interpretation Paragraph (is_interp=True).
# NOTE(review): reads m.p5 where sibling rules read m.plaintext_p5 and
# m.plaintext_p6 -- confirm this is intended.
section_paragraph_heading_of = (
    Marker("heading") + of_connective +
    (atomic.paragraph_marker | Marker("comment")) +
    atomic.section +
    unified.depth1_p
).setParseAction(
    lambda m: tokens.Paragraph.make(
        is_interp=True, section=m.section,
        paragraphs=[_paren_join([m.p1, m.p2, m.p3, m.p4, m.p5])],
        field=tokens.Paragraph.HEADING_FIELD))

# "subheading" + appendix marker; the appendix is passed as the section
appendix_subheading = (
    Marker("subheading") +
    unified.marker_appendix
).setParseAction(
    # Use '()' to pad the label out to what's expected of interpretations
    lambda m: tokens.Paragraph.make(
        is_interp=True, section=m.appendix, paragraphs=['()'],
        field=tokens.Paragraph.HEADING_FIELD))


# "heading of/for/to" + paragraph marker. Note the field is KEYTERM_FIELD,
# not HEADING_FIELD as in the other heading rules.
paragraph_heading_of = (
    Marker("heading") + of_connective +
    unified.marker_paragraph.copy()
).setParseAction(
    lambda m: tokens.Paragraph.make(
        paragraphs=[m.p1, m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6],
        field=tokens.Paragraph.KEYTERM_FIELD))

# "heading" + optional connective + section + paragraph levels, with no
# paragraph/comment marker required.
# NOTE(review): like section_paragraph_heading_of, this reads m.p5 rather
# than m.plaintext_p5/m.plaintext_p6 -- confirm this is intended.
comment_heading = (
    Marker("heading") +
    Optional(of_connective) +
    atomic.section +
    unified.depth1_p
).setParseAction(
    lambda m: tokens.Paragraph.make(
        is_interp=True, section=m.section,
        paragraphs=[_paren_join([m.p1, m.p2, m.p3, m.p4, m.p5])],
        field=tokens.Paragraph.HEADING_FIELD))

# e.g. "introductory text of paragraph (a)(5)(ii)"
intro_text_of = (
    intro_text_marker + of_connective +
    atomic.paragraph_marker +
    unified.depth1_p
).setParseAction(
    lambda m: tokens.Paragraph.make(
        paragraphs=[m.p1, m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6],
        field=tokens.Paragraph.TEXT_FIELD))

# Same lead-in as intro_text_of, but the target is a dotted comment
# paragraph (comment_p); the first paragraph slot is None.
intro_text_of_interp = (
    intro_text_marker + of_connective +
    atomic.paragraph_marker +
    comment_p
).setParseAction(
    lambda m: tokens.Paragraph.make(
        is_interp=True, paragraphs=[None, m.level2, m.level3, m.level4],
        field=tokens.Paragraph.TEXT_FIELD))

# A single marked paragraph. If the last token of the match is 'text' (from
# the optional intro_text_marker) the field is TEXT_FIELD, otherwise None.
single_par = (
    unified.marker_paragraph +
    Optional(intro_text_marker)
).setParseAction(
    lambda m: tokens.Paragraph.make(
        paragraphs=[m.p1, m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6],
        field=(tokens.Paragraph.TEXT_FIELD if m[-1] == 'text' else None)))
# As single_par, but introduced by a part/section marker whose part and
# section are carried into the Paragraph.
section_single_par = (
    unified.marker_part_section +
    unified.depth1_p +
    Optional(intro_text_marker)
).setParseAction(
    lambda m: tokens.Paragraph.make(
        part=m.part, section=m.section,
        paragraphs=[m.p1, m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6],
        field=(tokens.Paragraph.TEXT_FIELD if m[-1] == 'text' else None)))
# Matches "paragraph (a)(1)(i) of § 12.44"
single_par_section = (
    Optional(atomic.paragraph_marker) +
    unified.depth1_p +
    of_connective +
    unified.marker_part_section
).setParseAction(
    lambda m: tokens.Paragraph.make(
        part=m.part, section=m.section,
        paragraphs=[m.p1, m.p2, m.p3, m.p4, m.plaintext_p5, m.plaintext_p6]))

# "comment"/"paragraph" + section + paragraph levels + "-" + a dotted
# comment paragraph, which may be wrapped in optional parentheses. The
# "-", "(" and ")" literals are not wrapped in Suppress here.
single_comment_with_section = (
    (Marker("comment") | Marker("paragraph")) +
    atomic.section +
    unified.depth1_p +
    "-" +
    Optional("(") + comment_p + Optional(")")
).setParseAction(
    lambda m: tokens.Paragraph.make(
        is_interp=True, section=m.section,
        paragraphs=[_paren_from_match(m), m.level2, m.level3, m.level4]))
# Paragraph marker + dotted comment paragraph, with no section given
single_comment_par = (
    atomic.paragraph_marker +
    comment_p
).setParseAction(
    lambda m: tokens.Paragraph.make(
        is_interp=True, paragraphs=[None, m.level2, m.level3, m.level4]))


#   Token Lists
def make_multiple(to_repeat):
    """Shorthand for handling repeated tokens ('and', ',', 'through')

    Builds a grammar for one ``to_repeat`` match followed by one or more
    further matches, each of which is introduced by ``atomic.conj_phrases``.
    Any occurrence may be trailed by ``intro_text_marker``.

    The first occurrence is stored under the results name "head". Every
    later occurrence (conjunction included) is collected under "tail", with
    ``listAllMatches=True`` so all of them are kept.
    """
    head = (to_repeat + Optional(intro_text_marker)).setResultsName("head")
    tail = (
        atomic.conj_phrases +
        to_repeat +
        Optional(intro_text_marker)
    ).setResultsName("tail", listAllMatches=True)
    return head + OneOrMore(tail)


def _through_paren(prev_lab, next_lab):
    """Expand "through" for labels with embedded paragraphs (e.g. 12(c))

    Returns the paragraphs strictly between ``prev_lab`` and ``next_lab``
    (both endpoints excluded). Returns an empty list, after logging a
    warning, when the range cannot be computed.
    """
    first_text, last_text = prev_lab[-1], next_lab[-1]
    first_open, last_open = first_text.rindex('('), last_text.rindex('(')

    # Only the final parenthesized marker may differ between the two labels.
    # For example, we can't compute A-14(a)(2) through B-14(a)(4) nor can we
    # compute A-14(a)(1) through A-14(b)(3)
    same_stem = first_text[:first_open] == last_text[:last_open]
    same_parent = prev_lab[:-1] == next_lab[:-1]
    if not (same_stem and same_parent):
        logger.warning("Bad use of 'through': %s %s", prev_lab, next_lab)
        return []

    # Everything up to and including the last "(" is shared by the results
    stem = first_text[:first_open + 1]
    first_marker = first_text[first_open + 1:-1]
    last_marker = last_text[last_open + 1:-1]
    for level in p_levels:
        if first_marker not in level or last_marker not in level:
            continue
        low, high = level.index(first_marker), level.index(last_marker)
        if low < high:
            parent = prev_lab[:-1]
            return [tokens.Paragraph.make(parent + [stem + level[pos] + ')'])
                    for pos in range(low + 1, high)]
    logger.warning("Error with 'through': %s %s", prev_lab, next_lab)
    return []


def _through_sect(prev_lab, next_lab):
    """Expand "through" for labels ending in a section number.

    Produces one paragraph per integer strictly between the last components
    of ``prev_lab`` and ``next_lab``, keeping the first two components of
    ``prev_lab`` as the prefix.
    """
    first, last = int(prev_lab[-1]), int(next_lab[-1])
    expanded = []
    for number in range(first + 1, last):
        expanded.append(tokens.Paragraph.make(prev_lab[:2] + [str(number)]))
    return expanded


def _through_paragraph(prev_lab, next_lab):
    """Expand "through" for labels ending in a paragraph.

    The marker sequence is chosen from ``p_levels`` by the length of
    ``prev_lab`` (a four-component label uses ``p_levels[0]``). Returns the
    paragraphs strictly between the two labels' final markers.
    """
    depth = len(prev_lab)
    level = p_levels[depth - 4]
    begin = level.index(prev_lab[-1]) + 1
    stop = level.index(next_lab[-1])
    expanded = []
    for pos in range(begin, stop):
        expanded.append(
            tokens.Paragraph.make(prev_lab[:depth - 1] + [level[pos]]))
    return expanded


def make_par_list(listify, force_text_field=False):
    """Shorthand for turning a pyparsing match into a tokens.Paragraph

    Returns a parse action for grammars built with "head"/"tail" results
    names (see ``make_multiple``). The action converts the head and every
    tail entry into a ``tokens.Paragraph`` via ``listify`` and returns them
    wrapped in a ``tokens.TokenList``.

    ``listify`` maps one sub-match to the label list handed to
    ``tokens.Paragraph.make``. When ``force_text_field`` is true, or the
    sub-match ends in the token ``'text'``, the paragraph's field is set to
    ``tokens.Paragraph.TEXT_FIELD``.

    A sub-match with a truthy ``through`` attribute also inserts the
    paragraphs lying between the previous entry and itself.
    """
    def curried(match=None):
        pars = []
        # A distinct loop name avoids rebinding the ``match`` parameter
        for piece in [match.head] + list(match.tail):
            next_par = tokens.Paragraph.make(listify(piece))
            next_lab = next_par.label
            if piece[-1] == 'text' or force_text_field:
                next_par = attr.assoc(next_par,
                                      field=tokens.Paragraph.TEXT_FIELD)
            if piece.through:
                #   Iterate through, creating paragraph tokens
                prev_lab = pars[-1].label
                if '(' in prev_lab[-1] and '(' in next_lab[-1]:
                    pars.extend(_through_paren(prev_lab, next_lab))
                elif len(prev_lab) == 3:
                    pars.extend(_through_sect(prev_lab, next_lab))
                elif len(prev_lab) > 3:
                    pars.extend(_through_paragraph(prev_lab, next_lab))
            pars.append(next_par)
        return tokens.TokenList(pars)
    return curried


# Each grammar below pairs make_multiple (head + one or more tail entries)
# with make_par_list, whose lambda maps one entry to a label list.

# Sections marker + repeated part/section; labels are [part, None, section]
multiple_sections = (
    atomic.sections_marker +
    make_multiple(unified.part_section)
).setParseAction(make_par_list(lambda m: [m.part, None, m.section]))


# Section marker (singular) + repeated paragraphs, each optionally preceded
# by a part/section
multiple_paragraph_sections = (
    atomic.section_marker +
    make_multiple(Optional(unified.part_section) + unified.any_depth_p)
).setParseAction(make_par_list(lambda m: [
    m.part, None, m.section, m.p1, m.p2, m.p3, m.p4, m.plaintext_p5,
    m.plaintext_p6]))


# Single appendix-with-section reference. Works on a copy of the shared
# grammar before attaching a parse action.
appendix_section = unified.appendix_with_section.copy().setParseAction(
    lambda m: tokens.Paragraph.make(appendix=m.appendix,
                                    section=m.appendix_section))

# "heading of/for/to" + appendix-with-section; Paragraph with HEADING_FIELD
appendix_section_heading_of = (
    Marker("heading") + of_connective +
    unified.appendix_with_section
).copy().setParseAction(
    lambda m: tokens.Paragraph.make(
        appendix=m.appendix, section=m.appendix_section,
        field=tokens.Paragraph.HEADING_FIELD))

# Repeated appendix-with-section references (no leading marker word);
# labels are [None, 'Appendix:<appendix>', appendix_section]
multiple_appendices = make_multiple(
    unified.appendix_with_section
).setParseAction(make_par_list(
    lambda m: [None, 'Appendix:' + m.appendix, m.appendix_section]))

# Paragraphs marker + repeated dotted comment paragraphs; the section and
# first paragraph slots are None
multiple_comment_pars = (
    atomic.paragraphs_marker +
    make_multiple(comment_p)
).setParseAction(make_par_list(lambda m: [
    None, 'Interpretations', None, None, m.level2, m.level3, m.level4]))

#   Not a context as one wouldn't list these for contextual purposes
multiple_comments = (
    Marker("comments") +
    make_multiple(atomic.section + unified.depth1_p)
).setParseAction(make_par_list(lambda m: [
    None, 'Interpretations', m.section, _paren_from_match(m)]))

# "entries for" + section with paragraph levels as the head, then one or
# more conjunction + paragraph entries as the tail. The head/tail structure
# is spelled out here rather than built with make_multiple because the head
# and tail use different grammars.
multiple_interp_entries = (
    Marker("entries") + Marker("for") +
    (atomic.section + unified.depth1_p).setResultsName("head") +
    OneOrMore((
        atomic.conj_phrases +
        unified.any_depth_p
    ).setResultsName("tail", listAllMatches=True))
).setParseAction(make_par_list(
    lambda m: [None, None, m.section, m.p1, m.p2, m.p3, m.p4,
               m.plaintext_p5, m.plaintext_p6]))

# Paragraphs marker (plural or singular) + repeated paragraphs
multiple_paragraphs = (
    (atomic.paragraphs_marker | atomic.paragraph_marker) +
    make_multiple(unified.any_depth_p)
).setParseAction(make_par_list(lambda m: [
    m.part, None, m.section, m.p1, m.p2, m.p3, m.p4, m.plaintext_p5,
    m.plaintext_p6]))


# e.g. "introductory text of paragraphs (a)(5)(ii) and (d)(5)(ii)"
# force_text_field=True marks every resulting paragraph with TEXT_FIELD.
multiple_intro_text_of = (
    intro_text_marker + of_connective +
    atomic.paragraphs_marker +
    make_multiple(unified.any_depth_p)
).setParseAction(make_par_list(
    lambda m: [None, None, None, m.p1, m.p2, m.p3, m.p4, m.plaintext_p5,
               m.plaintext_p6],
    force_text_field=True))


def tokenize_override_ps(match):
    """ Create token.Paragraphs for the given override match

    ``match`` is the result of the ``override_label`` grammar. Its tokens are
    read positionally: the first is the part, the second the section or
    appendix, and any remaining tokens are paragraph levels. ``match.part``,
    ``match.section`` and ``match.appendix`` are read by name.

    Returns a one-element list holding the ``tokens.Paragraph`` built from
    those components. An appendix is stored as ``"Appendix:<appendix>"``.
    More than six paragraph-level tokens raise ``IndexError``.
    """
    # Part, Section or Appendix, p1, p2, p3, p4, p5, p6
    match_list = list(match)
    par_list = [match.part, None, None, None, None, None, None, None]

    if match.section:
        par_list[1] = match.section
    elif match.appendix:
        par_list[1] = "Appendix:" + match.appendix

    # Set paragraph depths. Positions come from enumerate rather than
    # list.index: index() returns the first occurrence, so a repeated label
    # part (e.g. 1005-2-a-1-a) would be written to the earlier slot and leave
    # its own slot empty.
    for position, level in enumerate(match_list[2:], 2):
        par_list[position] = level

    par = tokens.Paragraph.make(par_list)
    return [par]


# One label component written as keyterm(...): the parenthesized text is
# replaced by "p" followed by hash_for_paragraph of that text
_keyterm_label_part = (
    Suppress(Marker("keyterm")) +
    QuotedString(quoteChar='(', endQuoteChar=')')
).setParseAction(lambda m: "p{0}".format(hash_for_paragraph(m[0])))
# One plain label component: ASCII letters and digits only
_simple_label_part = Word(string.ascii_lowercase + string.ascii_uppercase +
                          string.digits)
# The keyterm form is tried first
_label_part = _keyterm_label_part | _simple_label_part

# Explicit label of the form [label: <part>-<section or appendix>-...], with
# any number of further "-"-separated components. Brackets, colon and
# dashes are suppressed; tokenize_override_ps turns the remaining tokens
# into a Paragraph.
override_label = (
    Suppress("[") +
    Marker("label") + Suppress(":") +
    atomic.part +
    Suppress("-") +
    (atomic.section | atomic.appendix) +
    ZeroOrMore(Suppress("-") + _label_part) +
    Suppress("]")
).setParseAction(tokenize_override_ps)

# Looks like: [subject-group(Some text Goes Here)]
# The text in parentheses is stored as "subgroup" and converted with
# subjgrp_label(m.subgroup, []); label is [None, 'Subjgrp:<label>'].
subject_group = (
    context_certainty +
    Suppress("[subject-group") +
    QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup") +
    Suppress("]")
).setParseAction(lambda m: tokens.Context(
    [None, 'Subjgrp:' + subjgrp_label(m.subgroup, [])], bool(m.certain)))

# Phrases like '“Nonimmigrant visa”' become 'p12345678'
_double_quote_label = QuotedString(
    quoteChar=u'“', endQuoteChar=u'”'
).setParseAction(lambda m: "p{0}".format(hash_for_paragraph(m[0])))
# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
definition = (
    Marker("definition") +
    (Marker("of") | Marker("for")) +
    Optional(Marker("the") + Marker("term")) +
    _double_quote_label.copy().setResultsName("paragraph")
).setParseAction(lambda m: tokens.Paragraph.make(paragraphs=[m.paragraph]))

#   grammar which captures all of these possibilities
# The alternatives are combined with "|", so they are tried in the order
# listed; the "Must come after" notes below record ordering constraints
# between overlapping rules.
token_patterns = QuickSearchable(
    put_active | put_passive | post_active | post_passive |
    delete_active | delete_passive | move_active | move_passive |
    designate_active | reserve_active |
    insert_in_order |

    interp | marker_subpart | appendix |
    comment_context_with_section | comment_context_without_section |
    comment_context_under_with_section |
    paragraph_heading_of | section_heading_of |
    multiple_intro_text_of | intro_text_of |
    appendix_section_heading_of |
    intro_text_of_interp |
    comment_heading | appendix_subheading | section_paragraph_heading_of |
    # Must come after other headings as it is a catch-all
    section_heading |
    multiple_paragraph_sections | section_single_par |
    multiple_interp_entries |

    multiple_sections | multiple_paragraphs | multiple_appendices |
    multiple_comment_pars | multiple_comments |
    #   Must come after multiple_appendices
    appendix_section |
    #   Must come after multiple_paragraphs
    single_par_section | single_par |
    #   Must come after multiple_comment_pars
    single_comment_with_section | single_comment_par |
    #   Must come after section_single_par
    section |
    #   Must come after intro_text_of
    intro_text |

    definition |

    # Finally allow for an explicit override label
    override_label | subject_group |

    paragraph_context |
    and_token
)

# A part, "-", the subpart marker, ":", then exactly one uppercase ASCII
# letter followed by the end of the line. The "-" and ":" are suppressed.
subpart_label = QuickSearchable(
    atomic.part + Suppress('-') +
    atomic.subpart_marker + Suppress(':') +
    Word(string.ascii_uppercase, max=1) +
    LineEnd())