# Source code for regparser.grammar.amdpar

# -*- coding: utf-8 -*-
# @todo: this file is becoming too large; refactor
import logging
import string

import attr
from pyparsing import (CaselessLiteral, FollowedBy, LineEnd, Literal,
                       OneOrMore, Optional, QuotedString, Suppress, Word,
                       ZeroOrMore)
from six.moves import reduce

from regparser.grammar import atomic, tokens, unified
from regparser.grammar.utils import Marker, QuickSearchable, WordBoundaries
from regparser.tree.paragraph import hash_for_paragraph, p_levels
from regparser.tree.reg_text import subjgrp_label

logger = logging.getLogger(__name__)


# Phrases that refer to a paragraph's introductory text. The "subject heading"
# alternative has its result rewritten to the string "text"; parse actions
# elsewhere in this module test for a trailing 'text' result when deciding
# whether to select the text field.
intro_text_marker = (
    (Marker("introductory") + WordBoundaries(CaselessLiteral("text")))
    | (Marker("subject") + Marker("heading")).setParseAction(
        lambda _match: "text")
)

# Connecting words accepted between a component and what it belongs to
of_connective = Marker("of") | Marker("for") | Marker("to")

# Words that may precede a passive verb. Only the "and" alternative is given
# a results name (and_prefix), and its result is replaced by True so that
# callers can test `bool(match.and_prefix)`.
passive_marker = (
    Marker("is")
    | Marker("are")
    | Marker("was")
    | Marker("were")
    | Marker("and").setResultsName("and_prefix").setParseAction(
        lambda _match: True)
)


# A bare "and", converted into a tokens.AndToken
and_token = Marker("and").setParseAction(lambda _match: tokens.AndToken())


# Verbs
def generate_verb(word_list, verb, active):
    """Short hand for making tokens.Verb from a list of trigger words.

    :param word_list: trigger words; each becomes a CaselessLiteral and the
        alternatives are tried in the order given. Must not be empty
        (``reduce`` has no initial value to fall back on).
    :param verb: passed through as the first argument of tokens.Verb
    :param active: passed through as the second argument of tokens.Verb;
        when falsy, every trigger word must be preceded by ``passive_marker``
    :returns: the combined grammar, wrapped in WordBoundaries, whose parse
        action produces the tokens.Verb
    """
    literals = [CaselessLiteral(word) for word in word_list]
    if not active:
        literals = [passive_marker + literal for literal in literals]
    # Fold the alternatives together left to right: a | b | c | ...
    alternatives = reduce(lambda left, right: left | right, literals)
    # The third Verb argument is True only when the parse results carry a
    # truthy `and_prefix` (set by the "and" alternative of passive_marker)
    return WordBoundaries(alternatives).setParseAction(
        lambda match: tokens.Verb(verb, active, bool(match.and_prefix)))
# One grammar per verb/voice pair; the word lists are the trigger words
put_active = generate_verb(
    ['revising', 'revise', 'correcting', 'correct'],
    tokens.Verb.PUT, active=True)
put_passive = generate_verb(
    ['revised', 'corrected'],
    tokens.Verb.PUT, active=False)

post_active = generate_verb(
    ['adding', 'add'], tokens.Verb.POST, active=True)
post_passive = generate_verb(
    ['added'], tokens.Verb.POST, active=False)

delete_active = generate_verb(
    ['removing', 'remove'], tokens.Verb.DELETE, active=True)
delete_passive = generate_verb(
    ['removed'], tokens.Verb.DELETE, active=False)

move_active = generate_verb(
    ['redesignating', 'redesignate'], tokens.Verb.MOVE, active=True)
move_passive = generate_verb(
    ['redesignated'], tokens.Verb.MOVE, active=False)

designate_active = generate_verb(
    ['designate'], tokens.Verb.DESIGNATE, active=True)

reserve_active = generate_verb(
    ['reserve', 'reserving'], tokens.Verb.RESERVE, active=True)

# Literal marker text that maps directly onto an (active) INSERT verb
insert_in_order = Literal("[insert-in-order]").setParseAction(
    lambda match: tokens.Verb(tokens.Verb.INSERT, active=True))


# Context
# Optional leading preposition; the context parse actions below pass
# bool(match.certain) as the second argument of tokens.Context
context_certainty = Optional(
    Marker("in")
    | Marker("to")
    | Marker("of")
    | (Marker("under") + Optional(Marker("subheading")))
).setResultsName("certain")

interp = (
    context_certainty + atomic.comment_marker + unified.marker_part
).setParseAction(
    lambda match: tokens.Context(
        [match.part, 'Interpretations'], bool(match.certain)))

# This may be a regtext paragraph or it may be an interpretation
paragraph_context = (
    atomic.section + unified.depth1_p + ~FollowedBy("-")
).setParseAction(
    lambda match: tokens.Context([
        None, None, match.section,
        match.p1, match.p2, match.p3, match.p4,
        match.plaintext_p5, match.plaintext_p6]))


def _paren_join(elements):
    """Wrap each truthy element in parentheses and concatenate them, e.g.
    ['a', None, '1'] becomes '(a)(1)'. An input with no truthy elements
    yields '()'."""
    kept = [element for element in elements if element]
    return '(' + ')('.join(kept) + ')'


def _paren_from_match(match):
    """Apply _paren_join to the p1-p4 and plaintext_p5/plaintext_p6 results
    of a match."""
    return _paren_join([
        match.p1, match.p2, match.p3, match.p4,
        match.plaintext_p5, match.plaintext_p6])


marker_subpart = (
    context_certainty + unified.marker_subpart
).setParseAction(
    lambda match: tokens.Context(
        [None, 'Subpart:' + match.subpart], bool(match.certain)))

comment_context_with_section = (
    context_certainty +
    # Confusingly, these are sometimes "comments", sometimes "paragraphs"
    (Marker("comment") | Marker("paragraph")) +
    atomic.section +
    unified.depth1_p +
    ~FollowedBy("-")
).setParseAction(
    lambda match: tokens.Context(
        [None, 'Interpretations', match.section, _paren_from_match(match)],
        bool(match.certain)))

# Mild modification of the above; catches "under 2(b)"
comment_context_under_with_section = (
    Marker("under") + atomic.section + unified.depth1_p
).setParseAction(
    lambda match: tokens.Context(
        [None, 'Interpretations', match.section, _paren_from_match(match)],
        True))

comment_context_without_section = (
    context_certainty + atomic.paragraph_marker + unified.depth2_p
).setParseAction(
    lambda match: tokens.Context(
        [None, 'Interpretations', None,
         _paren_join([match.p2, match.p3, match.p4,
                      match.plaintext_p5, match.plaintext_p6])],
        bool(match.certain)))

appendix = (
    context_certainty +
    unified.marker_appendix +
    Optional(Marker("to") + unified.marker_part)
).setParseAction(
    lambda match: tokens.Context(
        [match.part, 'Appendix:' + match.appendix], bool(match.certain)))

section = (
    context_certainty + atomic.section_marker + unified.part_section
).setParseAction(
    lambda match: tokens.Context(
        [match.part, None, match.section], bool(match.certain)))


# Paragraph components (used when not replacing the whole paragraph)
section_heading = Marker("heading").setParseAction(
    lambda _match: tokens.Paragraph(
        [], field=tokens.Paragraph.HEADING_FIELD))

# Work on a copy so the parse action is not attached to the shared
# intro_text_marker expression itself
intro_text = intro_text_marker.copy().setParseAction(
    lambda _match: tokens.Paragraph([], field=tokens.Paragraph.TEXT_FIELD))


# Paragraphs
# Digits, optionally followed by ".<chars from ivxlcdm>" and then
# ".<uppercase letters>"; captured as level2 / level3 / level4
comment_p = (
    Word(string.digits).setResultsName("level2") +
    Optional(
        Suppress(".") +
        Word("ivxlcdm").setResultsName('level3') +
        Optional(
            Suppress(".") +
            Word(string.ascii_uppercase).setResultsName("level4"))))

section_heading_of = (
    Marker("heading") + of_connective + unified.marker_part_section
).setParseAction(
    lambda match: tokens.Paragraph.make(
        part=match.part, section=match.section,
        field=tokens.Paragraph.HEADING_FIELD))

# NOTE(review): this parse action reads `p5`, whereas most others in this
# module read `plaintext_p5`/`plaintext_p6` -- confirm that is intended
section_paragraph_heading_of = (
    Marker("heading") +
    of_connective +
    (atomic.paragraph_marker | Marker("comment")) +
    atomic.section +
    unified.depth1_p
).setParseAction(
    lambda match: tokens.Paragraph.make(
        is_interp=True, section=match.section,
        paragraphs=[_paren_join([match.p1, match.p2, match.p3, match.p4,
                                 match.p5])],
        field=tokens.Paragraph.HEADING_FIELD))

appendix_subheading = (
    Marker("subheading") + unified.marker_appendix
).setParseAction(
    # Use '()' to pad the label out to what's expected of interpretations
    lambda match: tokens.Paragraph.make(
        is_interp=True, section=match.appendix, paragraphs=['()'],
        field=tokens.Paragraph.HEADING_FIELD))

paragraph_heading_of = (
    Marker("heading") + of_connective + unified.marker_paragraph.copy()
).setParseAction(
    lambda match: tokens.Paragraph.make(
        paragraphs=[match.p1, match.p2, match.p3, match.p4,
                    match.plaintext_p5, match.plaintext_p6],
        field=tokens.Paragraph.KEYTERM_FIELD))

# NOTE(review): as with section_paragraph_heading_of, this reads `p5` rather
# than `plaintext_p5`/`plaintext_p6` -- confirm that is intended
comment_heading = (
    Marker("heading") +
    Optional(of_connective) +
    atomic.section +
    unified.depth1_p
).setParseAction(
    lambda match: tokens.Paragraph.make(
        is_interp=True, section=match.section,
        paragraphs=[_paren_join([match.p1, match.p2, match.p3, match.p4,
                                 match.p5])],
        field=tokens.Paragraph.HEADING_FIELD))

# e.g. "introductory text of paragraph (a)(5)(ii)"
intro_text_of = (
    intro_text_marker +
    of_connective +
    atomic.paragraph_marker +
    unified.depth1_p
).setParseAction(
    lambda match: tokens.Paragraph.make(
        paragraphs=[match.p1, match.p2, match.p3, match.p4,
                    match.plaintext_p5, match.plaintext_p6],
        field=tokens.Paragraph.TEXT_FIELD))

intro_text_of_interp = (
    intro_text_marker +
    of_connective +
    atomic.paragraph_marker +
    comment_p
).setParseAction(
    lambda match: tokens.Paragraph.make(
        is_interp=True,
        paragraphs=[None, match.level2, match.level3, match.level4],
        field=tokens.Paragraph.TEXT_FIELD))

# The field is TEXT_FIELD only when the final result is 'text', i.e. when the
# optional intro_text_marker matched
single_par = (
    unified.marker_paragraph + Optional(intro_text_marker)
).setParseAction(
    lambda match: tokens.Paragraph.make(
        paragraphs=[match.p1, match.p2, match.p3, match.p4,
                    match.plaintext_p5, match.plaintext_p6],
        field=(tokens.Paragraph.TEXT_FIELD
               if match[-1] == 'text' else None)))

section_single_par = (
    unified.marker_part_section +
    unified.depth1_p +
    Optional(intro_text_marker)
).setParseAction(
    lambda match: tokens.Paragraph.make(
        part=match.part, section=match.section,
        paragraphs=[match.p1, match.p2, match.p3, match.p4,
                    match.plaintext_p5, match.plaintext_p6],
        field=(tokens.Paragraph.TEXT_FIELD
               if match[-1] == 'text' else None)))

# Matches "paragraph (a)(1)(i) of § 12.44"
single_par_section = (
    Optional(atomic.paragraph_marker) +
    unified.depth1_p +
    of_connective +
    unified.marker_part_section
).setParseAction(
    lambda match: tokens.Paragraph.make(
        part=match.part, section=match.section,
        paragraphs=[match.p1, match.p2, match.p3, match.p4,
                    match.plaintext_p5, match.plaintext_p6]))

single_comment_with_section = (
    (Marker("comment") | Marker("paragraph")) +
    atomic.section +
    unified.depth1_p +
    "-" +
    Optional("(") +
    comment_p +
    Optional(")")
).setParseAction(
    lambda match: tokens.Paragraph.make(
        is_interp=True, section=match.section,
        paragraphs=[_paren_from_match(match),
                    match.level2, match.level3, match.level4]))

single_comment_par = (
    atomic.paragraph_marker + comment_p
).setParseAction(
    lambda match: tokens.Paragraph.make(
        is_interp=True,
        paragraphs=[None, match.level2, match.level3, match.level4]))


# Token Lists
def make_multiple(to_repeat):
    """Shorthand for handling repeated tokens ('and', ',', 'through').

    :param to_repeat: grammar for a single item of the list
    :returns: a grammar matching one item (results name "head") followed by
        one or more ``atomic.conj_phrases``-separated items (each collected
        under the results name "tail"). Every item may be followed by an
        optional ``intro_text_marker``.
    """
    head = (to_repeat + Optional(intro_text_marker)).setResultsName("head")
    # listAllMatches keeps every repetition rather than only the last one
    tail = (
        atomic.conj_phrases + to_repeat + Optional(intro_text_marker)
    ).setResultsName("tail", listAllMatches=True)
    return head + OneOrMore(tail)
def _through_paren(prev_lab, next_lab):
    """Expand "through" for labels with embedded paragraphs (e.g. 12(c)).

    Returns the tokens.Paragraph entries strictly between the two labels, or
    an empty list (after logging a warning) when they cannot be expanded.
    """
    lhs, rhs = prev_lab[-1], next_lab[-1]
    lhs_idx, rhs_idx = lhs.rindex('('), rhs.rindex('(')
    # Check if the previous and next labels are "through"-able. For example,
    # we can't compute A-14(a)(2) through B-14(a)(4) nor can we compute
    # A-14(a)(1) through A-14(b)(3)
    same_stem = lhs[:lhs_idx] == rhs[:rhs_idx]
    same_parent = prev_lab[:-1] == next_lab[:-1]
    if not (same_stem and same_parent):
        logger.warning("Bad use of 'through': %s %s", prev_lab, next_lab)
        return []

    # Everything up to and including the final "(" is shared by both labels
    prefix = lhs[:lhs_idx + 1]
    first, last = lhs[lhs_idx + 1:-1], rhs[rhs_idx + 1:-1]
    parent = prev_lab[:-1]
    for level in p_levels:
        if first in level and last in level:
            low, high = level.index(first), level.index(last)
            if low < high:
                return [
                    tokens.Paragraph.make(parent + [prefix + level[i] + ')'])
                    for i in range(low + 1, high)]
    # No level contained both markers in ascending order
    logger.warning("Error with 'through': %s %s", prev_lab, next_lab)
    return []


def _through_sect(prev_lab, next_lab):
    """Expand "through" for labels ending in a section number.

    The final components of both labels are converted with ``int``; the
    result holds one tokens.Paragraph per section strictly between them.
    """
    first, last = int(prev_lab[-1]), int(next_lab[-1])
    return [tokens.Paragraph.make(prev_lab[:2] + [str(number)])
            for number in range(first + 1, last)]


def _through_paragraph(prev_lab, next_lab):
    """Expand "through" for labels ending in a paragraph.

    The level is chosen from ``p_levels`` by the length of ``prev_lab``
    (a label of length 4 uses ``p_levels[0]``); the result holds one
    tokens.Paragraph per marker strictly between the two labels.
    """
    depth = len(prev_lab)
    level = p_levels[depth - 4]
    start = level.index(prev_lab[-1]) + 1
    end = level.index(next_lab[-1])
    parent = prev_lab[:depth - 1]
    return [tokens.Paragraph.make(parent + [level[i]])
            for i in range(start, end)]
def make_par_list(listify, force_text_field=False):
    """Shorthand for turning a pyparsing match into a tokens.Paragraph.

    :param listify: callable that converts one sub-match (the "head" or an
        entry of "tail") into the label list given to tokens.Paragraph.make
    :param force_text_field: when true, every paragraph produced has its
        field set to tokens.Paragraph.TEXT_FIELD
    :returns: a parse action that produces a tokens.TokenList
    """
    def _expand_through(prev_lab, next_lab):
        """Paragraphs strictly between two labels joined by "through"."""
        if '(' in prev_lab[-1] and '(' in next_lab[-1]:
            return _through_paren(prev_lab, next_lab)
        if len(prev_lab) == 3:
            return _through_sect(prev_lab, next_lab)
        if len(prev_lab) > 3:
            return _through_paragraph(prev_lab, next_lab)
        return []

    def curried(match=None):
        collected = []
        for piece in [match.head] + list(match.tail):
            paragraph = tokens.Paragraph.make(listify(piece))
            label = paragraph.label
            if piece[-1] == 'text' or force_text_field:
                paragraph = attr.assoc(
                    paragraph, field=tokens.Paragraph.TEXT_FIELD)
            if piece.through:
                # Fill in the paragraphs between the previous entry and
                # this one before adding this one
                collected.extend(
                    _expand_through(collected[-1].label, label))
            collected.append(paragraph)
        return tokens.TokenList(collected)

    return curried
# Each "multiple_*" grammar pairs make_multiple (head/tail matching) with
# make_par_list; the lambda builds the label list for one sub-match
multiple_sections = (
    atomic.sections_marker + make_multiple(unified.part_section)
).setParseAction(
    make_par_list(lambda match: [match.part, None, match.section]))

multiple_paragraph_sections = (
    atomic.section_marker +
    make_multiple(Optional(unified.part_section) + unified.any_depth_p)
).setParseAction(
    make_par_list(lambda match: [
        match.part, None, match.section,
        match.p1, match.p2, match.p3, match.p4,
        match.plaintext_p5, match.plaintext_p6]))

appendix_section = unified.appendix_with_section.copy().setParseAction(
    lambda match: tokens.Paragraph.make(
        appendix=match.appendix, section=match.appendix_section))

appendix_section_heading_of = (
    Marker("heading") + of_connective + unified.appendix_with_section
).copy().setParseAction(
    lambda match: tokens.Paragraph.make(
        appendix=match.appendix, section=match.appendix_section,
        field=tokens.Paragraph.HEADING_FIELD))

multiple_appendices = make_multiple(
    unified.appendix_with_section
).setParseAction(
    make_par_list(lambda match: [
        None, 'Appendix:' + match.appendix, match.appendix_section]))

multiple_comment_pars = (
    atomic.paragraphs_marker + make_multiple(comment_p)
).setParseAction(
    make_par_list(lambda match: [
        None, 'Interpretations', None, None,
        match.level2, match.level3, match.level4]))

# Not a context as one wouldn't list these for contextual purposes
multiple_comments = (
    Marker("comments") + make_multiple(atomic.section + unified.depth1_p)
).setParseAction(
    make_par_list(lambda match: [
        None, 'Interpretations', match.section, _paren_from_match(match)]))

# Builds its own head/tail (rather than using make_multiple) because only the
# first entry carries a section
multiple_interp_entries = (
    Marker("entries") +
    Marker("for") +
    (atomic.section + unified.depth1_p).setResultsName("head") +
    OneOrMore(
        (atomic.conj_phrases + unified.any_depth_p).setResultsName(
            "tail", listAllMatches=True))
).setParseAction(
    make_par_list(lambda match: [
        None, None, match.section,
        match.p1, match.p2, match.p3, match.p4,
        match.plaintext_p5, match.plaintext_p6]))

multiple_paragraphs = (
    (atomic.paragraphs_marker | atomic.paragraph_marker) +
    make_multiple(unified.any_depth_p)
).setParseAction(
    make_par_list(lambda match: [
        match.part, None, match.section,
        match.p1, match.p2, match.p3, match.p4,
        match.plaintext_p5, match.plaintext_p6]))

# e.g. "introductory text of paragraphs (a)(5)(ii) and (d)(5)(ii)"
multiple_intro_text_of = (
    intro_text_marker +
    of_connective +
    atomic.paragraphs_marker +
    make_multiple(unified.any_depth_p)
).setParseAction(
    make_par_list(
        lambda match: [
            None, None, None,
            match.p1, match.p2, match.p3, match.p4,
            match.plaintext_p5, match.plaintext_p6],
        force_text_field=True))
def tokenize_override_ps(match):
    """Create token.Paragraphs for the given override match.

    :param match: parse results whose first two entries are the part and the
        section/appendix, followed by any further label parts; must also
        expose the ``part``, ``section`` and ``appendix`` result names
    :returns: a one-element list holding the tokens.Paragraph built from the
        label
    :raises IndexError: if the match holds more than eight entries in total
        (the label list has eight slots)
    """
    # Part, Section or Appendix, p1, p2, p3, p4, p5, p6
    par_list = [match.part, None, None, None, None, None, None, None]
    if match.section:
        par_list[1] = match.section
    elif match.appendix:
        par_list[1] = "Appendix:" + match.appendix

    # Set paragraph depths. Slots are assigned by position: looking each
    # value up with list.index() finds its *first* occurrence, so a label
    # part equal to an earlier entry (e.g. section "2" and paragraph "2")
    # would be written to the wrong slot and its own slot left empty.
    for position, label_part in enumerate(list(match)[2:], start=2):
        par_list[position] = label_part

    return [tokens.Paragraph.make(par_list)]
# A label part written as keyterm(...); the parenthesised text is replaced by
# "p" followed by its hash_for_paragraph value
_keyterm_label_part = (
    Suppress(Marker("keyterm")) +
    QuotedString(quoteChar='(', endQuoteChar=')')
).setParseAction(
    lambda match: "p{0}".format(hash_for_paragraph(match[0])))

# A plain label part: ASCII letters and digits
_simple_label_part = Word(string.ascii_letters + string.digits)

_label_part = _keyterm_label_part | _simple_label_part

# "[label:<part>-<section or appendix>-<label part>-...]"
override_label = (
    Suppress("[") +
    Marker("label") +
    Suppress(":") +
    atomic.part +
    Suppress("-") +
    (atomic.section | atomic.appendix) +
    ZeroOrMore(Suppress("-") + _label_part) +
    Suppress("]")
).setParseAction(tokenize_override_ps)

# Looks like: [subject-group(Some text Goes Here)]
subject_group = (
    context_certainty +
    Suppress("[subject-group") +
    QuotedString(quoteChar='(', endQuoteChar=')').setResultsName("subgroup") +
    Suppress("]")
).setParseAction(
    lambda match: tokens.Context(
        [None, 'Subjgrp:' + subjgrp_label(match.subgroup, [])],
        bool(match.certain)))

# Phrases like '“Nonimmigrant visa”' become 'p12345678'
_double_quote_label = QuotedString(
    quoteChar=u'“', endQuoteChar=u'”'
).setParseAction(
    lambda match: "p{0}".format(hash_for_paragraph(match[0])))

# Phrases like "definition for the term “Nonimmigrant visa”" become a
# paragraph token with the appropriate paragraph label set
definition = (
    Marker("definition") +
    (Marker("of") | Marker("for")) +
    Optional(Marker("the") + Marker("term")) +
    _double_quote_label.copy().setResultsName("paragraph")
).setParseAction(
    lambda match: tokens.Paragraph.make(paragraphs=[match.paragraph]))


# grammar which captures all of these possibilities. Alternatives are tried
# in the order listed, so the ordering notes below are significant.
token_patterns = QuickSearchable(
    put_active
    | put_passive
    | post_active
    | post_passive
    | delete_active
    | delete_passive
    | move_active
    | move_passive
    | designate_active
    | reserve_active
    | insert_in_order

    | interp
    | marker_subpart
    | appendix
    | comment_context_with_section
    | comment_context_without_section
    | comment_context_under_with_section

    | paragraph_heading_of
    | section_heading_of
    | multiple_intro_text_of
    | intro_text_of
    | appendix_section_heading_of
    | intro_text_of_interp
    | comment_heading
    | appendix_subheading
    | section_paragraph_heading_of
    # Must come after other headings as it is a catch-all
    | section_heading
    | multiple_paragraph_sections
    | section_single_par
    | multiple_interp_entries

    | multiple_sections
    | multiple_paragraphs
    | multiple_appendices
    | multiple_comment_pars
    | multiple_comments
    # Must come after multiple_appendices
    | appendix_section
    # Must come after multiple_pars
    | single_par_section
    | single_par
    # Must come after multiple_comment_pars
    | single_comment_with_section
    | single_comment_par
    # Must come after section_single_par
    | section
    # Must come after intro_text_of
    | intro_text
    | definition
    # Finally allow for an explicit override label
    | override_label
    | subject_group
    | paragraph_context
    | and_token
)

# "<part>-<subpart marker>:<single uppercase letter>" at the end of a line
subpart_label = QuickSearchable(
    atomic.part +
    Suppress('-') +
    atomic.subpart_marker +
    Suppress(':') +
    Word(string.ascii_uppercase, max=1) +
    LineEnd())