Source code for regparser.tree.depth.optional_rules

"""Depth derivation has a mechanism for _optional_ rules. This module contains
a collection of such rules. Every rule is a function which accepts two
parameters: the first is a function which can be used to constrain the
variables; the second is a list of all variables in the system. This allows
us to define rules over subsets of the variables rather than all of them,
should that make our constraints more useful. Some entries in this module
(`limit_paragraph_types`, `limit_sequence_gap`) are factories: they accept
configuration and return a rule with that two-parameter signature."""
from constraint import InSetConstraint

from regparser.tree.depth import markers
from regparser.tree.depth.rules import _level_and_children, ancestors


def depth_type_inverses(constrain, all_variables):
    """Tie paragraph type and paragraph depth to one another: paragraphs
    which sit at the same depth must be of the same type, and paragraphs of
    the same type must sit at the same depth. Stars and markerless
    paragraphs are exempt from the comparison.

    `constrain` is called once per paragraph, with the checking function and
    that paragraph's three variables followed by the variables of every
    paragraph before it."""
    def agrees_with_earlier(typ, idx, depth, *all_prev):
        wildcards = (markers.stars, markers.markerless)
        if typ in wildcards:
            return True

        def clashes(other_typ, other_depth):
            same_type = other_typ == typ
            same_depth = other_depth == depth
            # Shares our depth but is a different, non-wildcard type
            if same_depth and not same_type and other_typ not in wildcards:
                return True
            # Shares our type but lives at another depth
            return same_type and not same_depth

        # The earlier assignments arrive flattened; regroup them in threes
        earlier = (all_prev[pos:pos + 3]
                   for pos in range(0, len(all_prev), 3))
        return not any(clashes(other_typ, other_depth)
                       for other_typ, _, other_depth in earlier)

    for start in range(0, len(all_variables), 3):
        current = all_variables[start:start + 3]
        preceding = all_variables[:start]
        constrain(agrees_with_earlier, current + preceding)
def star_new_level(constrain, all_variables):
    """A paragraph may not sit exactly one level beneath a STARS marker which
    immediately precedes it; with the stars standing in for unknown content
    we could not tell where such a subparagraph belongs in the hierarchy.

    Each adjacent pair of paragraphs is constrained on its own.

    @todo: This _probably_ should be a general rule, but there's a test that
    this breaks in the interpretations. Revisit with CFPB regs"""
    def not_nested_under_stars(prev_typ, prev_depth, typ, depth):
        if prev_typ != markers.stars:
            return True
        return depth != prev_depth + 1

    # Begin with the second paragraph; the first has no predecessor
    for current in range(3, len(all_variables), 3):
        previous = current - 3
        pair = [
            all_variables[previous],        # previous type
            all_variables[previous + 2],    # previous depth
            all_variables[current],         # current type
            all_variables[current + 2],     # current depth
        ]
        constrain(not_nested_under_stars, pair)
def stars_occupy_space(constrain, all_variables):
    """Star markers can't be ignored in sequence, so 1, *, 2 doesn't make
    sense for a single level, unless it's an inline star. In the inline
    case, we can think of it as 1, intro-text-to-1, 2

    Registers a single constraint spanning every variable in
    `all_variables`. The assigned values are regrouped into 3-tuples before
    being checked level by level."""
    def per_level(elements):
        # Presumably `level` holds the elements of the current level and
        # `grouped_children` the elements nested beneath each of them --
        # verify against `_level_and_children`
        level, grouped_children = _level_and_children(elements)

        if not level:
            return True     # Base Case

        # last_idx is the highest index consumed so far on this level; -1
        # means nothing has been seen yet
        last_idx, last_typ = -1, None
        for typ, idx, _ in level:
            if typ == markers.stars:
                if idx == 0:    # STARS_TAG, not INLINE_STARS
                    # The stars take up a slot of their own, so whatever
                    # follows must skip past it
                    last_idx += 1
            # sequences must be increasing. Exception for markerless
            elif (last_idx >= idx and
                  markers.markerless not in (last_typ, typ)):
                return False
            else:
                last_idx = idx
            # NOTE(review): placed at loop level (runs for every element,
            # stars included); the indentation was reconstructed from a
            # flattened listing -- confirm against the canonical source
            last_typ = typ

        for children in grouped_children:   # Recurse
            if not per_level(children):
                return False
        return True

    def inner(*all_vars):
        # The solver hands over one flat run of values; rebuild the 3-tuples
        elements = [tuple(all_vars[i:i + 3])
                    for i in range(0, len(all_vars), 3)]
        return per_level(elements)

    constrain(inner, all_variables)
def limit_paragraph_types(*p_types):
    """Constrain paragraphs to a limited set of paragraph types. This can
    shrink the search space when we know (for example) that the text comes
    from regulations and hence has no capitalized roman numerals.

    Returns a rule which accepts `constrain` and `all_variables` and
    restricts every type variable to `p_types`."""
    def constrainer(constrain, all_variables):
        # Every third variable, starting with the first, is a type
        type_variables = list(all_variables[0::3])
        allowed = InSetConstraint(p_types)
        constrain(allowed, type_variables)

    return constrainer
def limit_sequence_gap(size=0):
    """The rules around sequences of paragraphs have been loosened so that
    paragraphs may be skipped. This allows that rule to be tightened again by
    an arbitrary amount: at most `size` markers may be missing between two
    consecutive paragraphs of the same type.

    Returns a rule which accepts `constrain` and `all_variables`."""
    gap_size = size + 1   # we'll always want the difference to be >= 1

    def within_gap(typ, idx, depth, *all_prev):
        ancestor_markers = ancestors(all_prev)
        # Nothing has been recorded at this depth yet, so there is no
        # sequence to measure against
        if depth >= len(ancestor_markers):
            return True

        # Continuing a sequence or becoming more shallow: look up the
        # previous marker at this depth
        prev_typ, prev_idx, _ = ancestor_markers[depth]
        if prev_typ != typ:
            return True
        # Stars and markerless paragraphs are never measured
        if typ in (markers.stars, markers.markerless):
            return True
        return 0 < idx - prev_idx <= gap_size

    def constrainer(constrain, all_variables):
        for start in range(0, len(all_variables), 3):
            current = all_variables[start:start + 3]
            preceding = all_variables[:start]
            constrain(within_gap, current + preceding)

    return constrainer