Source code for regparser.layer.terms

# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import re
from collections import defaultdict, namedtuple

import inflection

from regparser.layer import def_finders
from regparser.layer.layer import Layer
from regparser.layer.scope_finder import ScopeFinder
from regparser.tree import struct
from regparser.tree.priority_stack import PriorityStack
from regparser.web.settings import parser as settings

# Drop inflection's "person" -> "people" pluralization rule, if it is present.
# (Presumably so that regulatory text keeps the form "persons" -- confirm.)
# `key` is left as a module-level name, exactly as before.
try:
    key = ('(?i)(p)erson$', '\\1eople')
    inflection.PLURALS.remove(key)
except ValueError:
    # The rule is not in the list (e.g. it was already removed); nothing to do.
    pass


# Definitions whose term text is longer than this many characters are
# discarded when collecting a node's definitions.
MAX_TERM_LENGTH = 100

# The singular and plural spellings of one term, in that order.
Inflected = namedtuple('Inflected', 'singular plural')


class ParentStack(PriorityStack):
    """Tracks the chain of ancestor nodes while the tree is walked for
    term definitions, since a definition may need to look at its parents
    to work out its scope."""

    def unwind(self):
        """Just pop; no collapsing needs to happen."""
        self.pop()

    def parent_of(self, node):
        """Return the last entry stored one depth above `node`, or None
        when `peek_level` yields nothing (or an empty level) for that
        depth."""
        level_above = self.peek_level(node.depth() - 1)
        if not level_above:
            return None
        return level_above[-1]
class Terms(Layer):
    """Layer which records where terms are defined (`layer['referenced']`)
    and, for each processed node, the offsets at which applicable defined
    terms are used."""
    shorthand = 'terms'
    STARTS_WITH_WORDCHAR = re.compile(r'^\w.*$')
    ENDS_WITH_WORDCHAR = re.compile(r'^.*\w$')

    def __init__(self, *args, **kwargs):
        Layer.__init__(self, *args, **kwargs)
        self.layer['referenced'] = {}
        # scope -> list of definition references. Each reference is used
        # below through its .term, .label, .position and .start attributes.
        # The special key 'EXCLUDED' collects rejected definitions.
        self.scoped_terms = defaultdict(list)
        self.scope_finder = ScopeFinder()
        # term -> Inflected(singular, plural); memo for `inflected`
        self._inflected = {}

    def inflected(self, term):
        """Return the memoized Inflected (singular, plural) version of the
        provided term, computing it on first use."""
        if term not in self._inflected:
            self._inflected[term] = Inflected(
                inflection.singularize(term), inflection.pluralize(term))
        return self._inflected[term]

    def look_for_defs(self, node, stack=None):
        """Check a node and recursively check its children for terms which
        are being defined. Add these definitions to self.scoped_terms.

        :param node: tree node to inspect; its children are visited too
        :param stack: ParentStack shared across the recursion; a new one is
            created when omitted"""
        # Explicit None check: an `or` would replace a caller-supplied stack
        # that merely evaluates as falsy.
        if stack is None:
            stack = ParentStack()
        stack.add(node.depth(), node)
        if node.node_type in (struct.Node.REGTEXT, struct.Node.SUBPART,
                              struct.Node.EMPTYPART):
            included, excluded = self.node_definitions(node, stack)
            if included:
                for scope in self.scope_finder.determine_scope(stack):
                    self.scoped_terms[scope].extend(included)
            self.scoped_terms['EXCLUDED'].extend(excluded)

        for child in node.children:
            self.look_for_defs(child, stack)

    def pre_process(self):
        """Step through every node in the tree, finding definitions. Also
        keep track of which subpart we are in. Finally, document all
        defined terms in self.layer['referenced']."""
        self.scope_finder.add_subparts(self.tree)
        self.look_for_defs(self.tree)

        referenced = self.layer['referenced']
        for refs in self.scoped_terms.values():
            for ref in refs:
                key = ref.term + ":" + ref.label
                # Record new terms; for a repeated key keep whichever
                # definition starts earlier in the paragraph.
                if (key not in referenced or
                        ref.start < referenced[key]['position'][0]):
                    referenced[key] = {
                        'term': ref.term,
                        'reference': ref.label,
                        'position': ref.position
                    }

    def applicable_terms(self, label):
        """Find all terms that might be applicable to nodes with this
        label. Note that we don't have to deal with subparts as
        subpart_scope simply applies the definition to all sections in a
        subpart.

        :param label: sequence of label segments
        :returns: dict of term -> definition reference"""
        applicable_terms = {}
        # Walk from the shortest label prefix to the full label, so that a
        # definition from a longer (more specific) scope overwrites one
        # from a shorter scope.
        for segment_length in range(1, len(label) + 1):
            scope = tuple(label[:segment_length])
            # .get() avoids creating empty entries in the defaultdict
            for ref in self.scoped_terms.get(scope, []):
                applicable_terms[ref.term] = ref    # overwrites
        return applicable_terms

    def is_exclusion(self, term, node):
        """Some definitions are exceptions/exclusions of a previously
        defined term. At the moment, we do not want to include these as
        they would replace previous (correct) definitions. We also remove
        terms which are inside an instance of the IGNORE_DEFINITIONS_IN
        setting.

        :returns: True if the definition of `term` found in `node` should
            be rejected"""
        applicable_terms = self.applicable_terms(node.label)
        if term in applicable_terms:
            regex = 'the term .?' + re.escape(term) + '.? does not include'
            if re.search(regex, node.text.lower()):
                return True
        for start, end in self.ignored_offsets(node.label[0], node.text):
            if term in node.text[start:end]:
                return True
        return False

    def node_definitions(self, node, stack=None):
        """Find defined terms in this node's text.

        :param node: node whose text is searched
        :param stack: ParentStack used by the finders which need parent
            context; a new one is created when omitted
        :returns: pair of lists `(included, excluded)` of definition
            references, split by `is_exclusion`"""
        references = []
        if stack is None:
            stack = ParentStack()
        for finder in (def_finders.ExplicitIncludes(),
                       def_finders.SmartQuotes(stack),
                       def_finders.ScopeMatch(self.scope_finder),
                       def_finders.XMLTermMeans(references),
                       def_finders.DefinitionKeyterm(stack.parent_of(node))):
            # Note that `extend` is very important as XMLTermMeans uses the
            # list reference
            references.extend(finder.find(node))

        included, excluded = [], []
        for ref in references:
            if len(ref.term) > MAX_TERM_LENGTH:
                continue
            # Evaluate is_exclusion once per reference rather than once for
            # each output list.
            if self.is_exclusion(ref.term, node):
                excluded.append(ref)
            else:
                included.append(ref)
        return (included, excluded)

    def process(self, node):
        """Determine which (if any) definitions would apply to this node,
        then find if any of those terms appear in this node.

        :returns: list of dicts with a "ref" key ("<term>:<label>") and an
            "offsets" key, ordered by the matched term text"""
        applicable_terms = self.applicable_terms(node.label)

        layer_el = []
        # Remove any definitions defined in this paragraph
        term_list = [
            (term, ref) for term, ref in applicable_terms.items()
            if ref.label != node.label_id()]

        exclusions = self.excluded_offsets(node)

        matches = self.calculate_offsets(node.text, term_list, exclusions)
        matches = sorted(matches, key=lambda triplet: triplet[0])
        for _, ref, offsets in matches:
            layer_el.append({
                "ref": ref.term + ':' + ref.label,
                "offsets": offsets
            })
        return layer_el

    def _word_matches(self, term, text):
        """Return the start and end indexes of the term within the text,
        accounting for word boundaries. The match is case-sensitive."""
        # @todo - this is rather slow -- probably want to memoize the results
        regex = re.escape(term)
        # Only anchor with \b on a side where the term has a word
        # character; \b next to a non-word character would require a word
        # character on the other side.
        if self.STARTS_WITH_WORDCHAR.match(term):
            regex = r'\b' + regex
        if self.ENDS_WITH_WORDCHAR.match(term):
            regex += r'\b'
        regex = re.compile(regex)
        return [(match.start(), match.end())
                for match in regex.finditer(text)]

    def ignored_offsets(self, cfr_part, text):
        """Return a list of offsets corresponding to the presence of an
        "ignored" phrase in the text. Phrases come from the
        IGNORE_DEFINITIONS_IN setting: the 'ALL' entry plus the entry for
        `cfr_part`."""
        ignored_phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
                           settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
        positions = []
        for phrase in ignored_phrases:
            positions.extend(self._word_matches(phrase, text))
        return positions

    def excluded_offsets(self, node):
        """We explicitly exclude certain chunks of text (for example, words
        we are defining shouldn't have links appear within the defined
        term.) More will be added in the future.

        :returns: list of (start, end) offsets within node.text"""
        exclusions = []
        # Positions of every definition located in this very node
        for reflist in self.scoped_terms.values():
            exclusions.extend(
                ref.position for ref in reflist
                if ref.label == node.label_id())
        exclusions.extend(self.ignored_offsets(node.label[0], node.text))
        return exclusions

    def calculate_offsets(self, text, applicable_terms, exclusions=None,
                          inclusions=None):
        """Search for defined terms in this text, including singular and
        plural forms of these terms, with a preference for all larger
        (i.e. containing) terms.

        :param text: text to search; matching is done on text.lower()
        :param applicable_terms: iterable of (term, ref) pairs
        :param exclusions: (start, end) offsets which matches may not
            start or end inside; the passed list is not modified
        :param inclusions: accepted for backwards compatibility; unused
        :returns: list of (term, ref, offsets) triplets, one for each
            search term with at least one permitted match"""
        # don't modify the original
        exclusions = list(exclusions or [])
        # Lower-case once rather than once per search term
        lowered = text.lower()

        # add singulars and plurals to search terms
        search_terms = {(inflected, t[1])
                        for t in applicable_terms
                        for inflected in self.inflected(t[0])}

        # longer terms first; equal-length terms alphabetically so the
        # outcome does not depend on set iteration order
        search_terms = sorted(search_terms,
                              key=lambda pair: (-len(pair[0]), pair[0]))

        matches = []
        for term, ref in search_terms:
            re_term = r'\b' + re.escape(term) + r'\b'
            offsets = [(m.start(), m.end())
                       for m in re.finditer(re_term, lowered)]
            safe_offsets = []
            for start, end in offsets:
                # NOTE(review): a match which strictly contains an
                # exclusion passes both checks below -- confirm intended.
                # Start is contained in an existing def
                if any(e[0] <= start <= e[1] for e in exclusions):
                    continue
                # End is contained in an existing def
                if any(e[0] <= end <= e[1] for e in exclusions):
                    continue
                safe_offsets.append((start, end))
            if not safe_offsets:
                continue

            # Shorter terms tried later may not overlap these matches
            exclusions.extend(safe_offsets)
            matches.append((term, ref, safe_offsets))
        return matches