# Source code for regparser.layer.def_finders

# -*- coding: utf-8 -*-
"""Parsers for finding a term that's being defined within a node"""
import abc
import re
from collections import namedtuple
from itertools import chain

import six
from pyparsing import ParseException

from regparser.citations import Label
from regparser.grammar import terms as grammar
from regparser.tree.struct import Node
from regparser.web.settings import parser as settings


class Ref(namedtuple('Ref', ['term', 'label', 'start'])):
    """A reference to a defined term. Keeps track of the term, where it was
    found and the term's position in that node's text.

    Fields:
        term: the defined term; always stored as lower-cased text
        label: identifies where the term was found (the finders in this
            module pass ``node.label_id()``)
        start: offset of the term within the node's text
    """
    def __new__(cls, term, label, start):
        # Coerce to text and lower-case once, at construction, so every Ref
        # carries its term in a single normalized form
        term = six.text_type(term).lower()
        return super(Ref, cls).__new__(cls, term, label, start)

    @property
    def end(self):
        """Offset just past the term: ``start`` plus the term's length"""
        return self.start + len(self.term)

    @property
    def position(self):
        """The ``(start, end)`` pair for this term"""
        return (self.start, self.end)


class FinderBase(six.with_metaclass(abc.ABCMeta)):
    """Base class for all of the definition finder classes. Defines the
    interface they must implement"""
    @abc.abstractmethod
    def find(self, node):
        """Given a Node, pull out any definitions it may contain as a list of
        Refs. Subclasses must override this method."""
        raise NotImplementedError()


class ExplicitIncludes(FinderBase):
    """Definitions can be explicitly included in the settings. For example,
    say that a paragraph doesn't indicate that a certain phrase is a
    definition; we can define INCLUDE_DEFINITIONS_IN in our settings file,
    which will be checked here."""
    def find(self, node):
        """Return a Ref for each configured (term, context) pair where both
        the context and the term appear in the node's text.

        INCLUDE_DEFINITIONS_IN is read as a mapping whose values are lists of
        (term, context) pairs. Entries under "ALL" are always checked; the
        entries keyed by the first component of the node's label are
        checked in addition."""
        cfr_part = node.label[0] if node.label else None
        # list() makes a copy so that extending it never mutates the settings
        included = list(settings.INCLUDE_DEFINITIONS_IN.get("ALL", []))
        included.extend(settings.INCLUDE_DEFINITIONS_IN.get(cfr_part, []))

        text = node.text
        refs = []
        for included_term, context in included:
            if context in text and included_term in text:
                # Position is that of the first occurrence of the term
                pos_start = text.index(included_term)
                refs.append(Ref(included_term, node.label_id(), pos_start))
        return refs


class SmartQuotes(FinderBase):
    """Definitions indicated via smart quotes"""
    def __init__(self, stack):
        """Stack (which references ancestors of a node) is used to determine
        whether or not to apply smart quotes"""
        self.stack = stack

    def find(self, node):
        """Return a Ref for every smart-quoted phrase in the node's text, but
        only when a stack was provided and has_def_indicator() passes;
        otherwise return an empty list."""
        refs = []
        if self.stack and self.has_def_indicator():
            for match, _, _ in grammar.smart_quotes.scanString(node.text):
                # Drop punctuation that was captured at the edges of the term
                term = match.term[0].strip(',.;')
                refs.append(Ref(term, node.label_id(), match.term.pos[0]))
        return refs

    def has_def_indicator(self):
        """With smart quotes, we catch some false positives, phrases in quotes
        that are not terms. This extra test lets us know that a parent of the
        node looks like it would contain definitions.

        Returns True as soon as any node in the stack's lineage mentions
        'Definition' (case-sensitive) in its text or title, or has text
        matching 'the term ... means/refers to' or a smart-quoted phrase
        followed by 'means'/'refers to' (both case-insensitive)."""
        for node in self.stack.lineage():
            # Cheap substring checks first; the regexes only run if needed
            if 'Definition' in node.text:
                return True
            if 'Definition' in (node.title or ''):
                return True
            lower_text = node.text.lower()
            if re.search('the term .* (means|refers to)', lower_text):
                return True
            if re.search(u'“[^”]+” (means|refers to)', lower_text):
                return True
        return False


class ScopeMatch(FinderBase):
    """We know these will be definitions because the scope of the definition
    is spelled out. E.g. 'for the purposes of XXX, the term YYY means'"""
    def __init__(self, finder):
        """Finder is an instance of ScopeFinder"""
        self.finder = finder

    def find(self, node):
        """Return a Ref for each scope/term phrase in the node's text where
        the finder accepts the scope and the term consists solely of
        lower-case ASCII letters and spaces."""
        refs = []
        label = node.label_id()
        for match, _, _ in grammar.scope_term_type_parser.scanString(
                node.text):
            valid_scope = self.finder.scope_of_text(
                match.scope, Label.from_node(node), verify_prefix=False)
            valid_term = re.match("^[a-z ]+$", match.term[0])
            if valid_scope and valid_term:
                term = match.term[0].strip()
                # The parsed term may carry leading whitespace; search from
                # the parser's position to find where the stripped term
                # really begins
                pos_start = node.text.index(term, match.term.pos[0])
                refs.append(Ref(term, label, pos_start))
        return refs


class XMLTermMeans(FinderBase):
    """Namespace for a matcher for e.g. '<E>XXX</E> means YYY'"""
    def __init__(self, existing_refs=None):
        """Existing refs will be used to exclude certain matches"""
        if existing_refs is None:
            existing_refs = []
        # Copy so that refs recorded by find() never leak into the caller's
        # list
        self.exclusions = list(existing_refs)

    def find(self, node):
        """Return a Ref for every term matched by the XML term grammar in
        the node's tagged text. Each located Ref is also recorded as an
        exclusion so that later terms are not placed on top of it."""
        refs = []
        tagged_text = node.tagged_text
        label = node.label_id()
        for match, _, _ in grammar.xml_term_parser.scanString(tagged_text):
            # Position in match reflects XML tags, so it's dropped in
            # preference of new values based on node.text.
            for submatch in chain([match.head], match.tail):
                pos_start = self.pos_start(submatch.term[0], node.text)
                term_begin, term_end = submatch.term.pos[0:2]
                term = tagged_text[term_begin:term_end]
                ref = Ref(term, label, pos_start)
                refs.append(ref)
                # A ref that could not be located (start of -1) has no real
                # span; recording it would wrongly block the beginning of
                # the text for every later search
                if pos_start >= 0:
                    self.exclusions.append(ref)
        return refs

    def pos_start(self, needle, haystack):
        """Search for the first instance of `needle` in the `haystack`
        excluding any overlaps from `self.exclusions`. Returns -1 if no
        such instance exists.

        A candidate offset is skipped when it lies within an exclusion's
        span, where both `start` and `end` are treated as inclusive."""
        start = haystack.find(needle)
        # Stop as soon as the haystack is exhausted. The previous version
        # tested a -1 result against the exclusions, which could reset the
        # search to the beginning and loop forever.
        while start >= 0:
            if not any(r.start <= start <= r.end for r in self.exclusions):
                return start
            start = haystack.find(needle, start + 1)
        return -1


class DefinitionKeyterm(object):
    """Matches definitions identified by being a first-level paragraph in a
    section with a specific title"""
    _NORMALIZE_RE = re.compile(r'[^a-z]+')
    # NOTE(review): titles are compared for exact equality after
    # normalization, so a plural title such as "Definitions" does not match
    # 'definition' -- confirm this is intended
    _section_titles = ['definition', 'meaningofterms']  # already normalized

    def __init__(self, parent):
        """Record whether `parent` is a regtext node with a two-part label
        whose normalized title is one of `_section_titles`. `parent` may be
        None, in which case nothing will ever match."""
        is_section = bool(
            parent and
            parent.node_type == Node.REGTEXT and
            len(parent.label) == 2)
        title = self._normalize(parent.title) if parent else None
        # Always a real bool, never None or the parent itself
        self.title_matches = is_section and title in self._section_titles

    @classmethod
    def _normalize(cls, title):
        """Makes a title comparable with cls._section_titles: lower-cases it
        and removes everything other than the letters a-z. A missing title
        normalizes to the empty string."""
        return cls._NORMALIZE_RE.sub('', (title or "").lower())

    @staticmethod
    def _split_phrase(phrase):
        """A single phrase might contain multiple terms. Attempt to split it
        into subphrases. Using a heuristic, we declare that if all subphrases
        are a single word long, those subphrases are each terms. In this way,
        we treat "apple or banana or pear" as three different terms, but see
        only one term in "fruit salad or mix" (the whole phrase)"""
        potential_terms = phrase.split(" or ")
        if any(" " in term for term in potential_terms):
            return [phrase]
        return potential_terms

    def find(self, node):
        """Return a Ref for each term in the node's key term, or an empty
        list when the parent's title did not match or the node's tagged text
        does not parse as a key term. A Ref's start is -1 when its term
        does not appear verbatim in the node's plain text."""
        if not self.title_matches:
            return []
        tagged_text = node.tagged_text
        # Only the parse itself is expected to raise ParseException
        try:
            match = grammar.key_term_parser.parseString(tagged_text)
        except ParseException:
            return []
        term_begin, term_end = match.term.pos[0], match.term.pos[1]
        phrase = tagged_text[term_begin:term_end]
        label = node.label_id()
        return [Ref(term, label, node.text.find(term))
                for term in self._split_phrase(phrase)]