# Source code for regparser.layer.key_terms

from __future__ import unicode_literals

import re

from regparser.layer.layer import Layer
from regparser.layer.paragraph_markers import marker_of
from regparser.layer.terms import Terms

# Matches one <E T="03">...</E> element and captures its inner text as the
# ``keyterm`` group. The inner text may not contain "<", so an element with
# nested markup does not match.
KEYTERM_RE = re.compile(r'<E T="03">(?P<keyterm>[^<]*?)</E>', re.UNICODE)
# Trailing phrases removed from a captured key term; checked in list order.
TRIM_FROM_KEYTERM = ['See also', 'See']


def keyterm_in_text(tagged_text):
    """Pull out the key term of the provided markup using a regex. The XML <E>
    tags that indicate keyterms are also used for italics, which means some
    non-key term phrases would be lumped in. We eliminate them here.

    Only an ``<E T="03">`` element at the very start of the (stripped) text
    counts; the same tag appearing later in the text is ignored.

    :param tagged_text: markup to inspect; ``None`` or an empty string is
        treated as "no key term".
    :returns: the key term with surrounding whitespace and any trailing
        phrase from ``TRIM_FROM_KEYTERM`` removed, or ``None`` when no key
        term is found or nothing is left after trimming.
    """
    # ``match`` (not ``search``) anchors the pattern at the first character.
    match = KEYTERM_RE.match((tagged_text or '').strip())
    if not match:
        return None
    keyterm = match.group('keyterm').strip()

    for to_trim in TRIM_FROM_KEYTERM:
        if keyterm.endswith(to_trim):
            keyterm = keyterm[:-len(to_trim)].strip()

    # A term that consisted solely of trimmed phrases is not a key term.
    return keyterm or None


class KeyTerms(Layer):
    """Layer that reports the key term found at the start of a node's tagged
    text, if there is one."""
    shorthand = 'keyterms'

    @classmethod
    def keyterm_in_node(cls, node, ignore_definitions=True):
        """Return the key term that opens ``node``'s tagged text.

        The first occurrence of the value returned by ``marker_of(node)`` is
        removed from the tagged text before looking for the key term.

        :param node: object exposing ``tagged_text``.
        :param ignore_definitions: when true (the default), a key term that
            ``is_definition`` recognises as a defined term is discarded.
        :returns: the key term string, or ``None`` when there is none.
        """
        # NOTE(review): assumes a node may lack (or have empty) tagged text,
        # as the ``process`` docstring suggests -- treated as "no key term"
        # rather than raising.
        tagged_text = getattr(node, 'tagged_text', None)
        if not tagged_text:
            return None

        tagged = tagged_text.replace(marker_of(node), '', 1).strip()
        keyterm = keyterm_in_text(tagged)
        if not keyterm:
            return None
        if ignore_definitions and cls.is_definition(node, keyterm):
            return None
        return keyterm

    @staticmethod
    def is_definition(node, keyterm):
        """A definition might be masquerading as a keyterm. Do not allow
        this.

        :returns: ``True`` when the lower-cased ``keyterm`` equals the
            ``term`` of any definition (included or excluded) that
            ``Terms.node_definitions`` reports for ``node``.
        """
        included, excluded = Terms(None).node_definitions(node)
        # Presumably the terms layer stores terms lower-cased; the key term
        # is lower-cased to compare like with like.
        keyterm_as_term = keyterm.lower()
        return any(ref.term == keyterm_as_term
                   for ref in included + excluded)

    def process(self, node):
        """ Get keyterms if we have text in the node that preserves the
        <E> tags.

        :returns: a one-element list describing the key term, or ``None``
            when the node has no key term.
        """
        keyterm = self.keyterm_in_node(node)
        if not keyterm:
            return None
        return [{
            "key_term": keyterm,
            # The first instance of the key term is the right one.
            "locations": [0]
        }]