Source code for regparser.layer.key_terms

from __future__ import unicode_literals

import re

from regparser.layer.layer import Layer
from regparser.layer.paragraph_markers import marker_of
from regparser.layer.terms import Terms

# Matches an <E T="03"> element whose content contains no nested tags. It is
# applied with .match(), so it only fires at the very start of the text.
KEYTERM_RE = re.compile(r'<E T="03">(?P<keyterm>[^<]*?)</E>', re.UNICODE)
# Trailing phrases removed from a captured key term, checked in this order.
TRIM_FROM_KEYTERM = ['See also', 'See']


def keyterm_in_text(tagged_text):
    """Return the key term that opens ``tagged_text``, or None.

    The markup is stripped of surrounding whitespace and must begin with an
    ``<E T="03">`` element; the element's content is the candidate key term.
    Because the same <E> tags are also used for plain italics, trailing
    cross-reference phrases listed in TRIM_FROM_KEYTERM are cut off the end
    of the candidate.

    :param tagged_text: text that still contains its <E> markup
    :return: the cleaned key term, or None when the text does not start with
        a key term element or nothing remains after cleaning
    """
    found = KEYTERM_RE.match(tagged_text.strip())
    if found is None:
        return None

    candidate = found.group('keyterm').strip()
    for suffix in TRIM_FROM_KEYTERM:
        if candidate.endswith(suffix):
            # Drop the phrase plus any whitespace that preceded it.
            candidate = candidate[:-len(suffix)].strip()

    # An empty remainder means there was no real key term.
    return candidate or None
class KeyTerms(Layer):
    """Layer that reports the key term found at the start of a node."""
    shorthand = 'keyterms'

    @classmethod
    def keyterm_in_node(cls, node, ignore_definitions=True):
        """Return the key term that opens ``node``, or None.

        The first occurrence of the node's marker (as given by ``marker_of``)
        is removed from ``node.tagged_text`` before looking for a key term.

        :param node: node whose ``tagged_text`` is inspected
        :param ignore_definitions: when true, a key term that is also one of
            the node's defined terms is rejected
        :return: the key term, or None if there is none or it was rejected
        """
        text = node.tagged_text
        text = text.replace(marker_of(node), '', 1).strip()
        candidate = keyterm_in_text(text)
        if not candidate:
            return None
        if ignore_definitions and cls.is_definition(node, candidate):
            return None
        return candidate

    @staticmethod
    def is_definition(node, keyterm):
        """Tell whether ``keyterm`` is really a term defined in ``node``.

        The lower-cased key term is compared against every reference that
        ``Terms(None).node_definitions(node)`` returns, both included and
        excluded. A match means a definition is posing as a key term.
        """
        included, excluded = Terms(None).node_definitions(node)
        references = included + excluded
        lowered = keyterm.lower()
        for ref in references:
            if ref.term == lowered:
                return True
        return False

    def process(self, node):
        """Build the layer entry for ``node``.

        Returns a one-element list describing the node's key term, or None
        when the node's tagged text does not yield a key term.
        """
        keyterm = self.keyterm_in_node(node)
        if not keyterm:
            return None
        entry = {
            "key_term": keyterm,
            # Only the first occurrence of the key term is the relevant one.
            "locations": [0],
        }
        return [entry]