Source code for regparser.layer.internal_citations

# -*- coding: utf-8 -*-
import logging

from regparser.citations import Label, internal_citations
from regparser.layer.layer import Layer
from regparser.tree.struct import walk

logger = logging.getLogger(__name__)


[docs]class InternalCitationParser(Layer): shorthand = 'internal-citations' def __init__(self, tree, cfr_title, **context): super(InternalCitationParser, self).__init__(tree, **context) self.cfr_title = cfr_title self.known_citations = set() self.verify_citations = True
[docs] def pre_process(self): """As a preprocessing step, run through the entire tree, collecting all labels.""" def per_node(node): self.known_citations.add(tuple(node.label)) walk(self.tree, per_node)
[docs] def process(self, node): citations_list = self.parse(node.text, label=Label.from_node(node), title=str(self.cfr_title)) if citations_list: return citations_list
[docs] def remove_missing_citations(self, citations, text): """Remove any citations to labels we have not seen before (i.e. those collected in the pre_processing stage)""" final = [] for c in citations: if tuple(c.label.to_list()) in self.known_citations: final.append(c) else: logger.warning("Missing citation? %s %r", text[c.start:c.end], c.label) logger.debug("Context: %s", text) return final
[docs] def parse(self, text, label, title=None): """ Parse the provided text, pulling out all the internal (self-referential) citations. """ def to_layer(pc): return {'offsets': [(pc.start, pc.end)], 'citation': pc.label.to_list()} citations = internal_citations(text, label, require_marker=True, title=title) if self.verify_citations: citations = self.remove_missing_citations(citations, text) all_citations = [to_layer(c) for c in citations] return self.strip_whitespace(text, all_citations)
@staticmethod
[docs] def strip_whitespace(text, citations): """Modifies the offsets to exclude any trailing whitespace. Modifies the offsets in place.""" for citation in citations: for i in range(len(citation['offsets'])): start, end = citation['offsets'][i] string = text[start:end] lstring = string.lstrip() rstring = string.rstrip() new_start = start + (len(string) - len(lstring)) new_end = end - (len(string) - len(rstring)) citation['offsets'][i] = (new_start, new_end) return citations