Source code for regparser.tree.paragraph

import hashlib
import re

from regparser.search import segments
from regparser.tree import struct
from regparser.tree.depth import markers as mtypes

# Marker sequences for each paragraph nesting depth, outermost first. The
# index into this list is the "p_level" used throughout this module, and the
# index into each inner list is the position of a paragraph at that level.
p_levels = list(map(list, (
    mtypes.lower,
    mtypes.ints,
    mtypes.roman,
    mtypes.upper,
    mtypes.em_ints,
    mtypes.em_roman,
)))


def p_level_of(marker):
    """Return every paragraph level that the given marker could belong to.

    A marker (a string) may appear in more than one level's marker list, so
    the result is a list of level indexes into ``p_levels`` in ascending
    order; it is empty when the marker is unknown. This is useful for
    determining the order of paragraphs.
    """
    return [level for level, markers in enumerate(p_levels)
            if marker in markers]
# Runs of non-word characters; stripped before hashing so that punctuation
# and whitespace differences do not change a paragraph's identifier.
_NONWORDS = re.compile(r'\W+')


def hash_for_paragraph(text):
    """Derive an integer identifier for a MARKERLESS paragraph from its text.

    The text is lower-cased and stripped of non-word characters, then hashed
    with SHA-1. Only the first 8 hex characters of the digest are kept, for
    legibility. Collisions are not a practical concern: 16**8 is roughly
    4 billion possibilities, so the birthday paradox puts the expected first
    collision at around 60 thousand entries, while at most a few hundred are
    expected.
    """
    normalized = _NONWORDS.sub('', text.lower())
    digest = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
    return int(digest[:8], 16)
class ParagraphParser:
    """Locate marked paragraphs within text and build a tree from them.

    Throughout this class, ``p_level`` is an index into the module-level
    ``p_levels`` table and ``paragraph`` is an index into the marker list at
    that level. ``exclude`` arguments are lists of ``(start, end)`` offsets
    into the accompanying text; matches touching those spans are ignored.
    """

    def __init__(self, p_regex, node_type):
        """Store the marker pattern and the node type for generated nodes.

        p_regex is the regular expression template used when searching
        through paragraphs. It is filled in with ``str.format``, so it must
        contain a ``{0}`` (or ``{}``) placeholder for the next paragraph
        'part' (e.g. 'a', 'A', '1', 'i', etc.); any literal braces in the
        pattern have to be doubled.

        node_type is handed to every ``struct.Node`` created by
        ``build_tree``.
        """
        self.p_regex = p_regex
        self.node_type = node_type

    @staticmethod
    def matching_subparagraph_ids(p_level, paragraph):
        """Find deeper-level markers that are identical to this marker.

        Returns a list of ``(level, index)`` pairs, one for each marker at a
        level below ``p_level`` whose text equals the marker at
        ``p_levels[p_level][paragraph]`` (e.g. letter (i) and roman
        numeral (i)).
        """
        matches = []
        for depth in range(p_level + 1, len(p_levels)):
            for sub_id, sub in enumerate(p_levels[depth]):
                if sub == p_levels[p_level][paragraph]:
                    matches.append((depth, sub_id))
        return matches

    def best_start(self, text, p_level, paragraph, starts, exclude=None):
        """Pick the most plausible start among several candidate matches.

        ``starts`` is a list of ``(start, end)`` offsets into ``text`` where
        the marker was found. A candidate is rejected when the text between
        it and the next candidate contains the marker that would *follow* it
        at a deeper level (e.g. an "(ii)" after an "(i)"), as that implies
        the candidate was really a subparagraph. Returns the first candidate
        that survives, or None if every candidate looks like a subparagraph.
        """
        if exclude is None:
            exclude = []
        subparagraph_hazards = self.matching_subparagraph_ids(
            p_level, paragraph)
        # Sentinel so the last real candidate is checked against the text
        # running to the end of the string.
        starts = starts + [(len(text), len(text))]
        for candidate, following in zip(starts, starts[1:]):
            _, prev_end = candidate
            next_start, _ = following
            s_text = text[prev_end:next_start]
            # s_text begins at prev_end, so exclusions (expressed as offsets
            # into text) must be shifted back by prev_end to line up with it.
            s_exclude = [
                (e_start - prev_end, e_end - prev_end)
                for e_start, e_end in exclude]
            is_subparagraph = any(
                self.find_paragraph_start_match(
                    s_text, hazard_level, hazard_idx + 1, s_exclude)
                for hazard_level, hazard_idx in subparagraph_hazards)
            if not is_subparagraph:
                return candidate
        return None

    def find_paragraph_start_match(self, text, p_level, paragraph,
                                   exclude=None):
        """Find the positions for the start and end of the requested label.

        Returns a ``(start, end)`` tuple of offsets into ``text`` for the
        marker, or None if it is not present or if ``p_level``/``paragraph``
        fall outside the ``p_levels`` table. Matches that are not strictly
        clear of every span in ``exclude`` (a list of start/stop indices)
        are discarded. When several matches remain, ``best_start`` chooses
        between them.
        """
        if exclude is None:
            exclude = []
        if len(p_levels) <= p_level or len(p_levels[p_level]) <= paragraph:
            return None
        pattern = self.p_regex.format(p_levels[p_level][paragraph])
        match_starts = [
            (m.start(), m.end()) for m in re.finditer(pattern, text)]
        match_starts = [
            (start, end) for start, end in match_starts
            if all(end < es or start > ee for es, ee in exclude)]
        if not match_starts:
            return None
        if len(match_starts) == 1:
            return match_starts[0]
        return self.best_start(
            text, p_level, paragraph, match_starts, exclude)

    def paragraph_offsets(self, text, p_level, paragraph, exclude=None):
        """Find the start/end of the requested paragraph.

        The paragraph runs from the start of its own marker up to the start
        of the next marker at the same level, or to the end of ``text`` when
        there is no next marker. Returns None if the paragraph's marker is
        not found. Assumes the text does not jump up a p_level -- see
        build_tree below.
        """
        if exclude is None:
            exclude = []
        start = self.find_paragraph_start_match(
            text, p_level, paragraph, exclude)
        if start is None:
            return None
        id_start, id_end = start
        # The search for the next marker runs on text[id_end:], so express
        # the exclusions relative to that slice.
        shifted_exclude = [
            (e_start - id_end, e_end - id_end) for e_start, e_end in exclude]
        following = self.find_paragraph_start_match(
            text[id_end:], p_level, paragraph + 1, shifted_exclude)
        if following is None:
            end = len(text)
        else:
            end = following[0] + id_end
        return (id_start, end)

    def paragraphs(self, text, p_level, exclude=None):
        """Return a list of paragraph offsets defined by the level param.

        Delegates to ``segments``, supplying a callback that looks up the
        offsets of a given paragraph index at ``p_level``.
        """
        if exclude is None:
            exclude = []

        def offsets_fn(remaining_text, p_idx, exclude):
            return self.paragraph_offsets(
                remaining_text, p_level, p_idx, exclude)

        return segments(text, offsets_fn, exclude)

    def build_tree(self, text, p_level=0, exclude=None, label=None,
                   title=''):
        """Build a ``struct.Node`` tree representing the text hierarchy.

        The text before the first paragraph found at ``p_level`` becomes this
        node's body (the whole text if no paragraphs are found). Each
        paragraph becomes a child, built recursively at ``p_level + 1`` with
        this node's label extended by the paragraph's marker. ``title`` is
        applied to this node only; children get the default.
        """
        if exclude is None:
            exclude = []
        if label is None:
            label = []
        subparagraphs = self.paragraphs(text, p_level, exclude)
        if subparagraphs:
            body_text = text[0:subparagraphs[0][0]]
        else:
            body_text = text

        children = []
        for paragraph, (start, end) in enumerate(subparagraphs):
            new_text = text[start:end]
            # Children see only their own slice; re-base the exclusions.
            new_excludes = [(e[0] - start, e[1] - start) for e in exclude]
            new_label = label + [p_levels[p_level][paragraph]]
            children.append(
                self.build_tree(
                    new_text, p_level + 1, new_excludes, new_label))
        return struct.Node(body_text, children, label, title, self.node_type)