import hashlib
import re
from regparser.search import segments
from regparser.tree import struct
from regparser.tree.depth import markers as mtypes
p_levels = [list(mtypes.lower), list(mtypes.ints), list(mtypes.roman),
list(mtypes.upper), list(mtypes.em_ints), list(mtypes.em_roman)]
[docs]def p_level_of(marker):
"""Given a marker(string), determine the possible paragraph levels it
could fall into. This is useful for determining the order of
paragraphs"""
potential_levels = []
for level, markers in enumerate(p_levels):
if marker in markers:
potential_levels.append(level)
return potential_levels
_NONWORDS = re.compile(r'\W+')
[docs]def hash_for_paragraph(text):
"""Hash a chunk of text and convert it into an integer for use with a
MARKERLESS paragraph identifier. We'll trim to just 8 hex characters for
legibility. We don't need to fear hash collisions as we'll have 16**8 ~ 4
billion possibilities. The birthday paradox tells us we'd only expect
collisions after ~ 60 thousand entries. We're expecting at most a few
hundred"""
phrase = _NONWORDS.sub('', text.lower())
hashed = hashlib.sha1(phrase.encode('utf-8')).hexdigest()[:8]
return int(hashed, 16)
[docs]class ParagraphParser():
def __init__(self, p_regex, node_type):
"""p_regex is the regular expression used when searching through
paragraphs. It should contain a %s for the next paragraph 'part'
(e.g. 'a', 'A', '1', 'i', etc.) inner_label_fn is a function which
takes the current label, and the next paragraph 'part' and produces
a new label."""
self.p_regex = p_regex
self.node_type = node_type
@staticmethod
[docs] def matching_subparagraph_ids(p_level, paragraph):
"""Return a list of matches if this paragraph id matches one of the
subparagraph ids (e.g. letter (i) and roman numeral (i)."""
matches = []
for depth in range(p_level + 1, len(p_levels)):
for sub_id, sub in enumerate(p_levels[depth]):
if sub == p_levels[p_level][paragraph]:
matches.append((depth, sub_id))
return matches
[docs] def best_start(self, text, p_level, paragraph, starts, exclude=None):
"""Given a list of potential paragraph starts, pick the best based
on knowledge of subparagraph structure. Do this by checking if the
id following the subparagraph (e.g. ii) is between the first match
and the second. If so, skip it, as that implies the first match was
a subparagraph."""
if exclude is None:
exclude = []
subparagraph_hazards = self.matching_subparagraph_ids(
p_level, paragraph)
starts = starts + [(len(text), len(text))]
for i in range(1, len(starts)):
_, prev_end = starts[i - 1]
next_start, _ = starts[i]
s_text = text[prev_end:next_start]
s_exclude = [
(e_start + prev_end, e_end + prev_end)
for e_start, e_end in exclude]
is_subparagraph = False
for hazard_level, hazard_idx in subparagraph_hazards:
if self.find_paragraph_start_match(
s_text, hazard_level, hazard_idx + 1, s_exclude):
is_subparagraph = True
if not is_subparagraph:
return starts[i - 1]
[docs] def find_paragraph_start_match(self, text, p_level, paragraph,
exclude=None):
"""Find the positions for the start and end of the requested label.
p_Level is one of 0,1,2,3; paragraph is the index within that label.
Return None if not present. Does not return results in the exclude
list (a list of start/stop indices). """
if exclude is None:
exclude = []
if len(p_levels) <= p_level or len(p_levels[p_level]) <= paragraph:
return None
match_starts = [(m.start(), m.end()) for m in re.finditer(
self.p_regex.format(p_levels[p_level][paragraph]), text)]
match_starts = [
(start, end) for start, end in match_starts
if all(end < es or start > ee for es, ee in exclude)]
if len(match_starts) == 0:
return None
elif len(match_starts) == 1:
return match_starts[0]
else:
return self.best_start(
text, p_level, paragraph, match_starts, exclude)
[docs] def paragraph_offsets(self, text, p_level, paragraph, exclude=None):
"""Find the start/end of the requested paragraph. Assumes the text
does not just up a p_level -- see build_paragraph_tree below."""
if exclude is None:
exclude = []
start = self.find_paragraph_start_match(
text, p_level, paragraph, exclude)
if start is None:
return None
id_start, id_end = start
end = self.find_paragraph_start_match(
text[id_end:], p_level, paragraph + 1,
[(e_start - id_end, e_end - id_end) for e_start, e_end in exclude]
)
if end is None:
end = len(text)
else:
end = end[0] + id_end
return (id_start, end)
[docs] def paragraphs(self, text, p_level, exclude=None):
"""Return a list of paragraph offsets defined by the level param."""
if exclude is None:
exclude = []
def offsets_fn(remaining_text, p_idx, exclude):
return self.paragraph_offsets(
remaining_text, p_level, p_idx, exclude)
return segments(text, offsets_fn, exclude)
[docs] def build_tree(self, text, p_level=0, exclude=None, label=None, title=''):
"""Build a dict to represent the text hierarchy."""
if exclude is None:
exclude = []
if label is None:
label = []
subparagraphs = self.paragraphs(text, p_level, exclude)
if subparagraphs:
body_text = text[0:subparagraphs[0][0]]
else:
body_text = text
children = []
for paragraph, (start, end) in enumerate(subparagraphs):
new_text = text[start:end]
new_excludes = [(e[0] - start, e[1] - start) for e in exclude]
new_label = label + [p_levels[p_level][paragraph]]
children.append(
self.build_tree(
new_text, p_level + 1, new_excludes, new_label))
return struct.Node(body_text, children, label, title, self.node_type)