Source code for regparser.tree.appendix.generic

from regparser import search


def is_title_case(line):
    """Determine whether a line is title-case.

    A line counts as title-case when every word longer than three
    characters starts with an upper-case letter. Shorter words (articles,
    prepositions such as "of", "the", "and") are ignored, as is any word
    whose first character has no upper-case form (digits, punctuation).

    Words are separated by any run of whitespace, so a trailing line
    terminator or a tab never becomes part of a word. An empty or
    whitespace-only line is trivially title-case.

    Written as an explicit loop because it reads more clearly than the
    equivalent all([...]) form.
    """
    # split() with no separator discards leading/trailing whitespace and
    # never yields empty strings. Splitting on a literal space would leave
    # "\n" attached to the last word, inflating its length past the
    # short-word threshold (e.g. "the\n" would be checked as a 4-letter word).
    for word in line.split():
        if len(word) > 3 and word[0] != word[0].upper():
            return False
    return True
def find_next_segment(text):
    """Locate the next segment in text and return its (start, end) offsets.

    For the generic appendix parser a segment begins at a title-ish line
    (non-blank, fewer than 100 characters, title-case) and runs up to, but
    not including, the next such line, or to the end of text when no
    further title follows. Returns None when text contains no title-ish
    line at all.
    """
    pieces = text.split("\n")
    # Put the newline back on every line except the last, so that the sum
    # of the line lengths seen so far is always an offset into text.
    lines = [piece + "\n" for piece in pieces[:-1]] + pieces[-1:]
    # Sentinel title: guarantees a segment that was opened by a real title
    # gets closed at the end of the text.
    lines.append("Placeholder Title")

    start = None  # offset of the first title-ish line, once seen
    offset = 0    # offset of the current line within text
    for line in lines:
        looks_like_title = (
            len(line.strip()) > 0
            and len(line) < 100
            and is_title_case(line)
        )
        if looks_like_title:
            if start is not None:
                # Second title reached: the segment ends where it begins.
                return (start, offset)
            start = offset
        offset += len(line)

    # Only the sentinel (or nothing) matched: there is no segment.
    return None
def segments(text):
    """Return a list of segment offsets found in text.

    Delegates the iteration to search.segments(), supplying
    find_next_segment() as the function that locates each segment.
    """
    def next_offsets(remaining_text, idx, excludes):
        # search.segments passes an index and excludes as well; the generic
        # parser only needs the remaining text.
        return find_next_segment(remaining_text)

    return search.segments(text, next_offsets)