# Source code for regparser.tree.appendix.generic

from regparser import search


def is_title_case(line):
    """Determine whether a line looks title-cased.

    The line is split on single spaces and each resulting word is checked:
    a word longer than three characters must begin with a character that is
    unchanged by ``upper()``. Words of three characters or fewer are skipped
    and may therefore be lower-case. Digits and punctuation are their own
    upper-case form, so words starting with them also pass.

    An explicit loop is used because it reads better than the equivalent
    ``all([...])`` form.

    :param line: the text to inspect.
    :returns: ``True`` if no checked word starts with a lower-case
        character (including when there are no words to check), otherwise
        ``False``.
    """
    for word in line.split(u' '):
        # len(word) > 3 already implies the word is non-empty, so word[0]
        # is always safe to index here.
        if len(word) > 3 and word[0] != word[0].upper():
            return False
    return True


def find_next_segment(text):
    """Find the start/end of the next segment.

    A segment for the generic appendix parser is something separated by a
    title-ish line: a non-blank line shorter than 100 characters (counting
    its trailing newline, if any) for which ``is_title_case`` is true.

    :param text: the text to scan.
    :returns: a ``(start, end)`` tuple of character offsets into ``text``.
        ``start`` is the offset of the first title-ish line; ``end`` is the
        offset of the following title-ish line, or ``len(text)`` when there
        is none. Returns ``None`` when ``text`` contains no title-ish line.
    """
    pieces = text.split("\n")
    # Re-attach the newline to every piece except the last so that the line
    # lengths add up to len(text) and the running totals are real offsets.
    lines = [piece + "\n" for piece in pieces[:-1]] + pieces[-1:]

    start = 0
    end = 0
    found_start = False
    # The trailing placeholder is itself title-ish; it closes a segment that
    # runs to the end of the text.
    for line in lines + ["Placeholder Title"]:
        title_ish = (len(line.strip()) > 0 and len(line) < 100
                     and is_title_case(line))
        if title_ish:
            if found_start:
                return (start, end)
            found_start = True
        end += len(line)
        if not found_start:
            # Still before the first title: the segment start moves along
            # with the scan position.
            start += len(line)
    # Only the placeholder matched, i.e. the text has no title-ish line.
    return None


def segments(text):
    """Return a list of segment offsets. See find_next_segment()

    Delegates to ``search.segments``, passing a callback that locates the
    next segment with ``find_next_segment``.

    :param text: the appendix text to split into segments.
    :returns: whatever ``search.segments`` returns for this text and
        callback (described above as a list of segment offsets).
    """
    def offsets_fn(remaining_text, idx, excludes):
        # idx and excludes are deliberately unused; they are kept in the
        # signature presumably because search.segments supplies them to
        # every offsets callback -- verify against regparser.search.
        return find_next_segment(remaining_text)
    return search.segments(text, offsets_fn)