# Source code for regparser.tree.depth.heuristics

"""Set of heuristics for trimming down the set of solutions. Each heuristic
works by penalizing a solution; it's then up to the caller to grab the
solution with the least penalties."""
from collections import defaultdict
from itertools import takewhile

from regparser.tree.depth import markers


def prefer_multiple_children(solutions, weight=1.0):
    """Dock solutions which have a paragraph with exactly one child. While
    this is possible, it's unlikely.

    :param solutions: iterable of solutions; each must expose ``assignment``
        (a sequence of items with a ``depth`` attribute) and
        ``copy_with_penalty``.
    :param weight: multiplier applied to the fraction of paragraphs that
        have exactly one direct child.
    :returns: list with one ``copy_with_penalty`` result per solution, in
        input order. A solution with an empty assignment gets a penalty
        of 0.
    """
    result = []
    for solution in solutions:
        depths = [par.depth for par in solution.assignment]
        if not depths:
            # Nothing can be flagged, and dividing by len(depths) would
            # raise ZeroDivisionError
            result.append(solution.copy_with_penalty(0.0))
            continue

        flags = 0
        for idx, depth in enumerate(depths):
            # Descendants are the unbroken run of strictly deeper paragraphs
            # immediately following this one
            descendants = takewhile(lambda d: d > depth, depths[idx + 1:])
            # Direct children sit exactly one level deeper
            children = sum(1 for d in descendants if d == depth + 1)
            if children == 1:
                flags += 1

        # float() keeps this a true division even for an int weight
        result.append(
            solution.copy_with_penalty(weight * flags / float(len(depths))))
    return result


def prefer_diff_types_diff_levels(solutions, weight=1.0):
    """Dock solutions which have different markers appearing at the same
    level. This also occurs, but not often.

    :param solutions: iterable of solutions; each must expose ``assignment``
        (a sequence of items with ``depth`` and ``typ`` attributes) and
        ``copy_with_penalty``.
    :param weight: multiplier applied to the fraction of (depth, type)
        pairs that are "extra" types sharing a depth.
    :returns: list with one ``copy_with_penalty`` result per solution, in
        input order. A solution with an empty assignment gets a penalty
        of 0.
    """
    result = []
    for solution in solutions:
        # Distinct marker types seen at each depth
        depth_types = defaultdict(set)
        for par in solution.assignment:
            depth_types[par.depth].add(par.typ)

        total = sum(len(types) for types in depth_types.values())
        # Every depth is allowed one type; each additional type is a flag
        flags = total - len(depth_types)

        # total is 0 only for an empty assignment; avoid ZeroDivisionError.
        # float() keeps this a true division even for an int weight.
        penalty = weight * flags / float(total) if total else 0.0
        result.append(solution.copy_with_penalty(penalty))
    return result


def prefer_shallow_depths(solutions, weight=0.1):
    """Dock solutions which have a higher maximum depth.

    The penalty scales linearly from 0 (for the solutions with the smallest
    maximum depth) up to ``weight`` (for those with the largest).

    :param solutions: sequence of solutions (iterated more than once); each
        must expose ``assignment`` (items with a ``depth`` attribute) and
        ``copy_with_penalty``.
    :param weight: penalty given to the deepest solution(s).
    :returns: a new list of ``copy_with_penalty`` results when the maximum
        depths differ; otherwise ``solutions`` itself, unchanged (this
        includes the case where ``solutions`` is empty).
    """
    # Deepest paragraph of each solution, computed once. A solution with an
    # empty assignment is treated as depth 0 rather than making max() raise.
    max_depths = [max([p.depth for p in s.assignment] or [0])
                  for s in solutions]
    if not max_depths:
        # No solutions to rank; min()/max() below would raise ValueError
        return solutions

    # Smallest maximum depth across solutions
    min_max_depth = min(max_depths)
    variance = max(max_depths) - min_max_depth
    if not variance:
        # All solutions are equally deep, so there is nothing to penalize
        return solutions

    # float() keeps this a true division even for an int weight
    return [
        solution.copy_with_penalty(
            weight * (max_depth - min_max_depth) / float(variance))
        for solution, max_depth in zip(solutions, max_depths)
    ]


def prefer_no_markerless_sandwich(solutions, weight=1.0):
    """Prefer solutions which don't use MARKERLESS to switch depth, like
        a
            MARKERLESS
                a

    A paragraph is flagged when the one before it is MARKERLESS and the
    depth rises by exactly one at each of the two steps leading to it.

    :param solutions: iterable of solutions; each must expose ``assignment``
        (an indexable sequence of items with ``depth`` and ``typ``
        attributes) and ``copy_with_penalty``.
    :param weight: multiplier applied to the fraction of flagged
        paragraphs.
    :returns: list with one ``copy_with_penalty`` result per solution, in
        input order. A solution with an empty assignment gets a penalty
        of 0.
    """
    result = []
    for solution in solutions:
        assignment = solution.assignment
        total = len(assignment)
        if not total:
            # Nothing can be flagged, and dividing by total would raise
            # ZeroDivisionError
            result.append(solution.copy_with_penalty(0.0))
            continue

        flags = 0
        # Slide a three-paragraph window over the assignment
        for idx in range(2, total):
            pprev = assignment[idx - 2]
            prev = assignment[idx - 1]
            current = assignment[idx]

            sandwich = prev.typ == markers.markerless
            incremented = current.depth == prev.depth + 1
            incrementing = prev.depth == pprev.depth + 1

            if sandwich and incremented and incrementing:
                flags += 1

        result.append(solution.copy_with_penalty(
            weight * flags / float(total)))

    return result