Source code for regparser.notice.util

from lxml import etree


def prepost_pend_spaces(el):
    """FR's XML doesn't always add spaces around tags that clearly need them.
    Account for this by adding spaces around the el where needed.

    The text immediately before ``el`` is the previous sibling's tail when
    there is a previous sibling, otherwise the parent's text. A space is
    appended to that text unless it already ends with whitespace or an
    "opening" character. A space is prepended to ``el.tail`` unless the tail
    starts with whitespace or closing punctuation. The element and its
    neighbors are modified in place; nothing is returned."""
    not_append_space = """@#$(-'" \t\n"""
    not_prepend_space = """%):?!,. \t\n"""

    space_added = False
    parent = el.getparent()
    prev = el.getprevious()

    if prev is not None:
        if prev.tail and prev.tail[-1] not in not_append_space:
            prev.tail = prev.tail + ' '
            space_added = True
    elif (parent is not None and parent.text
            and parent.text[-1] not in not_append_space):
        # No previous sibling, so the preceding text lives on the parent. An
        # element without a parent has no preceding text to pad.
        parent.text = parent.text + ' '
        space_added = True

    # An element with no text and no children will contribute nothing between
    # its neighbors, so don't add a second space if one was added in front.
    has_content = bool(el.text) or len(el) > 0
    if (el.tail and el.tail[0] not in not_prepend_space
            and (has_content or not space_added)):
        el.tail = ' ' + el.tail
def swap_emphasis_tags(el):
    """FR's XML uses a different set of tags than the standard we'd like
    (XHTML). Swap them out as needed: every descendant E element is renamed to
    em, with its original tag name (plus the value of its T attribute, when it
    has one) recorded in a data-original attribute. Spacing around each
    renamed element is then adjusted."""
    for emphasis in el.xpath('.//E'):
        attrs = emphasis.attrib
        if 'T' in attrs:
            # Fold the emphasis type into the recorded name, e.g. "E-03"
            source_name = 'E' + '-' + attrs['T']
            del attrs['T']
        else:
            source_name = 'E'

        emphasis.tag = 'em'
        attrs['data-original'] = source_name
        prepost_pend_spaces(emphasis)
def spaces_then_remove(el, tag_str):
    """FR's XML tends to not add spaces where needed, so simply removing tags
    can smash words together. Adjust the spacing around every descendant
    tag_str element first, then strip those tags from el (keeping their
    content). Returns el, which is modified in place."""
    matching = el.xpath('.//' + tag_str)
    for node in matching:
        prepost_pend_spaces(node)

    etree.strip_tags(el, tag_str)
    return el
def body_to_string(xml_node):
    """Create a string from the text of this node and its children (without
    the outer tag).

    The result is the node's leading text (left-stripped), followed by each
    child serialized as markup (including the child's tail text), followed by
    the node's own tail (right-stripped). Missing text or tail is treated as
    an empty string."""
    # .text is None when the node is empty or begins with a child element
    leading = (xml_node.text or '').lstrip()
    children = ''.join(etree.tostring(child, encoding='unicode')
                       for child in xml_node)
    trailing = (xml_node.tail or '').rstrip()
    return leading + children + trailing