word
toword
""" doc = parse_html(html, cleanup=False) _fixup_ins_del_tags(doc) html = serialize_html_fragment(doc, skip_outer=True) return html def serialize_html_fragment(el, skip_outer=False): """ Serialize a single lxml element as HTML. The serialized form includes the elements tail. If skip_outer is true, then don't serialize the outermost tag """ assert not isinstance(el, str), ( f"You should pass in an element, not a string like {el!r}") html = etree.tostring(el, method="html", encoding='unicode') if skip_outer: # Get rid of the extra starting tag: html = html[html.find('>')+1:] # Get rid of the extra end tag: html = html[:html.rfind('<')] return html.strip() else: return html @cython.cfunc def _fixup_ins_del_tags(doc): """fixup_ins_del_tags that works on an lxml document in-place """ for el in list(doc.iter('ins', 'del')): if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=el.tag) el.drop_tag() #_merge_element_contents(el) @cython.cfunc def _contains_block_level_tag(el): """True if the element contains any block-level elements, like,
<p>Hi <i>there!</i></p>, if you remove the <i> element you get
<p>Hi there!</p>
""" parent = el.getparent() text = el.text tail = el.tail if tail: if not len(el): text = (text or '') + tail else: el[-1].tail = (el[-1].tail or '') + tail index = parent.index(el) if text: previous = el.getprevious() if previous is None: parent.text = (parent.text or '') + text else: previous.tail = (previous.tail or '') + text parent[index:index+1] = el.getchildren() @cython.final @cython.cclass class InsensitiveSequenceMatcher(SequenceMatcher): """ Acts like SequenceMatcher, but tries not to find very small equal blocks amidst large spans of changes """ threshold = 2 @cython.cfunc def get_matching_blocks(self) -> list: size: cython.Py_ssize_t = min(len(self.b), len(self.b)) threshold: cython.Py_ssize_t = self.threshold threshold = min(threshold, size // 4) actual = SequenceMatcher.get_matching_blocks(self) return [item for item in actual if item[2] > threshold or not item[2]] if __name__ == '__main__': from lxml.html import _diffcommand _diffcommand.main()