tag that was not in the original document. If cleanup is true, make sure there's no or , and get rid of any and tags. """ if cleanup: # This removes any extra markup or structure like : html = cleanup_html(html) return fragment_fromstring(html, create_parent=True) _search_body = re.compile(r'', re.I|re.S).search _search_end_body = re.compile(r'', re.I|re.S).search _replace_ins_del = re.compile(r'', re.I|re.S).sub def cleanup_html(html): """ This 'cleans' the HTML, meaning that any page structure is removed (only the contents of are used, if there is any and tags are removed. """ match = _search_body(html) if match: html = html[match.end():] match = _search_end_body(html) if match: html = html[:match.start()] html = _replace_ins_del('', html) return html def split_trailing_whitespace(word): """ This function takes a word, such as 'test\n\n' and returns ('test','\n\n') """ stripped_length = len(word.rstrip()) return word[0:stripped_length], word[stripped_length:] def fixup_chunks(chunks): """ This function takes a list of chunks and produces a list of tokens. """ tag_accum = [] cur_word = None result = [] for chunk in chunks: if isinstance(chunk, tuple): if chunk[0] == 'img': src = chunk[1] tag, trailing_whitespace = split_trailing_whitespace(chunk[2]) cur_word = tag_token('img', src, html_repr=tag, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) tag_accum = [] result.append(cur_word) elif chunk[0] == 'href': href = chunk[1] cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ") tag_accum = [] result.append(cur_word) continue if is_word(chunk): chunk, trailing_whitespace = split_trailing_whitespace(chunk) cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace) tag_accum = [] result.append(cur_word) elif is_start_tag(chunk): tag_accum.append(chunk) elif is_end_tag(chunk): if tag_accum: tag_accum.append(chunk) else: assert cur_word, ( "Weird state, cur_word=%r, result=%r, chunks=%r of %r" % (cur_word, result, chunk, chunks)) cur_word.post_tags.append(chunk) else: assert False if not result: return [token('', pre_tags=tag_accum)] else: result[-1].post_tags.extend(tag_accum) return result # All the tags in HTML that don't require end tags: empty_tags = cython.declare(frozenset, defs.empty_tags) block_level_tags = cython.declare(frozenset, frozenset([ 'address', 'blockquote', 'center', 'dir', 'div', 'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'isindex', 'menu', 'noframes', 'noscript', 'ol', 'p', 'pre', 'table', 'ul', ])) block_level_container_tags = cython.declare(frozenset, frozenset([ 'dd', 'dt', 'frameset', 'li', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr', ])) any_block_level_tag = cython.declare(tuple, tuple(sorted( block_level_tags | block_level_container_tags)) ) def flatten_el(el, include_hrefs, skip_tag=False): """ Takes an lxml element el, and generates all the text chunks for that tag. Each start tag is a chunk, each word is a chunk, and each end tag is a chunk. If skip_tag is true, then the outermost container tag is not returned (just its contents).""" if not skip_tag: if el.tag == 'img': yield ('img', el.get('src'), start_tag(el)) else: yield start_tag(el) if el.tag in empty_tags and not el.text and not len(el) and not el.tail: return start_words = split_words(el.text) for word in start_words: yield html_escape(word) for child in el: yield from flatten_el(child, include_hrefs=include_hrefs) if el.tag == 'a' and el.get('href') and include_hrefs: yield ('href', el.get('href')) if not skip_tag: yield end_tag(el) end_words = split_words(el.tail) for word in end_words: yield html_escape(word) _find_words = re.compile(r'\S+(?:\s+|$)', re.U).findall def split_words(text): """ Splits some text into words. Includes trailing whitespace on each word when appropriate. """ if not text or not text.strip(): return [] words = _find_words(text) return words _has_start_whitespace = re.compile(r'^[ \t\n\r]').match def start_tag(el): """ The text representation of the start tag for a tag. """ attributes = ''.join([ f' {name}="{html_escape(value)}"' for name, value in el.attrib.items() ]) return f'<{el.tag}{attributes}>' def end_tag(el): """ The text representation of an end tag for a tag. Includes trailing whitespace when appropriate. """ tail = el.tail extra = ' ' if tail and _has_start_whitespace(tail) else '' return f'{extra}' def is_word(tok): return not tok.startswith('<') def is_end_tag(tok): return tok.startswith(' or tags inside of any block-level elements, e.g. transform
word
to
word
""" doc = parse_html(html, cleanup=False) _fixup_ins_del_tags(doc) html = serialize_html_fragment(doc, skip_outer=True) return html def serialize_html_fragment(el, skip_outer=False): """ Serialize a single lxml element as HTML. The serialized form includes the elements tail. If skip_outer is true, then don't serialize the outermost tag """ assert not isinstance(el, str), ( f"You should pass in an element, not a string like {el!r}") html = etree.tostring(el, method="html", encoding='unicode') if skip_outer: # Get rid of the extra starting tag: html = html[html.find('>')+1:] # Get rid of the extra end tag: html = html[:html.rfind('<')] return html.strip() else: return html @cython.cfunc def _fixup_ins_del_tags(doc): """fixup_ins_del_tags that works on an lxml document in-place """ for el in list(doc.iter('ins', 'del')): if not _contains_block_level_tag(el): continue _move_el_inside_block(el, tag=el.tag) el.drop_tag() #_merge_element_contents(el) @cython.cfunc def _contains_block_level_tag(el): """True if the element contains any block-level elements, like
, , etc. """ for el in el.iter(*any_block_level_tag): return True return False @cython.cfunc def _move_el_inside_block(el, tag): """ helper for _fixup_ins_del_tags; actually takes the etc tags and moves them inside any block-level tags. """ makeelement = el.makeelement for block_level_el in el.iter(*any_block_level_tag): if block_level_el is not el: break else: # No block-level tags in any child children_tag = makeelement(tag) children_tag.text = el.text el.text = None children_tag.extend(iter(el)) el[:] = [children_tag] return for child in list(el): if _contains_block_level_tag(child): _move_el_inside_block(child, tag) if child.tail: tail_tag = makeelement(tag) tail_tag.text = child.tail child.tail = None child.addnext(tail_tag) else: child_tag = makeelement(tag) el.replace(child, child_tag) child_tag.append(child) if el.text: text_tag = makeelement(tag) text_tag.text = el.text el.text = None el.insert(0, text_tag) def _merge_element_contents(el): """ Removes an element, but merges its contents into its place, e.g., given
Hi there!
, if you remove the element you get
Hi there!
""" parent = el.getparent() text = el.text tail = el.tail if tail: if not len(el): text = (text or '') + tail else: el[-1].tail = (el[-1].tail or '') + tail index = parent.index(el) if text: previous = el.getprevious() if previous is None: parent.text = (parent.text or '') + text else: previous.tail = (previous.tail or '') + text parent[index:index+1] = el.getchildren() @cython.final @cython.cclass class InsensitiveSequenceMatcher(SequenceMatcher): """ Acts like SequenceMatcher, but tries not to find very small equal blocks amidst large spans of changes """ threshold = 2 @cython.cfunc def get_matching_blocks(self) -> list: size: cython.Py_ssize_t = min(len(self.b), len(self.b)) threshold: cython.Py_ssize_t = self.threshold threshold = min(threshold, size // 4) actual = SequenceMatcher.get_matching_blocks(self) return [item for item in actual if item[2] > threshold or not item[2]] if __name__ == '__main__': from lxml.html import _diffcommand _diffcommand.main()