Here is what I have so far:
from bs4 import BeautifulSoup
def cleanme(html):
soup = BeautifulSoup(html) # create a new bs4 object from the html data loa
Using lxml instead:
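# Requirements: pip install lxml lxml_html_clean
# (since lxml 5.2 the lxml.html.clean module ships in the separate lxml_html_clean package)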
import lxml.html.clean
def cleanme(content):
    """Return the visible text of an HTML document with all markup removed.

    Args:
        content: HTML source as ``str`` or ``bytes``.

    Returns:
        The document's text content with leading and trailing whitespace
        stripped, or ``""`` when ``content`` is ``None``, empty or
        whitespace-only.
    """
    # lxml's document_fromstring() raises ParserError ("Document is empty")
    # on blank input, so answer that case directly instead of crashing.
    if content is None or not content.strip():
        return ""

    cleaner = lxml.html.clean.Cleaner(
        # Whitelist that contains no real tag name, so every tag is dropped
        # while the text inside it is kept.
        allow_tags=[''],
        # Cleaner rejects allow_tags combined with remove_unknown_tags=True.
        remove_unknown_tags=False,
        # Also discard <style> elements so CSS never leaks into the text.
        style=True,
    )
    document = lxml.html.document_fromstring(content)
    return cleaner.clean_html(document).text_content().strip()
# Sample input for a quick manual check of cleanme().
# NOTE(review): the sample holds no tags even though its name says HTML; the
# markup looks to have been lost when this snippet was pasted -- confirm and
# restore it if so.
# The literal is kept on one line with the line break escaped, because a
# double-quoted string cannot span physical lines.
testhtml = "\n\nTHIS IS AN EXAMPLE I need this text capturedAnd this\n"
cleaned = cleanme(testhtml)
print(cleaned)