Here is what I have so far:
from bs4 import BeautifulSoup
def cleanme(html):
soup = BeautifulSoup(html) # create a new bs4 object from the html data loa
The following function removes the specified tags and all comments cleanly. Thanks to Kim Hyesung for this code.
from bs4 import BeautifulSoup
from bs4 import Comment
def cleanMe(html):
    """Parse *html* and strip script, style, meta, noscript tags and comments.

    The markup is parsed with the ``html5lib`` parser. Every ``<script>``,
    ``<style>``, ``<meta>`` and ``<noscript>`` element, and every comment
    node, is detached from the tree.

    Args:
        html: The markup to parse, passed straight to ``BeautifulSoup``.

    Returns:
        The ``BeautifulSoup`` object with those nodes removed (not a string).
    """
    soup = BeautifulSoup(html, "html5lib")

    # find_all() returns an already-built result list, so detaching nodes
    # while looping over it does not disturb the iteration.
    for tag in soup.find_all(["script", "style", "meta", "noscript"]):
        tag.extract()

    # `string=` is the current name of the filter formerly spelled `text=`.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    return soup