I am fetching data from a web page using urllib2. The content of all the pages is in English, so there is no issue of dealing with non-English text.
BeautifulSoup stores data as Unicode internally so you don't need to perform character encoding manipulations manually.
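For example, the strings that BeautifulSoup returns are already unicode objects; a minimal sketch (the markup and the UTF-8 bytes here are made up for illustration):

from BeautifulSoup import BeautifulSoup

html = '<p>caf\xc3\xa9</p>'  # hypothetical UTF-8 encoded bytes
soup = BeautifulSoup(html, fromEncoding='utf-8')
print isinstance(soup.p.string, unicode)  # True: NavigableString subclasses unicode
print repr(soup.p.string)                 # u'caf\xe9' -- already decoded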
To find keywords case-insensitively in the text (but not in attribute values or tag names):
#!/usr/bin/env python
import urllib2
from contextlib import closing

import regex  # pip install regex
from BeautifulSoup import BeautifulSoup

URL = 'http://example.com'  # placeholder: the page you want to search

with closing(urllib2.urlopen(URL)) as page:
    soup = BeautifulSoup(page)  # decoded to Unicode by BeautifulSoup

# print all text nodes that contain any of the keywords
print soup(text=regex.compile(ur'(?fi)\L<keywords>',
                              keywords=['your', 'keywords', 'go', 'here']))
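If you only need a yes/no answer (are any of the keywords present in the page text?), the result of that call can be reused directly. A minimal sketch that reuses the soup object and the placeholder keyword list from above; the variable names are just for illustration:

pattern = regex.compile(ur'(?fi)\L<keywords>',
                        keywords=['your', 'keywords', 'go', 'here'])
matching_texts = soup(text=pattern)  # NavigableStrings that contain a keyword
print bool(matching_texts)           # True if at least one keyword occurs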
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import regex
from BeautifulSoup import BeautifulSoup, Comment

html = u'''<ul class="post">
<li> tag names must not match
<li> Post will be found
<li> the same with post
<li> and post
<li> and poſt
<!-- this is ignored -->
</ul>'''
soup = BeautifulSoup(html)
# remove comments
comments = soup.findAll(text=lambda t: isinstance(t, Comment))
for comment in comments: comment.extract()
# find text with keywords (case-insensitive)
print ''.join(soup(text=regex.compile(ur'(?fi)\L<opts>', opts=['post', 'li'])))
# compare it with '.lower()'
print '.lower():'
print ''.join(soup(text=lambda t: any(k in t.lower() for k in ['post', 'li'])))
# or exact match
print 'exact match:'
print ''.join(soup(text=' the same with post\n'))
Output:

 Post will be found
 the same with post
 and post
 and poſt

.lower():
 Post will be found
 the same with post
 and post

exact match:
 the same with post
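The difference between the two runs comes from Unicode full case-folding: with the regex module's (?f) flag, 'ſ' (LATIN SMALL LETTER LONG S) compares equal to 's', while unicode.lower() leaves it unchanged. A minimal standalone check, using nothing beyond the regex module:

# -*- coding: utf-8 -*-
import regex

word = u'poſt'  # 'ſ' is LATIN SMALL LETTER LONG S (U+017F)

# unicode.lower() does not fold 'ſ' to 's'
print u'post' in word.lower()                   # False

# (?f) full case-folding + (?i) ignore-case does
print bool(regex.search(ur'(?fi)post', word))   # True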