I am looking to write something that seems like it should be easy enough, but for whatever reason I'm having a tough time getting my head around it.
I am looking to
Googled solutions:
#---------- find_urls.py----------#
# Functions to identify and extract URLs and email addresses
import re
# One pattern for both URLs and e-mail addresses, so the text is scanned in a
# single pass and markup produced for one match is never re-scanned.
# re.VERBOSE is passed as a flag argument: an inline "(?x)" that is not the
# very first thing in the pattern is rejected by Python 3.11 and later.
_LINK_PATTERN = re.compile(r'''
    (?:
        (?P<url>
            (?:https?|ftp|gopher)   # make sure we find a resource type
            ://                     # ...followed by colon-slash-slash
            \w(?:[:.]?\w)+          # host, optionally with dots / a port; no
                                    # nested quantifiers, so a non-matching
                                    # token cannot backtrack exponentially
            (?:
                /?                  # just the domain name (maybe w/ slash)
              |
                [^\s"]+[\w/]        # or a path ending in alphanumeric/slash
            )
        )
      |
        (?<![\w.+-])                # an address must start its own token
        (?P<email>
            [A-Za-z0-9_.+-]+        # local part
            @                       # ...needs an at sign in the middle
            [A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)+   # at least two domain groups
        )
    )
    (?=[\s.,>)'"\]]|\Z)             # followed by white, clause ending or end
''', re.VERBOSE)


def _linkify(match):
    """Return the HTML anchor for one URL or e-mail match of _LINK_PATTERN."""
    url = match.group('url')
    if url is not None:
        return '<a href="%(url)s">%(url)s</a>' % {"url": url}
    email = match.group('email')
    return '<a href="mailto:%(email)s">%(email)s</a>' % {"email": email}


def fix_urls(text):
    """Wrap every URL and e-mail address found in *text* in an HTML anchor.

    URLs must start with http://, https://, ftp:// or gopher://.  A match
    has to be followed by whitespace, one of ``. , > ) ' " ]`` or the end of
    the text; that terminator is not part of the link, so sentence
    punctuation stays outside the anchor.

    NOTE(review): the pasted original substituted each match with the bare
    '%(url)s' template, i.e. with itself, which made the function a no-op.
    The anchor markup used here is presumably what was stripped from the
    paste -- confirm the desired markup.

    NOTE(review): the original e-mail pattern was truncated and did not
    compile; only a "Mail header matcher" fragment survived.  The pattern
    here matches plain addresses and applies no mail-header rule -- re-add
    one if header lines need special treatment.

    The function does not HTML-escape anything; neither the surrounding
    text nor the matched URL is altered beyond adding the anchor tags.

    :param text: string to scan.
    :returns: a new string with each match replaced by its anchor; the
        input is returned unchanged when nothing matches.
    """
    # sub() replaces each match exactly once, left to right.  The previous
    # findall() + str.replace() loop replaced *all* occurrences on every
    # iteration, so a URL appearing twice was substituted twice over.
    return _LINK_PATTERN.sub(_linkify, text)
if __name__ == '__main__':
    # Small demo when run as a script.  print is called as a function so the
    # line parses on Python 3 as well as Python 2 (single argument).
    print(fix_urls("test http://google.com asdasdasd some more text"))
EDIT: Adjusted to your needs