'正则表达式'
# re.match() 尝试从字符串的起始位置匹配一个模式,如果起始位置匹配不成功,返回 None
re.match(pattern, string, flags=0)
1.常规匹配
====================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
print(len(content))
result = re.match('^Hello\s\d{3}\s\d{4}\s\w+.*Dome$', content)
print(result)
# group()返回匹配结果
print(result.group())
# span()返回匹配范围
print(result.span())
2.泛匹配 '.*'
======================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
print(len(content))
# .*代表任意字符
result = re.match('^Hello.*Dome$', content)
print(result)
# group()返回匹配结果
print(result.group())
# span()返回匹配范围
print(result.span())
3.匹配目标 '()'
========================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
result = re.match('^Hello\s(\d+)\s(\d+)\s.*Dome$', content)
print(result.group(1))
# 运行结果 123
print(result.group(2))
# 运行结果 4567
4.贪婪匹配 '.*'
==========================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
# .* 会尽可能的多匹配 只给\d+留一个数字
result = re.match('^H.*(\d+).*Dome$', content)
# 运行结果
# 7
5.非贪婪匹配 '.*?'
===========================================================
import re
content = 'Hello 1234567 Word_This is Regex Dome'
# .*? 会尽可能少匹配
result = re.match('^H.*?(\d+).*Dome$', content)
# 运行结果 1234567
6.匹配模式 're.S'
============================================================
import re
content = '''Hello 1234567 Word_This
is Regex Dome
'''
# re.S 使 . 也能匹配换行符;如果不加,.* 匹配不到换行符,re.match 返回 None
result = re.match('^H.*?(\d+).*Dome$', content, re.S)
print(result.group(1))
7.转义 '\'
===========================================================
# 用 \ 转义特殊字符 $
r = re.match('p is \$2', c)
print(r)
re.search() 扫描整个字符串并返回第一个成功的匹配
8.re.sub('\d+', string, content) # 将 content 中所有匹配 '\d+' 的子串替换为 string,并返回替换后的字符串
=================================================================
9.re.compile() # 可以将正则字符串编译成正则表达式对象,优点是可以复用
==================================================================
import re
content = '''Hello 1234567 Word_This
is Regex Dome
'''
pattern = re.compile('^H.*?(\d+).*Dome$', re.S)
result = re.search(pattern, content)
# result = re.match('^H.*?(\d+).*Dome$', content, re.S)
print(result.group(1))
'正则练习'
=======================================================================
import re
html = '''
'''
result = re.search('<li.*?active.*?singer="(.*?)">(.*?)</a>', html, re.S)
# 运行结果
# 齐秦 往事随风
result = re.search('<li.*?singer="(.*?)">(.*?)</a>', html, re.S)
# 运行结果
# 任贤齐 沧海一声笑
result = re.search('<li.*?singer="(.*?)">(.*?)</a>', html)
# 运行结果
# beyond 光辉岁月
print(result.group(1), result.group(2))
# findall()方法 返回一个列表
result = re.findall('<li.*?href="(.*?)".*?singer="(.*?)>(.*?)</a>', html, re.S)
for results in result:
    print(results)
# 运行结果
# ('/2.mp3', '任贤齐"', '沧海一声笑')
# ('/3.mp3', '齐秦"', '往事随风')
# ('/4.mp3', 'beyond"', '光辉岁月')
# ('/5.mp3', '陈慧琳"', '记事本')
# ('/6.mp3', '邓丽君"', '<i class="fa fa-user"></i>但愿人长久')
# 返回所有歌名
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
for result in results:
    print(result[1])
# 运行结果
# 一路上有你
# 沧海一声笑
# 往事随风
# 光辉岁月
# 记事本
# 但愿人长久
# re.match() 尝试从字符串的起始位置匹配一个模式,如果起始位置匹配不成功,返回 None
re.match(pattern, string, flags=0)
1.常规匹配
====================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
print(len(content))
result = re.match('^Hello\s\d{3}\s\d{4}\s\w+.*Dome$', content)
print(result)
# group()返回匹配结果
print(result.group())
# span()返回匹配范围
print(result.span())
2.泛匹配 '.*'
======================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
print(len(content))
# .*代表任意字符
result = re.match('^Hello.*Dome$', content)
print(result)
# group()返回匹配结果
print(result.group())
# span()返回匹配范围
print(result.span())
3.匹配目标 '()'
========================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
result = re.match('^Hello\s(\d+)\s(\d+)\s.*Dome$', content)
print(result.group(1))
# 运行结果 123
print(result.group(2))
# 运行结果 4567
4.贪婪匹配 '.*'
==========================================================
import re
content = 'Hello 123 4567 Word_This is Regex Dome'
# .* 会尽可能的多匹配 只给\d+留一个数字
result = re.match('^H.*(\d+).*Dome$', content)
# 运行结果
# 7
5.非贪婪匹配 '.*?'
===========================================================
import re
content = 'Hello 1234567 Word_This is Regex Dome'
# .*? 会尽可能少匹配
result = re.match('^H.*?(\d+).*Dome$', content)
# 运行结果 1234567
6.匹配模式 're.S'
============================================================
import re
content = '''Hello 1234567 Word_This
is Regex Dome
'''
# re.S 使 . 也能匹配换行符;如果不加,.* 匹配不到换行符,re.match 返回 None
result = re.match('^H.*?(\d+).*Dome$', content, re.S)
print(result.group(1))
7.转义 '\'
===========================================================
# 用 \ 转义特殊字符 $
r = re.match('p is \$2', c)
print(r)
re.search() 扫描整个字符串并返回第一个成功的匹配
8.re.sub('\d+', string, content) # 将 content 中所有匹配 '\d+' 的子串替换为 string,并返回替换后的字符串
=================================================================
9.re.compile() # 可以将正则字符串编译成正则表达式对象,优点是可以复用
==================================================================
import re
content = '''Hello 1234567 Word_This
is Regex Dome
'''
pattern = re.compile('^H.*?(\d+).*Dome$', re.S)
result = re.search(pattern, content)
# result = re.match('^H.*?(\d+).*Dome$', content, re.S)
print(result.group(1))
'正则练习'
=======================================================================
import re
html = '''
'''
result = re.search('<li.*?active.*?singer="(.*?)">(.*?)</a>', html, re.S)
# 运行结果
# 齐秦 往事随风
result = re.search('<li.*?singer="(.*?)">(.*?)</a>', html, re.S)
# 运行结果
# 任贤齐 沧海一声笑
result = re.search('<li.*?singer="(.*?)">(.*?)</a>', html)
# 运行结果
# beyond 光辉岁月
print(result.group(1), result.group(2))
# findall()方法 返回一个列表
result = re.findall('<li.*?href="(.*?)".*?singer="(.*?)>(.*?)</a>', html, re.S)
for results in result:
    print(results)
# 运行结果
# ('/2.mp3', '任贤齐"', '沧海一声笑')
# ('/3.mp3', '齐秦"', '往事随风')
# ('/4.mp3', 'beyond"', '光辉岁月')
# ('/5.mp3', '陈慧琳"', '记事本')
# ('/6.mp3', '邓丽君"', '<i class="fa fa-user"></i>但愿人长久')
# 返回所有歌名
results = re.findall('<li.*?>\s*?(<a.*?>)?(\w+)(</a>)?\s*?</li>', html, re.S)
for result in results:
    print(result[1])
# 运行结果
# 一路上有你
# 沧海一声笑
# 往事随风
# 光辉岁月
# 记事本
# 但愿人长久
文章来源: 爬虫re正则表达式笔记