1.1 查找文本中的模式
import re

# Search for a literal substring and report where it was found.
pattern = 'this'
text = 'Does this text match the pattern?'
match = re.search(pattern, text)

# re.search() returns None when nothing matches, so check before using the
# Match object's accessors.
if match is not None:
    s = match.start()
    e = match.end()
    # Prints: the pattern, the searched string, start/end offsets, and the
    # matched slice.
    print(match.re.pattern, match.string, s, e, text[s:e])
1.2 编译表达式
1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 import re
4
# Compile each pattern once so the compiled object can be reused.
regexes = [re.compile(p) for p in ['this', 'that']]
text = 'Does this text match the pattern?'
# Interpolate with %; passing text as a second print() argument would emit
# the raw format string followed by the text.
print("Text: %r\n" % text)

for regex in regexes:
    print('Seeking "%s" ->' % regex.pattern)
    if regex.search(text):
        print('Match')
    else:
        print("No match!")
1.3 多重匹配
1 import re
text = 'abbaaabbbbaaaaa'
pattern = 'ab'

# finditer() yields one Match object per non-overlapping occurrence; every
# yielded object is a successful match, so no truthiness check is needed.
for match in re.finditer(pattern, text):
    start, end = match.span()
    print('Found %s,start is %d,end is %d' % (text[start:end], start, end))
1.4 模式语法
1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 import re
4
def find_patterns(text, patterns=()):
    """Print every match of each pattern found in ``text``.

    For each ``(pattern, description)`` pair, a header line and the repr of
    ``text`` are printed, followed by one line per match: the repr of the
    matched substring, prefixed with one ``*`` per character of offset from
    the start of ``text``.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs. Defaults to
            an empty tuple, so no mutable default is shared between calls.

    Returns:
        None.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r" % ('*' * start, text[start:end]))
13
14
if __name__ == "__main__":
    # ----------------------- Repetition -----------------------
    # A pattern can be repeated in five ways:
    # 1. *      zero or more times
    # 2. +      one or more times
    # 3. ?      zero or one time
    # 4. {m,n}  at least m and at most n times
    # 5. {m,}   at least m times
    find_patterns(
        'abbaabbba',
        [('ab*', 'a后面没有b或无限个b'),  # abb|a|abbb|a
         ('ab+', 'a后面至少有一个b'),  # abb|abbb
         ('ab?', 'a后面有0个或1个b'),  # ab|a|ab|a
         ('ab{3}', 'a后面有3个b'),  # abbb
         ('ab{2,3}', 'a后面有2到3个b'),  # abb|abbb
         ]
    )

    # ----------------------- Turning off greediness -----------------------
    # Quantifiers are greedy by default: they consume as much text as
    # possible and give characters back only when the rest of the pattern
    # would otherwise fail. Appending ? (*?, +?, ??, {m,n}?) makes them
    # lazy: they consume as little as possible and take more only when that
    # is needed for the overall match to succeed.
    find_patterns(
        'abbaabbba',
        [('ab*?', 'a后面没有b或无限个b'),  # a|a|a|a
         ('ab+?', 'a后面至少有一个b'),  # ab|ab
         ('ab??', 'a后面有0个或1个b'),  # a|a|a|a
         ('ab{3}?', 'a后面有3个b'),  # abbb
         ('ab{2,3}?', 'a后面有2到3个b'),  # abb|abb
         ]
    )
    # Note that ab{3} and ab{3}? match the same text: a fixed repeat count
    # leaves nothing for laziness to shrink.

    # ----------------------- Character sets [] -----------------------
    # 1. A set matches any single character it lists, e.g. [ab] matches a
    #    or b.
    # 2. Many metacharacters lose their special meaning inside a set, e.g.
    #    + and .
    find_patterns(
        'abbaabbba',
        [('[ab]', '匹配a或b'),  # a|b|b|a|a|b|b|b|a
         ('a[ab]+', 'a后面匹配1个或多个a或b'),  # abbaabbba
         ('a[ab]+?', 'a后面匹配1个a或b'),  # ab|aa
         ]
    )

    # ----------------------- Character ranges -----------------------
    # Listing every character becomes tedious as a set grows; a range is a
    # more compact way to write it.
    find_patterns(
        'This is some text -- with punctuation.',
        [('[a-z]+', '匹配一个或多个小写字母'),  # his|is|some|text|with|punctuation
         ('[A-Z]+', '匹配一个或多个大写字母'),  # T
         ('[a-zA-Z]+', '匹配一个或多个小写字母或大写字母'),  # This|is|some|text|with|punctuation
         ('[A-Z][a-z]+', '一个大写字母后面匹配一个或多个小写字母'),  # This
         ]
    )

    # ----------------------- The dot metacharacter (.) -----------------------
    # A dot matches any single character; without the DOTALL flag it does
    # not match a newline.
    find_patterns(
        'abbaabbba',
        [('a.', 'a后面跟任意一个字符'),  # ab|aa
         ('b.', 'b后面跟任意一个字符'),  # bb|bb|ba
         ('a.*b', 'a开头b结尾,中间匹配尽可能多的字符'),  # abbaabbb
         # The second match is aab because a lazy quantifier still takes
         # more characters when that is needed for the match to succeed.
         ('a.*?b', 'a开头b结尾,中间匹配尽可能少的字符'),  # ab|aab
         ]
    )

    # ----------------------- ^ inside a set -----------------------
    # At the start of a character set, ^ excludes the listed characters.
    find_patterns(
        'This is some text -- with punctuation.',
        [('[^-. ]+', '排除横杠,点号或空格')]  # This|is|some|text|with|punctuation
    )

    # ----------------------- Escape codes -----------------------
    # 1. \d: a digit, [0-9] for ASCII text
    # 2. \D: a non-digit, [^0-9] for ASCII text
    # 3. \w: a word character (letter, digit or underscore),
    #        [0-9a-zA-Z_] for ASCII text
    # 4. \W: a non-word character, [^0-9a-zA-Z_] for ASCII text
    # 5. \s: whitespace (tab, space, newline, ...)
    # 6. \S: non-whitespace
    find_patterns(
        'A prime #1 example!',
        [(r'\d+', '匹配一个或多个数字'),  # 1
         (r'\D+', '匹配一个或多个除数字以外的多个字符'),  # 'A prime #'|' example!'
         (r'\s+', '匹配一个或多个空白字符'),  # ' '|' '|' '
         (r'\S+', '匹配一个或多个非空白字符'),  # A|prime|#1|example!
         (r'\w+', '匹配一个或多个字母数字字符'),  # A|prime|1|example
         (r'\W+', '匹配一个或多个非字母数字字符'),  # ' '|' #'|' '|'!'
         ]
    )

    # ----------------------- Matching metacharacters -----------------------
    # To match characters that are special in regular expressions, escape
    # them in the search pattern.
    find_patterns(r'\d+ \D+ \s+', [(r'\\.\+', '匹配元字符')])  # \d+|\D+|\s+

    # ----------------------- Anchors -----------------------
    # Besides describing content, a pattern can use anchors to state where
    # in the input the match must occur:
    # 1. ^  start of the string (or of each line with MULTILINE)
    # 2. $  end of the string (or of each line with MULTILINE)
    # 3. \A start of the string only
    # 4. \Z end of the string only
    # 5. \b the empty string at the beginning or end of a word
    # 6. \B the empty string NOT at the beginning or end of a word
    find_patterns(
        'This is some text -- with punctuation.',
        [(r'^\w+', '匹配以字母数字开头的字符串或行'),  # This
         (r'\A\w+', '匹配以字母数字开头的字符串'),  # This
         (r'\w+\S*$', '匹配以字母数字及非空白字符结尾的字符串或行'),  # punctuation.
         (r'\w+\S*\Z', '匹配以字母数字及非空白字符结尾的字符串'),  # punctuation.
         (r'\w*t\w*', '匹配包含字母t的单词'),  # text|with|punctuation
         (r'\bt\w+', '匹配以t开头的单词'),  # text
         (r'\w+t\b', '匹配以t结尾的单词'),  # text
         (r'\Bt\B', '匹配字母t且字母t不在单词的开头或结尾'),  # t|t|t
         ]
    )
1.5 组解析匹配
1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 import re
4
def find_patterns(text, patterns=()):
    """Print every match of each pattern found in ``text``.

    For each ``(pattern, description)`` pair, a header line and the repr of
    ``text`` are printed, followed by one line per match: the repr of the
    matched substring, prefixed with one ``*`` per character of offset from
    the start of ``text``.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs. Defaults to
            an empty tuple, so no mutable default is shared between calls.

    Returns:
        None.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r" % ('*' * start, text[start:end]))
13
14
if __name__ == "__main__":
    # ----------------------- Groups () -----------------------
    # Parentheses group parts of a pattern, isolating pieces of the matched
    # text; this is the basis for building a parser.
    find_patterns(
        'abbaaabbbbaaaaa',
        [('a(ab)', 'a后面匹配ab'),  # aab
         ('a(a*b*)', 'a后面匹配0-n个a或匹配0-n个b'),  # abb|aaabbbb|aaaaa
         ('a(ab)*', 'a后面匹配0-n个ab'),  # a|a|aab|a|a|a|a|a
         ('a(ab)+', 'a后面匹配1-n个ab'),  # aab
         ]
    )

    # ----------------------- groups() -----------------------
    # Match.groups() returns the substrings captured by the individual
    # groups. It is a method of the Match object, so it works the same for
    # matches produced by match(), search() and finditer().
    text = 'This is text -- with some text -- with punctuation.'
    print(text)
    patterns = [
        (r'^(\w+)', '匹配以字母数字开头'),  # This
        (r'(\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),  # punctuation.
        (r'(\bt\w+)\W+(\w+)', '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),  # text -- with
        (r'(\w+t)\b', '匹配t结尾的单词'),  # text
    ]

    for pattern, desc in patterns:
        regex = re.compile(pattern)
        match = regex.search(text)
        print('Pattern %r (%s)' % (pattern, desc))
        # group() with no argument is the same as group(0): the whole match.
        print('group is:', match.group(), 'group(0) is:', match.group(0))
        print('groups is:', match.groups())

    # ----------------------- Named groups -----------------------
    # groups() returns the captures as a tuple; with named groups they are
    # also available as a dictionary through groupdict().
    text = 'This is text -- with some text -- with punctuation.'
    print(text)
    patterns = [
        (r'^(?P<first_word>\w+)', '匹配以字母数字开头'),  # This
        (r'(?P<last_word>\w+)\S*$', '匹配以字母数字结尾或以非空白字符结尾'),  # punctuation.
        (r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)', '匹配以t开头的单词,以数字字母结尾及中间为非数字字母'),  # text -- with
        (r'(?P<ends_with_t>\w+t)\b', '匹配t结尾的单词'),  # text
    ]

    for pattern, desc in patterns:
        regex = re.compile(pattern)
        match = regex.search(text)
        print('Pattern %r (%s)' % (pattern, desc))
        print('group is:', match.group(), 'group(0) is:', match.group(0),
              'group(1) is:', match.group(1))
        print('groups is:', match.groups())
        print('groupdict is:', match.groupdict())
67
def find_patterns_dict(text, patterns=()):
    """Print each match of every pattern together with its captured groups.

    For each ``(pattern, description)`` pair, a header line and the repr of
    ``text`` are printed. Every match is then printed as the repr of the
    matched substring, padded with spaces on the left up to the match start
    and on the right up to the end of ``text``. If the pattern has groups,
    ``match.groups()`` is printed next, and if it has named groups,
    ``match.groupdict()`` follows on its own indented line.

    Args:
        text: The string to search.
        patterns: Iterable of ``(pattern, description)`` pairs. Defaults to
            an empty tuple, so no mutable default is shared between calls.

    Returns:
        None.
    """
    for pattern, desc in patterns:
        print("Pattern %r (%s)" % (pattern, desc))
        print("%r" % text)
        regex = re.compile(pattern)
        for match in regex.finditer(text):
            start, end = match.span()
            print("%s%r%s" % (' ' * start, text[start:end],
                              ' ' * (len(text) - end)))
            if match.groups():
                print('This groups is:', match.groups())
            # Named groups are a subset of all groups, so this can only be
            # non-empty when groups() above was non-empty as well.
            if match.groupdict():
                print("%s%s" % (' ' * (len(text) - start), match.groupdict()))
82
if __name__ == "__main__":
    # Nested groups: the outer group plus one group each for the a-run and
    # the b-run.
    find_patterns_dict(
        'abbaabbba',
        # abb ('bb', '', 'bb') | aabbb ('abbb', 'a', 'bbb') | a ('', '', '')
        [(r'a((a*)(b*))', '1')]
    )

    # ----------------------- Alternation | -----------------------
    # a((a+)|(b+)) matches an a followed by a run made of only a's or only
    # b's. a((a|b)+) matches an a followed by a run that may mix a's and
    # b's. The two differ, as the output below shows. A group that did not
    # take part in the match is reported as None.
    find_patterns_dict(
        'abbaabbba',
        [(r'a((a+)|(b+))', 'a后面只匹配a或b'),  # abb ('bb', None, 'bb') | aa ('a', 'a', None)
         (r'a((a|b)+)', 'a后面可能匹配a或b等同于[ab]'),  # abbaabbba ('bbaabbba', 'a')
         ]
    )

    # ----------------------- Non-capturing groups -----------------------
    # (?:...) groups without capturing, so the group is left out of
    # groups().
    find_patterns_dict(
        'abbaabbba',
        [(r'a((a+)|(b+))', 'a后面只匹配a或b'),  # abb ('bb', None, 'bb') | aa ('a', 'a', None)
         (r'a((?:a+)|(?:b+))', 'a后面只匹配a或b,内层分组不捕获'),  # abb ('bb',) | aa ('a',)
         (r'a(?:ab)+', 'a后面匹配1-n个ab,分组不捕获'),  # aab
         ]
    )
1.6 搜索选项
# ---------------------------- Case-insensitive matching ----------------------------
import re

text = 'This is some text -- with punctuatuion.'
pattern = r'\bT\w+'
with_case = re.compile(pattern)
# IGNORECASE lets the literal T in the pattern match a lowercase t as well.
without_case = re.compile(pattern, re.IGNORECASE)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Case-sensitive:')
for match in with_case.findall(text):
    print(' %r' % match)
print('Case-insensitive:')
for match in without_case.findall(text):
    print(' %r' % match)
16
# ---------------------------- Multi-line input ----------------------------
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
# With MULTILINE, ^ and $ also match at the start and end of every line.
multiline = re.compile(pattern, re.MULTILINE)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
print('Single Line:')
# The pattern has two groups, so findall() returns 2-tuples; wrap each one
# in a 1-tuple so % formats it as a single value.
for match in single_line.findall(text):
    print(' %r' % (match,))
print('multiline:')
for match in multiline.findall(text):
    print(' %r' % (match,))
31
# ---------------------------- DOTALL ----------------------------
# With DOTALL the dot also matches newline characters.
text = 'This is some text -- with punctuatuion.\nA second line.'
pattern = r'.+'
no_newlines = re.compile(pattern)
dotall = re.compile(pattern, re.DOTALL)

print('Text:\n %r' % text)
print('Pattern:\n %s' % pattern)
# Label each result after the regex that produced it.
print('No newlines:')
for match in no_newlines.findall(text):
    print(' %r' % (match,))
print('Dotall:')
for match in dotall.findall(text):
    print(' %r' % (match,))
1.7 匹配邮箱
1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 import re
4
# ---------------------------- Email matching V1.0 ----------------------------
# Without an end anchor this expression is not strict enough: an address
# whose suffix is a run-together combination of com, org and edu (for
# example "...comorg") would still be accepted, because search() only needs
# a prefix of it to match. The trailing $ fixes that.
# The pattern is a raw string so that \w, \d and \. reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
address = re.compile(r'[\w\d.+-]+@([\w\d.]+\.)+(com|org|edu)$')

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.org',
    'valid-address@mail.example.edu',
    'not-valid@examle.foo',
]
for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matches' if match else 'No match'))
    if match:
        print(match.group(), match.groups())
21
# ---------------------------- Email matching V2.0 ----------------------------
# The same expression written with re.VERBOSE, which ignores whitespace in
# the pattern and allows # comments inside it. The pattern is a raw string
# so the backslash escapes reach the regex engine unchanged, and it keeps
# the trailing $ anchor so that text after the top-level domain is
# rejected.
address = re.compile(
    r'''
    [\w\d.+-]+       #用户名
    @
    ([\w\d.]+\.)+    #域名
    (com|org|edu)    #顶级域名
    $
    ''', re.VERBOSE
)

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.edu',
    'valid-address@mail.example.org',
    'not-valid@examle.foo',
]
for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matches' if match else 'No match'))
    if match:
        print(match.group(), match.groups())
44
45
# ---------------------------- Email matching V3.0 ----------------------------
# Extends the expression with an optional display name in front of the
# address; the opening angle bracket belongs to the optional name part, and
# the closing angle bracket is optional. The pattern is a raw string so the
# backslash escapes reach the regex engine unchanged.
address = re.compile(
    r'''
    #匹配人名
    ((?P<name>
       ([\w.,]+\s+)*[\w.,]+)
       \s*
       <
    )?
    #匹配地址
    (?P<email>
      [\w\d.+-]+
      @
      ([\w\d.]+\.)+
      (com|org|edu)
    )
    >?
    ''', re.VERBOSE
)

candidates = [
    'first.last@example.com',
    'first.last+category@gmail.edu',
    'valid-address@mail.example.org',
    'not-valid@examle.foo',
    'First Last <first.last@example.com>',
    'No Brackets first.last@example.com',
    'First Last',
    'First Middle Last <frist.last@example.com>',
    'First M. Last <First.last@example.com>',
    '<first.last@example.com>',
]

for candidate in candidates:
    print('candidate is:', candidate)
    match = address.search(candidate)
    if match:
        # The name group is None when the candidate has no display name.
        print(' Name:', match.group('name'))
        print(' Email:', match.group('email'))
        print(' The group is:', match.group())
        print(' The groups is:', match.groups())
    else:
        print('No match!')
94
# ---------------------------- Flags embedded in the pattern ----------------------------
# When flags cannot be passed to re.compile(), they can be embedded in the
# pattern string itself. The flag abbreviations are:
# IGNORECASE -> i
# MULTILINE  -> m
# DOTALL     -> s
# UNICODE    -> u
# VERBOSE    -> x
import re

text = 'This is some text -- with punctuation.'
# (?i) switches on case-insensitive matching for the whole pattern. The T
# is a plain literal: writing it as \T is a bad escape and makes
# re.compile() raise re.error.
pattern = r'(?i)\bT\w+'
regex = re.compile(pattern)

print('The Text is:', text)
print('The Pattern is:', pattern)
print('The match is:', regex.findall(text))
1.8 正则表达式的高级用法
1 #!/usr/bin/env python
2 # -*- coding:utf-8 -*-
3 import re
# Matches either "Name <user@domain.tld>" or a bare "user@domain.tld".
# The conditional groups tie the angle brackets to the name:
#   * (?(name)...|...) - when a name was matched, a look-ahead requires the
#     rest of the input to be wrapped in <...>; otherwise a look-ahead
#     requires that the rest neither starts with < nor ends with >.
#   * last_name is an empty marker group that only participates in the
#     bracketed branch, so (?(last_name)<|\s*) and (?(last_name)>|\s*)
#     consume the brackets exactly when that branch was taken.
# The pattern is a raw string so the backslash escapes reach the regex
# engine unchanged.
address1 = re.compile(
    r'''
    #name match
    ^(?P<name>([\w.]+\s+)*[\w.]+)?\s*

    (?(name)
      (?P<last_name>(?=(<.*>$)))
      |
      (?=([^<].*[^>]$))
    )
    (?(last_name)<|\s*)
    #email match
    (?P<email>
      [\w\d.+-]+
      @
      ([\w\d.]+\.)+
      (com|org|edu)
    )
    (?(last_name)>|\s*)
    $
    ''', re.VERBOSE | re.IGNORECASE
)

candidates = [
    'First Last <first.last@example1.com>',
    'first last@example.com',
    'first.last first.last@example.com',
    'Open Bracket <first.last@example.com',
    'Close Bracket frist.last@example.com>',
]

for candidate in candidates:
    print('candidate is:', candidate)
    # Search with the regex defined in this listing (address1).
    match = address1.search(candidate)
    if match:
        print(' Name:', match.group('name'))
        print(' Email:', match.group('email'))
        print(' The group is:', match.group())
        print(' The groups is:', match.groups())
    else:
        print('No match!')
1.9 修改字符串
# version 0.1
import re

# Paragraphs are separated by one or more blank lines, i.e. by two or more
# consecutive newline characters.
text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

# DOTALL lets .+? run across the single newline inside a paragraph. Each
# findall() result is a (paragraph, separator) tuple; the separator is
# empty for the last paragraph, which ends at the end of the string.
for num, para in enumerate(
        re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)):
    print(num, repr(para))
import re

# Replace every **...** span with <b>...</b>; \1 in the replacement refers
# to the text captured by the first group.
bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\1</b>', text))
7
# version 0.2
# Same substitution using a named group; \g<name> in the replacement refers
# to the group called "name".
import re

bold = re.compile(r'\*{2}(?P<name>.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\g<name></b>', text))
14
# version 0.3
# count limits how many substitutions are made. The text contains exactly
# two matches, so with count=2 both of them are still replaced.
import re

bold = re.compile(r'\*{2}(.*?)\*{2}')
text = 'Make this **bold**. This **too**.'
print('Text:', text)
print('Bold:', bold.sub(r'<b>\1</b>', text, count=2))
1.10 Split
import re

# Paragraphs are separated by one or more blank lines, i.e. by two or more
# consecutive newline characters.
text = '''Paragraph one
on two lines.

Paragraph two.


Paragraph three.'''

# Splitting with findall(): every result is a (paragraph, separator) tuple,
# and the pattern needs the "|$" alternative to catch the last paragraph.
for num, para in enumerate(
        re.findall(r'(.+?)(\n{2,}|$)', text, flags=re.DOTALL)):
    print(num, repr(para))

# Splitting with re.split(): the pattern describes only the separator, and
# the result is the list of paragraphs themselves.
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print(num, repr(para))
来源:https://www.cnblogs.com/chencsj/p/7967606.html