python re库-----学习(正则表达式)

#!/usr/bin/env python
#-*- coding:UTF-8 -*-
#####################################################
# Author: sunfx   [email protected]
# Last modified:  2014/11/12  - 2014/11/13
# Filename:  re.py
# Q  Q  群:  236147801
#####################################################

import re

# 1. Finding a pattern in text

pattern = 'this'
text = 'Does this text match the pattern?'

# re.search() scans the string and returns a match object for the first
# place where the pattern matches.
match = re.search(pattern, text)

s = match.start()
e = match.end()

# Single-argument print(...) produces the same output on Python 2 and 3.
print('Found "%s"\nin "%s"\nfrom %d to %d ("%s")' %
      (match.re.pattern, match.string, s, e, text[s:e]))

# match.re.pattern  the pattern that was searched for
# match.string      the string that was searched
# s                 index where the matched text starts
# e                 index just past the end of the matched text
# text[s:e]         the matched text itself

# 2. Compiling expressions

# Turn each pattern string into a compiled regular expression object.
regexes = [re.compile(p) for p in ['this', 'that']]

print('Text: %r\n' % text)  # show the text being searched

for regex in regexes:
    # regex.pattern is the source string the object was compiled from;
    # regex.search(text) looks for it anywhere in text.
    outcome = 'match!' if regex.search(text) else 'no match'
    # One formatted print yields the same line the original built with a
    # trailing-comma print followed by a second print.
    print('Seeking "%s"-> %s' % (regex.pattern, outcome))

# 3. Multiple matches

text = 'abbaaabbbbaaaaa'

pattern = 'ab'

# findall() returns the matched substrings themselves.
for match in re.findall(pattern, text):
    print('Found: "%s"' % match)

# finditer() yields match objects, which also report where in the input
# string each match was found.
for match in re.finditer(pattern, text):
    s = match.start()
    e = match.end()
    print('Found "%s" at %d:%d' % (text[s:e], s, e))

# 4. Pattern syntax

def test_patterns(text, patterns=()):
    """Print every match of each pattern in ``text``.

    ``patterns`` is an iterable of ``(pattern, description)`` pairs.  For
    each pair a header line and the repr of ``text`` are printed, followed
    by one line per match: a run of dots and then the repr of the matched
    substring.  A blank line closes each pattern's report.

    Always returns None.
    """
    for pattern, desc in patterns:
        print('Pattern %r (%s) \n' % (pattern, desc))
        print('   %r' % text)
        for match in re.finditer(pattern, text):
            s = match.start()
            e = match.end()
            substr = text[s:e]  # the matched text
            # repr() shows every backslash doubled, so the dot prefix is
            # lengthened by one for each backslash before the match.
            n_backslashes = text[:s].count('\\')
            prefix = '.' * (s + n_backslashes)
            print('    %s%r' % (prefix, substr))
        print('')
    return


# The name starts with "test_" but this is a demo helper that takes required
# arguments; tell pytest-style collectors not to treat it as a test case.
test_patterns.__test__ = False

# Show where 'ab' occurs in the sample text.
test_patterns('abbaaabbbbaaaaa',
              [('ab', "'a' followed by 'b'")])

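# Greedy and non-greedy repetition
#
# Repetition is greedy by default: each match consumes as much input as it
# can.  Appending '?' to a repetition makes it non-greedy, so each match
# consumes as little input as possible.
#
#     *           zero or more repetitions
#     +           one or more repetitions
#     ?           zero or one repetition
#
# Greedy forms:
#     ab*         'a followed by zero or more b'
#     ab+         'a followed by one or more b'
#     ab?         'a followed by zero or one b'
#     ab{3}       'a followed by exactly three b'
#     ab{2,3}     'a followed by two to three b'
#
# Non-greedy forms (same repetition ranges, shortest match preferred):
#     ab*?        'a followed by zero or more b, as few as possible'
#     ab+?        'a followed by one or more b, as few as possible'
#     ab??        'a followed by zero or one b, zero preferred'
#     ab{3}?      'a followed by exactly three b'
#     ab{2,3}?    'a followed by two to three b, two preferred'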

# Usage:

# Named "sample" rather than "str" so the builtin str type is not shadowed.
sample = 'absdsdsdsdsd'

# Greedy: 'b*' takes the 'b' that follows the 'a'.
print(re.findall('ab*', sample))
# ['ab']

# Non-greedy: 'b*?' settles for zero 'b' characters.
print(re.findall('ab*?', sample))
# ['a']

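# 5. Character sets
#
#     [ab]        either a or b
#     a[ab]+      a followed by one or more a or b
#     a[ab]+?     a followed by one or more a or b, not greedy
#                 (stops after the first a or b)
#     [^...]      any single character that is NOT in the set
#     [a-z]       any lowercase ASCII letter
#     [A-Z]       any uppercase ASCII letter
#     [a-zA-Z]    any single ASCII letter, lowercase or uppercase
#     [A-Za-z]    the same set written with the ranges in the other order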
# Named "sample" rather than "str" so the builtin str type is not shadowed.
sample = 'aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'

print(re.findall('[ab]', sample))     # every single 'a' or 'b'
print(re.findall('a[ab]+', sample))   # 'a' plus the longest run of a/b
print(re.findall('a[ab]+?', sample))  # 'a' plus exactly one more a/b
print(re.findall('[^_]', sample))     # every character except '_'

# Named "sample" rather than "str" so the builtin str type is not shadowed.
sample = 'China,lovE'

# One lowercase letter next to one uppercase letter, in either order.
print(re.findall('[a-z][A-Z]', sample))   # ['vE']
print(re.findall('[A-Z][a-z]', sample))   # ['Ch']

# '+' requires at least one character from the second set.
print(re.findall('[A-Z][a-z]+', sample))  # ['China']
print(re.findall('[a-z][A-Z]+', sample))  # ['vE']

# '*' also allows the second set to be absent.
print(re.findall('[A-Z][a-z]*', sample))  # ['China', 'E']
print(re.findall('[a-z][A-Z]*', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

# '?' allows at most one character from the second set.
print(re.findall('[A-Z][a-z]?', sample))  # ['Ch', 'E']
print(re.findall('[a-z][A-Z]?', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

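# The dot metacharacter
#
#     .           matches any single character
#     a.          a followed by any one character
#     b.          b followed by any one character
#     a.*b        a, then anything, then b (greedy: the longest such span)
#     a.*?b       a, then anything, then b (non-greedy: the shortest span)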

c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'

print(re.findall('a.', c))  # ['ai', 'aw', 'as', 'aa', 'ab']
print(re.findall('b.', c))  # ['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']

# Greedy: one match running from the first 'a' to the last 'b'.
# ['aizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbb']
print(re.findall('a.*b', c))

# '?' switches off the greediness of '*': instead of running on to the last
# 'b', each match stops at the first 'b' it can, giving the shortest spans.
# ['aizhongguoawsb', 'asssssssssssssdsdsdsdb', 'aaab']
print(re.findall('a.*?b', c))

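# 6. Escape codes
#
#     Code    Meaning
#     \d      a digit
#     \D      a non-digit
#     \s      whitespace (tab, space, newline, ...)
#     \S      non-whitespace (symbols, letters, digits)
#     \w      a word character (letter, digit or underscore)
#     \W      a non-word character (symbols, tab, space, newline)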

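# 7. Anchoring
#
#     Code    Meaning
#     ^       start of the string, or of a line
#     $       end of the string, or of a line
#     \A      start of the string
#     \Z      end of the string
#     \b      empty string at the beginning or end of a word
#     \B      empty string not at the beginning or end of a word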
# 8. Constraining the search: match vs. search

text = 'This is some text --with punctuation.'

pattern = 'is'

print('Text    : %s' % text)
print('pattern: %s' % pattern)

# match() only matches at the beginning of the string; the text does not
# start with 'is', so the result is None.
m = re.match(pattern, text)
print('Match : %s' % (m,))

# search() looks anywhere in the string; 'is' occurs in the text, so a
# match object is returned.
s = re.search(pattern, text)
print('Search : %s' % (s,))

pattern = re.compile(r'\b\w*is\w*\b')  # whole words containing 'is'

print('Text: %s' % text)

# Walk through the text by restarting each search where the previous match
# ended, until no further match is found.
pos = 0
while True:
    match = pattern.search(text, pos)
    if not match:
        break
    s = match.start()
    e = match.end()
    # e - 1 is the index of the last matched character.
    print('  %d : %d = "%s"' % (s, e - 1, text[s:e]))
    pos = e

# 9. Dissecting matches with groups (any regular expression can be made a
#    group and nested inside a larger expression)
regex = re.compile(r'(\bt\w+)\W+(\w+)')

print('Input  text      : %s' % text)

print('Pattern          : %s' % regex.pattern)

match = regex.search(text)
# group(0) is the text matched by the whole expression; the parenthesised
# sub-groups are numbered from 1.
print('Entire match     : %s' % match.group(0))
print('World start with "t": %s' % match.group(1))   # first group
print('World after "t" word : %s' % match.group(2))  # second group

# Python extends the basic group syntax with named groups: (?P<name>pattern)

print(text)
print('')
for pattern in [r'^(?P<first_word>\w+)',
                r'(?P<last_word>\w+)\S*$',
                r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
                r'(?P<ends_with_t>\w+t)\b',
                ]:
    regex = re.compile(pattern)
    match = regex.search(text)
    print('Matching "%s"' % pattern)
    # groups() returns the captured groups as a tuple; groupdict() maps each
    # group name to the text it captured.
    print('  %s' % (match.groups(),))
    print('  %s' % (match.groupdict(),))
    print('')

继续学习中...........

时间： 2025-01-13 18:17:21

python re库-----学习(正则表达式)的相关文章

【python标准库学习】re模块

1.什么是re 正则表达式是一门相对通用的语言,在python中也有对正则表达式的支持,那就是内置的re模块.正则表达式就是一系列的规则去匹配字符串然后进行相应的操作,这些规则网上一搜一大片,而re则是运用正则表达式来提供一系列的功能强大的接口让我们来调用.通常我们在对日志文件进行操作的时候会对正则表达式运用的比较多来得到我们希望得到的数据. 2.python中的转义符正则表达式中通常用反斜杠'\'来代表转义,'\d'代表数字等,但是python本身也是通过反斜杠'\'来表示转义,所以就和正则表

【python标准库学习】thread，threading(二)多线程同步

继上一篇介绍了python的多线程和基本用法.也说到了python中多线程中的同步锁,这篇就来看看python中的多线程同步问题. 有时候很多个线程同时对一个资源进行修改,这个时候就容易发生错误,看看这个最简单的程序: import thread, time count = 0 def addCount(): global count for i in range(100000): count += 1 for i in range(10): thread.start_new_thread(ad

【python标准库学习】thread，threading(一)多线程的介绍和使用

在单个程序中我们经常用多线程来处理不同的工作,尤其是有的工作需要等,那么我们会新建一个线程去等然后执行某些操作,当做完事后线程退出被回收.当一个程序运行时,就会有一个进程被系统所创建,同时也会有一个线程运行,这个线程就是主线程main,在主线程中所创建的新的线程都是子线程,子线程通常都是做一些辅助的事.python中提供了thread和threading两个模块来支持多线程. python中使用线程有两种方式,第一种是用thread模块的start_new_thread函数,另一种是用threa

python第三方库学习之xlrd读取Excel文件

因为经常会涉及到从Excel表中导数据,所以就学习了python的xlrd来读取excel中的数据. 1.xlrd的安装 xlrd是python的第三方库,所以是需要自己安装的,可以在python的官网http://pypi.python.org/pypi/xlrd下载该模块来安装,也可以通过其他手段,比如easy_install或者pip啥的,我已经安装好pip所以就用最懒的方式来安装了pip install xlrd来安装. 2.分析excel文件的层级对象要读取excel的数据,就要了解

Python标准库01 正则表达式 (re包)

作者:Vamei 出处:http://www.cnblogs.com/vamei 欢迎转载,也请保留这段声明.谢谢! 我将从正则表达式开始讲Python的标准库.正则表达式是文字处理中常用的工具,而且不需要额外的系统知识或经验.我们会把系统相关的包放在后面讲解. 正则表达式(regular expression)主要功能是从字符串(string)中通过特定的模式(pattern),搜索想要找到的内容. 语法之前,我们简介了字符串相关的处理函数.我们可以通过这些函数实现简单的搜索功能,比如说从字

python标准库学习-random

想想这么多年,也是没有好好梳理一下自己的知识体系,以至于总是会有书到用时方恨少的遗憾. 最近既然有学习的动力,干脆就趁着这份工作不是特别忙的机会,写一点东西吧,也理理自己的逻辑思维能力. python有哪些库? 这个问题呢可以参照http://blog.csdn.net/python_wangjunji/article/details/8689297这篇博文来看. 当然咯,首先要先推荐一个可厉害的学习程序:Dash.学编程必备查询库,各种语言,专治"我要看源码病". 那第一篇呢,我就先

转 Python标准库01 正则表达式 (re包)

python requests库学习笔记（上）

尊重博客园原创精神,请勿转载! requests库官方使用手册地址:http://www.python-requests.org/en/master/:中文使用手册地址:http://cn.python-requests.org/zh_CN/latest/: requests库作者Kenneth Reitz个人主页:https://www.kennethreitz.org/: requests库github地址:https://github.com/requests/requests: requ

python re模块学习--正则表达式函数

这里主要介绍Python中常用的正则表达式处理函数.关于python中正则表达式的语法会再总结一篇博文. re.match re.match 尝试从字符串的开始匹配一个模式,如:下面的例子匹配第一个单词. 代码如下: #!/usr/bin/env python# -*- coding: utf-8 -*-import retext = "JGood is a handsome boy, he is cool, clever, and so on..."m = re.match(r&qu