"""Find every URL in an HTML file.

Problem:
    Find all URLs contained in an HTML file.

Approach:
    Match them with a regular expression.
"""

import re

# Every group is non-capturing so a match is always the complete URL; the
# pieces are: scheme, host (domain name or dotted IPv4), optional port,
# optional path/query/fragment.
URL_PATTERN = re.compile(
    r"(?:https?|ftp)://"
    r"(?:[a-zA-Z0-9._-]+\.[a-zA-Z]{2,6}"
    r"|[0-9]{1,3}(?:\.[0-9]{1,3}){3})"
    # Port numbers go up to 65535, so allow five digits.
    r"(?::[0-9]{1,5})?"
    # Explicit character set: markup characters such as '<', '>' and quotes
    # are left out so a URL stops where the surrounding HTML resumes.
    r"(?:/[a-zA-Z0-9&%_./~?=#:;@-]*)?"
)


def find_urls(text: str) -> list[str]:
    """Return all URLs found in *text*, in order of appearance.

    Args:
        text: The text (typically HTML source) to scan.

    Returns:
        A list with the full text of each match of ``URL_PATTERN``; empty
        when nothing matches.
    """
    # group(0) is the whole match, independent of how the pattern is grouped.
    return [match.group(0) for match in URL_PATTERN.finditer(text)]


def main(path: str = "test.txt") -> None:
    """Print every URL found in the file at *path*, one per line.

    Args:
        path: File to scan. Defaults to ``test.txt``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The pattern only matches ASCII characters, so undecodable bytes are
    # replaced rather than allowed to abort the scan.
    with open(path, encoding="utf-8", errors="replace") as fp:
        text = fp.read()

    for url in find_urls(text):
        print(url)


if __name__ == "__main__":
    main()
# Time: 2024-10-14 09:05:58