All of the code below can be copied into a single Python file and run directly.
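# A note ahead of the numbered sections (an addition, not from the original
# post): the file examples below close files by hand with close(); the with
# statement does the same job and closes the file automatically, even when an
# exception occurs. 'demo.txt' is just an illustrative file name.
with open('demo.txt', 'w') as demo_file:
    demo_file.write("hello")
# demo_file is already closed here, with no explicit close() call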
# 1. File operations
# Built-in functions and methods:
# open()      open a file
# read()      read the contents
# readline()  read a single line
# seek()      move the file pointer
# write()     write to the file
# close()     close the file

# Write to a file (name.txt is created once this runs)
file1 = open('name.txt', 'w')
file1.write("20200202")
file1.close()

# Read the file
file2 = open('name.txt')
text = file2.read()
print(text)
file2.close()

# Append to the file ('a' mode); a "\n" in the string starts a new line
file3 = open('name.txt', 'a')
file3.write("\n11111")
file3.close()

# Read one line
file4 = open('name.txt')
print(file4.readline())
file4.close()

# Read line by line
file5 = open('name.txt')
for line in file5.readlines():
    print(line)
file5.close()

# After a read, the file pointer sits past what was read; seek(0) moves it
# back to the start of the file
file6 = open('name.txt')
print(file6.readline())
print(file6.seek(0))
file6.close()

# 2. Detecting and handling exceptions
try:
    a = 1 / 0
except Exception as e:
    print('Caught exception: %s' % e)
finally:
    print('This clause always runs')

# 3. Variable-length arguments
def howLong(first, *other):
    print(first)
    print(other)

howLong('123', '1222', '1111')

# 4. Iterators and generators
list1 = [1, 2, 3]
it = iter(list1)
# next() pulls items from the iterator
print(next(it))
print(next(it))
print(next(it))

def frange(start, stop, step):
    x = start
    while x < stop:
        # the yield keyword makes this function a generator
        yield x
        x += step

for i in frange(10, 12, 0.5):
    print(i)

# 5. Lambda expressions: anonymous functions
add = lambda x, y: x + y
print(add(2, 4))

# 6. Built-in functions
a = [1, 2, 34, 5, 6]
# filter(): keep the numbers in a that are greater than 2
print(list(filter(lambda x: x > 2, a)))
# map(): add one to every number in a
print(list(map(lambda x: x + 1, a)))
# map() over several lists: add the corresponding elements of a and b
b = [3, 4, 5, 9]
print(list(map(lambda x, y: x + y, a, b)))
# reduce() needs an import: accumulate a sum, starting from 4
from functools import reduce
print(reduce(lambda x, y: x + y, [1, 2, 3], 4))
# zip(): swap the keys and values of a dict
dicta = {'aa': 'a', 'bb': 'b', 'cc': 'c'}
dictc = zip(dicta.values(), dicta.keys())
print(list(dictc))

# 7. Closures: nested functions
def sum(a):  # note: this shadows the built-in sum()
    def add(b):
        return a + b
    return add

num27 = sum(2)
print(num27(4))

# 8. Multithreading
import threading
from threading import current_thread

class Mythread(threading.Thread):
    def run(self):
        print(current_thread().getName(), 'start')
        print('run')
        print(current_thread().getName(), 'end')

t1 = Mythread()
t1.start()
t1.join()  # thread synchronization: wait for t1 to finish
print(current_thread().getName(), 'end')

# 9. Regular expressions (re)
# .      matches any single character
# ^      anchors the match at the start of the string
# $      anchors the match at the end of the string
# *      the preceding character appears 0 to n times
# +      the preceding character appears 1 to n times
# ?      the preceding character appears 0 or 1 times
# {m}    the preceding character appears exactly m times
# {m,n}  the preceding character appears m to n times
# []     matches any one of the characters inside the brackets
# |      alternation: match either the left or the right side
# \d     matches a digit
# \D     matches a non-digit
# \s     matches a whitespace character
# ()     grouping
import re

p = re.compile('.{3}')   # any character, three times
print(p.match('d'))      # None: 'd' is only one character long
p1 = re.compile('jpg$')  # strings ending in jpg
print(p1.match('d'))
p2 = re.compile('ca*')   # 'c' followed by zero or more 'a's
print(p2.match('cat'))
p3 = re.compile('a{4}')    # 'a' exactly four times
print(p3.match('caaaat'))  # None: match() anchors at the start of the string
p4 = re.compile('c[bcd]t')  # any one of b, c, d in the middle
print(p4.match('cat'))      # None: 'a' is not in [bcd]
# Grouping
p5 = re.compile(r'(\d+)-(\d+)-(\d+)')
print(p5.match('2019-02-02'))           # match a date
print(p5.match('2019-02-02').group(1))  # the year group
year, month, day = p5.match('2019-02-02').groups()
print(year, month, day)
# match() only matches at the start of the string; search() scans the whole string
print(p5.match('aaa2019-02-02'))   # None
print(p5.search('aaa2019-02-02'))
# sub(): replace matches
phone = '123-456-789 # this is a phone number'
print(re.sub(r'#.*$', '', phone))  # drop everything from the hash onwards
print(re.sub(r'\D', '', phone))    # drop every non-digit character
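# An extra sketch (an addition, not from the original post): the same date
# pattern written with named groups, so each group is read by name instead of
# by position. p6 and m6 are illustrative names.
p6 = re.compile(r'(?P<year>\d+)-(?P<month>\d+)-(?P<day>\d+)')
m6 = p6.match('2019-02-02')
if m6:
    print(m6.group('year'), m6.group('month'), m6.group('day'))  # 2019 02 02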
# 10. Date and time libraries
import time
print(time.time())  # seconds since 1970 (the Unix epoch)
print(time.localtime())
print(time.strftime('%Y-%m-%d %H:%M:%S'))

import datetime
# datetime supports date arithmetic
print(datetime.datetime.now())
new_time = datetime.timedelta(minutes=10)
print(datetime.datetime.now() + new_time)  # the time ten minutes from now
one_day = datetime.datetime(2019, 9, 9)
new_day = datetime.timedelta(days=10)
print(one_day + new_day)

# 11. Fetching web pages with urllib
from urllib import request

url = 'http://www.baidu.com'
response = request.urlopen(url, timeout=1)
# print(response.read().decode('utf-8'))

# 12. GET and POST requests
from urllib import parse
from urllib import request

data = bytes(parse.urlencode({'world': 'hello'}), encoding='utf8')
# print(data)
response = request.urlopen('http://httpbin.org/post', data=data)
# print(response.read().decode('utf-8'))

import urllib.error
import socket

try:
    response2 = request.urlopen('http://httpbin.org/get', timeout=1)
    # print(response2.read())
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):
        print("time out")

# 13. The requests library
import requests

# GET request: the dict is sent as URL query parameters
url2131 = 'http://httpbin.org/get'
data2131 = {'key': 'value', 'abc': 'xyz'}
response2131 = requests.get(url2131, data2131)
# print(response2131.text)
# POST request: the dict is sent as the form body
url2132 = 'http://httpbin.org/post'
data2132 = {'key': 'value', 'abc': 'xyz'}
response2132 = requests.post(url2132, data2132)
# print(response2132.json())

# 14. Scraping links with regular expressions
# import requests
# import re
content = requests.get('http://www.cnu.cc/discoveryPage/hot-人像').text
# print(content)
patter2141 = re.compile(r'<a href="(.*?)".*?title">(.*?)</div>', re.S)
results2141 = re.findall(patter2141, content)
# print(results2141)
for result2141 in results2141:
    url2141, name2141 = result2141
    # print(url2141, re.sub(r'\s', '', name2141))

# 15. Installing and using Beautiful Soup
# pip3 install bs4
from bs4 import BeautifulSoup

soup = BeautifulSoup(content, 'lxml')
# print(soup.prettify())        # pretty-print the document
# print(soup.title)             # the <title> tag
# print(soup.title.string)      # the title text
# print(soup.p)                 # the first <p> tag
# print(soup.a)                 # the first <a> tag
# print(soup.find(id='link3'))  # the tag with id="link3"
# Print the link of every <a> tag
# for link in soup.find_all('a'):
#     print(link.get('href'))
# print(soup.get_text())        # all text in the document

# 16. Scraping page titles
# from bs4 import BeautifulSoup
# import requests
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "close",
    "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1",
    "Referer": "http://www.infoq.com",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER"
}
url2161 = 'https://www.infoq.com/news/'

# Fetch the full page content
def craw(url2162):
    response2162 = requests.get(url2162, headers=headers)
    print(response2162.text)

# craw(url2161)

# Extract the news titles
def craw2(url2163):
    response2163 = requests.get(url2163, headers=headers)
    soup2163 = BeautifulSoup(response2163.text, 'lxml')
    for title_href in soup2163.find_all('div', class_='items__content'):
        print([title.get('title')
               for title in title_href.find_all('a') if title.get('title')])

# craw2(url2161)

# # Paging
# for i in range(15, 46, 15):
#     url2164 = 'http://www.infoq.com/news/' + str(i)
#     # print(url2164)
#     craw2(url2164)
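# A defensive sketch (an addition, not from the original post): check the HTTP
# status before parsing, so a blocked or failed request fails loudly instead of
# silently producing an empty soup. safe_get is a hypothetical helper name.
def safe_get(url_to_fetch):
    response_sg = requests.get(url_to_fetch, headers=headers, timeout=5)
    response_sg.raise_for_status()  # raises requests.HTTPError on 4xx/5xx
    return response_sg

# Usage would be: soup2163 = BeautifulSoup(safe_get(url2161).text, 'lxml')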
"Accept-Language": "zh-CN,zh;q=0.8", "Connection": "close", "Cookie": "_gauges_unique_hour=1; _gauges_unique_day=1; _gauges_unique_month=1; _gauges_unique_year=1; _gauges_unique=1", "Referer": "http://www.infoq.com", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36 LBBROWSER" } url = ‘http://www.infoq.com/presentations‘ # 下载图片 # Requests 库封装复杂的接口,提供更人性化的 HTTP 客户端,但不直接提供下载文件的函数。 # 需要通过为请求设置特殊参数 stream 来实现。当 stream 设为 True 时, # 上述请求只下载HTTP响应头,并保持连接处于打开状态, # 直到访问 Response.content 属性时才开始下载响应主体内容 def download_jpg(image_url, image_localpath): response = requests.get(image_url, stream=True) if response.status_code == 200: with open(image_localpath, ‘wb‘) as f: response.raw.deconde_content = True shutil.copyfileobj(response.raw, f) # 取得演讲图片 def craw3(url): response = requests.get(url, headers=headers) soup = BeautifulSoup(response.text, ‘lxml‘) for pic_href in soup.find_all(‘div‘, class_=‘items__content‘): for pic in pic_href.find_all(‘img‘): imgurl = pic.get(‘src‘) dir = os.path.abspath(‘.‘) filename = os.path.basename(imgurl) imgpath = os.path.join(dir, filename) print(‘开始下载 %s‘ % imgurl) download_jpg(imgurl, imgpath) # craw3(url) # 翻页 j = 0 for i in range(12, 37, 12): url = ‘http://www.infoq.com/presentations‘ + str(i) j += 1 print(‘第 %d 页‘ % j) craw3(url)
Original article: https://www.cnblogs.com/LiLiliang/p/12287705.html