python 爬虫获取世界杯比赛赛程

#!/usr/bin/python
# -*- coding:utf8 -*-

import requests
import re
import os
import time
# from urllib
import json
from bs4 import BeautifulSoup
from datetime import date

def getTimeExpire(time_play,time_gap):
    """Shift a local timestamp string by a number of seconds.

    Args:
        time_play: Local time formatted as "%Y-%m-%d %H:%M:%S".
        time_gap: Offset in seconds (not milliseconds); a negative value
            moves the result earlier.

    Returns:
        The shifted local time in the same "%Y-%m-%d %H:%M:%S" format, or
        an empty string when ``time_play`` cannot be parsed.
    """
    try:
        time_arr = time.strptime(time_play, "%Y-%m-%d %H:%M:%S")
    except (ValueError, TypeError):
        # ValueError: text does not match the format; TypeError: not a string.
        # Report the failure and return '' so the caller gets a string back.
        print('时间转化失败')
        return ''
    # mktime/localtime work in whole seconds since the epoch, in local time.
    shifted = time.localtime(time.mktime(time_arr) + time_gap)
    return time.strftime('%Y-%m-%d %H:%M:%S', shifted)
def getHtml():
    """Parse the locally saved schedule page and append its fixtures to a CSV file.

    Reads the saved HTML page, walks every day group under
    ``.b-pull-refresh-content`` and, for each match item in it, appends one
    row ``team1,team2,time_expire,time_play`` to the CSV file, where
    ``time_expire`` is ``time_play`` moved 10 minutes earlier. A header line
    is appended on every call because the file is opened in append mode.

    Raises:
        OSError: If the HTML page or the CSV file cannot be opened.
        IndexError: If a day group has no title, or a match item lacks two
            team names or a status text.
    """
    # TODO: fetch the page directly from the website; the site paginates its results.
    with open('F:\\test\\python\\worldcup.html', 'r', encoding='utf-8') as page:
        soup = BeautifulSoup(page.read(), 'lxml')
    nodes = soup.select('.b-pull-refresh-content > div')

    filename = "F:\\test\\python\\worldcup.csv"
    # Open the output once for the whole run instead of once per row.
    with open(filename, 'a') as out:
        # CSV header line
        out.write('team1,team2,time_expire,time_play \n')
        for node in nodes:
            day_title = node.select('.wa-match-schedule-list-title')[0].get_text().strip()
            for item in node.select('.sfc-contacts-list .wa-match-schedule-list-item'):
                names = item.select('.wa-tiyu-schedule-item-name.c-line-clamp1')
                team1 = names[0].get_text().strip()
                team2 = names[1].get_text().strip()
                kickoff = item.select('.status-text')[0].get_text().strip()
                # NOTE(review): characters 2..7 of the title are presumably
                # "MM-DD " including the separating space -- confirm against
                # the page; without that space getTimeExpire cannot parse
                # the value and time_expire is written as an empty field.
                time_play = '2018-' + day_title[2:8] + kickoff + ':00'
                time_expire = getTimeExpire(time_play, -10*60)
                out.write(team1 + ',' + team2 + ',' + time_expire + ',' + time_play + '\n')
#getHtml()

def getFromAPI():
    """Fetch upcoming World Cup fixtures from the Baidu sports API into a CSV file.

    Requests the match list for 15 dates in two-day steps and, for every
    returned match whose ``startTime`` is later than the current local time,
    appends one row ``left team,right team,time_expire,startTime`` to the CSV
    file, where ``time_expire`` is ``startTime`` moved 10 minutes earlier.
    A date whose request fails or whose body is not JSON is reported and
    skipped so the remaining dates are still fetched.
    """
    filename = "F:\\test\\python\\worldcupFromAPI.csv"
    month = 6
    day = 11
    # Intended to cover 2018-06-14 through 07-15; the dates actually
    # requested run from 2018-6-13 to 2018-7-11 in two-day steps.
    for _ in range(0, 15):
        day += 2
        if day > 30:
            month += 1
            day = 1
        # The percent-encoded path segment is the UTF-8 text "世界杯".
        url = "http://tiyu.baidu.com/api/match/%E4%B8%96%E7%95%8C%E6%9D%AF/live/date/2018-" + str(month) + '-' + str(day) + "/direction/after?from=self"
        # Pause one second before each request.
        time.sleep(1)
        try:
            data = json.loads(requests.get(url, timeout=3).text)
        except (requests.RequestException, ValueError) as err:
            # Network error/timeout, or a body that is not valid JSON.
            print('request failed for ' + url + ': ' + str(err))
            continue
        if data['status'] != '0':
            continue
        print('为0')
        # Same fixed-width format as startTime, so string comparison orders by time.
        now = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        with open(filename, 'a') as out:
            for matches in data['data']:
                for m in matches['list']:
                    if m['startTime'] > now:
                        out.write(m['leftLogo']['name'] + ',' + m['rightLogo']['name'] + ',' + getTimeExpire(m['startTime'], -10*60) + ',' + m['startTime'] + '\n')

# Module-level entry point: the API crawl runs as soon as this file is executed or imported.
getFromAPI()

原文地址：https://www.cnblogs.com/cao-zhen/p/9215222.html

时间： 2024-10-10 17:14:17

python 爬虫获取世界杯比赛赛程的相关文章

Python爬虫获取图片并下载保存至本地的实例

今天小编就为大家分享一篇Python爬虫获取图片并下载保存在本地的实例,具有很好的参考价值,希望对大家有所帮助.一起来看看吧! 1.抓取煎蛋网上的图片 2.代码如下 * * * import urllib.request import os def url_open(url): req=urllib.request.Request(url) req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Geck

python爬取足球比赛赛程笔记

目标:爬取某网站比赛赛程,动态网页,则需找到对应ajax请求(具体可参考:https://blog.csdn.net/you_are_my_dream/article/details/53399949) # -*- coding:utf-8 -*- import sys import re import urllib.request link = "https://***" r = urllib.request.Request(link) r.add_header('User-Agen

python 爬虫获取文件式网站资源完整版（基于python 3.6）

<--------------------------------下载函数-----------------------------> import requests import threading # 传入的命令行参数,要下载文件的url # url = 'http://www.nco.ncep.noaa.gov/pmb/codes/nwprod/nosofs.v3.0.4/fix/cbofs/nos.cbofs.romsgrid.nc' def Handler(start, end, url

Python爬虫获取JSESSIONID登录网站

在使用Python对一些网站的数据进行采集时,经常会遇到需要登录的情况.这些情况下,使用FireFox等浏览器登录时,自带的调试器(快捷键F12)就可以看到登录的时候网页向服务器提交的信息,把这部分信息提取出来就可以利用Python 的 urllib2 库结合Cookie进行模拟登录然后采集数据,如以下代码: #coding=utf-8 import urllib import urllib2 import httplib import cookielib url = 'http://www.x

Python爬虫获取迅雷会员帐号

代码如下: 1 import re 2 import urllib.request 3 import urllib 4 import time 5 6 from collections import deque 7 8 head = { 9 'Connection': 'Keep-Alive', 10 'Accept': 'text/html, application/xhtml+xml, */*', 11 'Accept-Language': 'en-US,en;q=0.8,zh-Hans-C

python 爬虫获取文件式网站资源（基于python 3.6）

import urllib.request from bs4 import BeautifulSoup from urllib.parse import urljoin from Cat.findLinks import get_link from Cat.Load import Schedule import os import time import errno -------import的其余包代码---------------- def get_link(page): # 寻找链接的href

python爬虫获取豆瓣网前250部电影的详细信息

网址 https://movie.douban.com/top250 一共250部电影,有分页,获取每一部的详细信息.不采用框架,使用 urllib 读取网页,re进行正则表达式匹配,lxml进行xpath查找 1 from film import * 2 from urllib import request 3 import time,re 4 url=r'https://movie.douban.com/top250?start=' 5 for i in range(10): 6 url=ur

python爬虫获取校园网新闻

首先打开校园网(以我的学校为例"http://www.zhbit.com/") 现在我们需要获取上图红色框框的数据并输出下面我们打开浏览器的开发模式并定位到相应的代码不难发现,学校要闻只显示5条信息下面我们就开始写程序 # -*- coding:utf-8 -*- import urllib import urllib2 import re url = "http://www.zhbit.com/" #利用urllib2模块打开校园网 res = urllib

python爬虫获取图片

import re import os import urllib #根据给定的网址来获取网页详细信息,得到的html就是网页的源代码 def getHtml(url): page = urllib.request.urlopen(url) html = page.read() return html.decode('UTF-8') def getImg(html): reg = r'src="(.+?\.jpg)" pic_ext' imgre = re.compile(reg) i