selenium配合phantomjs实现爬虫功能，并把抓取的数据写入excel

# -*- coding: UTF-8 -*-
'''
Created on 2016年5月13日

@author: csxie
'''
import datetime
from Base import BasePage
import ExcelOperation as excel
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.expected_conditions import text_to_be_present_in_element
import unittest
class JobLog(BasePage):
    """Scrape a paginated job-log table and export it to ``joblog.xls``.

    The test drives the browser created by ``BasePage``: it fills in the
    query form, walks every result page and writes the rows of each page
    to an Excel workbook through the ``ExcelOperation`` helpers.
    """

    url = ""  # target URL

    # 1-based column numbers of the result table that need special handling
    # in data rows (see ``getTableData``).
    _LINK_TITLE_COLS = (3, 5)  # value is read from the ``title`` of an <a>
    _HOURS_COL = 8             # text such as "7.5h", stored as a float

    def setUp(self):
        # BasePage spells its hook ``setUP``; forward to it explicitly so
        # unittest's real ``setUp`` hook creates the driver.
        BasePage.setUP(self)

    def tearDown(self):
        BasePage.tearDown(self)

    def test_querysql(self):
        """Query the matching records and write them to Excel."""
        driver = self.driver
        driver.get(self.url)

        # Query parameters:
        #   start: first day of the range (6 days before ``end``)
        #   end:   last day of the range (today)
        #   user:  person to filter on, or None to leave the field untouched
        end = datetime.date.today()
        start = end - datetime.timedelta(days=6)
        user = None

        # send_keys expects text, so the dates are sent in ISO form
        # (YYYY-MM-DD, the same text ``str(date)`` produces).
        # Start date
        txtStart = driver.find_element(by="id", value="ctl01_txtSelectDateFrom")
        txtStart.clear()
        txtStart.send_keys(start.isoformat())
        # End date
        txtEnd = driver.find_element(by="id", value="ctl01_txtSelectDateTo")
        txtEnd.clear()
        txtEnd.send_keys(end.isoformat())
        # User filter
        if user is not None:
            txtuser = driver.find_element(by="id", value="ctl01_txtUser")
            txtuser.clear()
            txtuser.send_keys(user)
        # Query button
        driver.find_element(by="id", value="ctl01_btnQueryInfo").click()

        # Total number of result pages
        totalPages = int(driver.find_element(
            by="id", value="ctl01_uc_CommonPager_lb_TotalPages").text)

        excelPath = 'joblog.xls'
        for page in range(1, totalPages + 1):
            # XPath of the result table; the column count is forced to 9
            listTable = self.getTableData(
                ".//*[@id='ctl01__wrLogList_gvlist']", colNO=9, loop=page)
            # Page number currently shown by the pager
            currentPage = int(driver.find_element(
                by="id", value="ctl01_uc_CommonPager_lb_PageNum").text)
            if currentPage == 1:
                # First page: start a fresh workbook (header row included)
                excel.save_to_excel_newsheet(listTable, excelPath=excelPath)
            else:
                # Later pages: append below the rows already saved
                excel.save_to_excel_oldsheet(listTable, excelPath=excelPath)
            if page != totalPages:
                # "Next page" button, then wait (up to 8 s) until the pager
                # label shows the next page number before reading the table
                driver.find_element(
                    by="id", value="ctl01_uc_CommonPager_NextPage").click()
                WebDriverWait(driver, 8).until(text_to_be_present_in_element(
                    ('id', 'ctl01_uc_CommonPager_lb_PageNum'), str(page + 1)))

    def getTableData(self, value, colTag='td', colNO=None, loop=None):
        """Read the table located by XPath ``value`` into a list of rows.

        value:  XPath of the table element.
        colTag: cell tag tried first for the first (header) row; when that
                lookup fails the row is read with ``th`` instead. Rows after
                the first are always read as ``td``.
        colNO:  forces the number of columns, which is needed when the table
                contains merged cells. When None, the count is the number of
                ``colTag`` cells in the first row, falling back to ``th``.
        loop:   number of the current page. Only when it equals 1 is the
                first (header) row kept; for any other value, including the
                default None, it is dropped.

        Returns a list with one list of cell values per table row. A cell
        that cannot be read is stored as "".
        """
        driver = self.driver
        table = driver.find_element(by="xpath", value=value)
        rows = table.find_elements(by="tag name", value="tr")  # all rows
        maxrowCount = len(rows)

        if colNO is None:
            # Derive the width from the first row instead of dividing the
            # total cell count by the row count, which is wrong whenever
            # header and data rows use different tags.
            maxcolCount = 0
            if rows:
                firstRowCells = rows[0].find_elements(by="tag name", value=colTag)
                if not firstRowCells:
                    firstRowCells = rows[0].find_elements(by="tag name", value="th")
                maxcolCount = len(firstRowCells)
        else:
            maxcolCount = colNO

        lists = []
        for i in range(1, maxrowCount + 1):
            rowValues = []
            for j in range(1, maxcolCount + 1):
                try:
                    if i == 1:
                        try:
                            innerText = driver.find_element(
                                by="xpath",
                                value="%s/tbody/tr[%d]/%s[%d]" % (value, i, colTag, j)).text
                        except Exception:
                            # Header cells are <th> on some tables; keep
                            # using that tag for the rest of this row.
                            colTag = "th"
                            innerText = driver.find_element(
                                by="xpath",
                                value="%s/tbody/tr[%d]/%s[%d]" % (value, i, colTag, j)).text
                    else:
                        colTag = "td"
                        cellPath = "%s/tbody/tr[%d]/%s[%d]" % (value, i, colTag, j)
                        if j in self._LINK_TITLE_COLS:
                            # The value is taken from the link's title
                            innerText = driver.find_element(
                                by="xpath", value=cellPath + "/a").get_attribute("title")
                        elif j == self._HOURS_COL:
                            innerText = driver.find_element(by="xpath", value=cellPath).text
                            innerText = float(innerText.replace("h", ""))
                        else:
                            innerText = driver.find_element(by="xpath", value=cellPath).text
                except Exception:
                    # Best effort: a missing or unparsable cell becomes "".
                    # ``Exception`` (not a bare except) lets Ctrl-C through.
                    innerText = ""
                rowValues.append(innerText)
            lists.append(rowValues)

        if loop != 1:  # not the first page: drop the header row
            return lists[1:]
        return lists

# Run the test cases defined in this module when it is executed as a script.
if __name__ == "__main__":
    unittest.main()

# -*- coding: UTF-8 -*-
'''
Created on 2016年5月18日

@author: csxie
'''
import xlwt
from xlrd import open_workbook
from xlutils.copy import copy
import os

def save_to_excel_newsheet(listTable, excelPath=r'C:\demo.xls', sheetName='sheet1'):
    """Write ``listTable`` to a brand-new workbook containing one sheet.

    listTable: non-empty list of rows; each row is a sequence of cell values.
               Cell ``[i][j]`` is written to row ``i``, column ``j``.
    excelPath: path of the .xls file to create. An existing file at that
               path is removed first.
    sheetName: name of the sheet added to the workbook.

    Returns True once the workbook has been saved.
    Raises TypeError when ``listTable`` is not a list and ValueError when it
    is empty; both checks happen before any file is touched.
    """
    if not isinstance(listTable, list):
        raise TypeError(u'传入的类型不是list')
    if not listTable:
        raise ValueError(u'传入的是list是空的')

    wkb = xlwt.Workbook()
    sheet = wkb.add_sheet(sheetName)
    # Iterate each row over its own length so a row shorter than the first
    # one does not raise IndexError.
    for i, row in enumerate(listTable):
        for j, cell in enumerate(row):
            sheet.write(i, j, cell)
    if os.path.exists(excelPath):
        os.remove(excelPath)
    wkb.save(excelPath)
    return True

def save_to_excel_oldsheet(listTable, excelPath=r'C:\demo.xls', sheetName='sheet1'):
    """Append ``listTable`` below the rows already present in a sheet.

    listTable: non-empty list of rows; each row is a sequence of cell values.
    excelPath: path of the existing .xls file; it is read, copied, extended
               and saved back to the same path.
    sheetName: name of the sheet to append to. ``list.index`` raises
               ValueError when the workbook has no sheet with that name.

    Returns True once the workbook has been saved.
    Raises TypeError when ``listTable`` is not a list and ValueError when it
    is empty; both checks happen before the workbook is opened.
    """
    if not isinstance(listTable, list):
        raise TypeError(u'传入的类型不是list')
    if not listTable:
        raise ValueError(u'传入的是list是空的')

    rb = open_workbook(excelPath)
    sheet_index = rb.sheet_names().index(sheetName)
    old_rows = rb.sheet_by_name(sheetName).nrows  # rows already in the sheet

    # The workbook opened for reading is copied into a writable one before
    # the new rows are added.
    wkb = copy(rb)
    sheet = wkb.get_sheet(sheet_index)
    # Iterate each row over its own length so a row shorter than the first
    # one does not raise IndexError.
    for i, row in enumerate(listTable):
        for j, cell in enumerate(row):
            sheet.write(i + old_rows, j, cell)
    wkb.save(excelPath)
    return True

# -*- coding: UTF-8 -*-
from selenium import webdriver
import unittest

class BasePage(unittest.TestCase):
    """Base test case that owns one PhantomJS WebDriver per test.

    ``setUp`` starts the browser and ``tearDown`` quits it, so subclasses
    can use ``self.driver`` directly in their test methods.
    """

    def setUp(self):
        """Start the PhantomJS driver and reset the verification-error list."""
        # NOTE(review): PhantomJS support appears to have been dropped from
        # recent Selenium releases - confirm the installed version still
        # provides webdriver.PhantomJS.
        self.driver = webdriver.PhantomJS(executable_path="phantomjs.exe")
        self.verificationErrors = []

    # Backward-compatible alias. The hook used to be spelled ``setUP``, which
    # unittest never calls on its own; callers that invoke
    # ``BasePage.setUP(self)`` explicitly keep working.
    setUP = setUp

    def tearDown(self):
        """Quit the browser, then fail the test if any errors were recorded."""
        self.driver.quit()
        self.assertEqual([], self.verificationErrors)

时间： 2024-10-12 02:29:38

selenium配合phantomjs实现爬虫功能，并把抓取的数据写入excel的相关文章

iOS开发——网络实用技术OC篇&网络爬虫－使用青花瓷抓取网络数据

网络爬虫-使用青花瓷抓取网络数据由于最近在研究网络爬虫相关技术,刚好看到一篇的的搬了过来! 望谅解..... 写本文的契机主要是前段时间有次用青花瓷抓包有一步忘了,在网上查了半天也没找到写的完整的教程,于是待问题解决后抽时间截了图,自己写一遍封存在博客园中以便以后随时查阅. charles又名青花瓷,在iOS开发中的抓包中具有重要作用.最大的三点用处,一就是拦截别人软件的发送的请求和后端接口,练习开发.二是自己后端返回的response拦截修改后再接收以达到测试临界数据的作用.三写脚本重复拦截

iOS开发——网络使用技术OC篇&网络爬虫－使用正则表达式抓取网络数据

网络爬虫-使用正则表达式抓取网络数据关于网络数据抓取不仅仅在iOS开发中有,其他开发中也有,也叫网络爬虫,大致分为两种方式实现 1:正则表达 2:利用其他语言的工具包:java/Python 先来看看网络爬虫的基本原理: 一个通用的网络爬虫的框架如图所示: 网络爬虫的基本工作流程如下: 1.首先选取一部分精心挑选的种子URL: 2.将这些URL放入待抓取URL队列: 3.从待抓取URL队列中取出待抓取在URL,解析DNS,并且得到主机的ip,并将URL对应的网页下载下来,存储进已下载网页库中.

[Python爬虫] 之四：Selenium 抓取微博数据

抓取代码: # coding=utf-8import osimport refrom selenium import webdriverimport selenium.webdriver.support.ui as uifrom selenium.webdriver.common.keys import Keysimport timefrom selenium.webdriver.common.action_chains import ActionChainsimport IniFileclas

Python爬虫实战四之抓取淘宝MM照片

福利啊福利,本次为大家带来的项目是抓取淘宝MM照片并保存起来,大家有没有很激动呢? 最新动态更新时间:2015/8/2 最近好多读者反映代码已经不能用了,原因是淘宝索引页的MM链接改了.网站改版了,URL的索引已经和之前的不一样了,之前可以直接跳转到每个MM的个性域名,现在中间加了一个跳转页,本以为可以通过这个页面然后跳转到原来的个性域名,而经过一番折腾发现,这个跳转页中的内容是JS动态生成的,所以不能用Urllib库来直接抓取了,本篇就只提供学习思路,代码不能继续用了. 之后博主会利用其它方

网络爬虫: 从allitebooks.com抓取书籍信息并从amazon.com抓取价格(2): 抓取allitebooks.com书籍信息及ISBN码

这一篇首先从allitebooks.com里抓取书籍列表的书籍信息和每本书对应的ISBN码. 一.分析需求和网站结构 allitebooks.com这个网站的结构很简单,分页+书籍列表+书籍详情页. 要想得到书籍的详细信息和ISBN码,我们需要遍历所有的页码,进入到书籍列表,然后从书籍列表进入到每本书的详情页里,这样就能够抓取详情信息和ISBN码了. 二.从分页里遍历每一页书籍列表通过查看分页功能的HTML代码,通过class＝"current"可以定位当前页码所在span标签,此s

[转载]爬虫的自我解剖(抓取网页HtmlUnit)

网络爬虫第一个要面临的问题,就是如何抓取网页,抓取其实很容易,没你想的那么复杂,一个开源HtmlUnit包,4行代码就OK啦,例子如下: 1 2 3 4 final WebClient webClient=new WebClient(); final HtmlPage page=webClient.getPage("http://www.yanyulin.info"); System.out.println(page.asText()); webClient.closeAllWindo

基于Node.js的强大爬虫能直接发布抓取的文章哦

基于Node.js的强大爬虫能直接发布抓取的文章哦基于Node.js的强大爬虫能直接发布抓取的文章哦!本爬虫源码基于WTFPL协议,感兴趣的小伙伴们可以参考一下一.环境配置 1)搞一台服务器,什么linux都行,我用的是CentOS 6.5: 2)装个mysql数据库,5.5或5.6均可,图省事可以直接用lnmp或lamp来装,回头还能直接在浏览器看日志: 3)先安个node.js环境,我用的是0.12.7,更靠后的版本没试过: 4)执行npm -g install forever,安装f

分布式爬虫：使用Scrapy抓取数据

分布式爬虫:使用Scrapy抓取数据 Scrapy是Python开发的一个快速,高层次的屏幕抓取和web抓取框架,用于抓取web站点并从页面中提取结构化的数据.Scrapy用途广泛,可以用于数据挖掘.监测和自动化测试. 官方主页: http://www.scrapy.org/ 中文文档:Scrapy 0.22 文档 GitHub项目主页:https://github.com/scrapy/scrapy Scrapy 使用了 Twisted 异步网络库来处理网络通讯.整体架构大致如下(注:图片来自

iOS—网络实用技术OC篇&网络爬虫－使用java语言抓取网络数据

网络爬虫-使用java语言抓取网络数据 前提:熟悉java语法(能看懂就行) 准备阶段:从网页中获取html代码 实战阶段:将对应的html代码使用java语言解析出来,最后保存到plist文件 上一篇文章已经介绍我们可以使用两个方式来抓取网络数据实现网络爬虫,并且大致介绍了一下怎么使用正则表达式去实现数据的抓取 由于笔者曾经学过一段时间java和android相关的技术,今天就讲讲怎么使用java去抓取网络数据,关于Python有机会等笔者好好研究一下再来分享,但其实会一种就可以,除非你的需求