实例
#!/usr/bin/python
# -*- coding: utf-8 -*-
'''
Created on Dec 6, 2013

@author: Jay <[email protected]>

@description: use PhantomJS to load a web page, wait until its title
              signals that rendering is complete, and convert the resulting
              HTML to a PDF file with pdfkit
'''

import datetime
import sys
import time
import urllib

# from pyquery import PyQuery as pq
import pdfkit
from selenium import webdriver
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0

# import db
# from db import exec_sql, fetchone_sql, fetchall_sql

if sys.version_info[0] == 2:
    # Python 2 only: switch the interpreter's default str/unicode codec to
    # UTF-8. ``reload`` is needed because ``site`` removes
    # ``sys.setdefaultencoding`` at startup. Neither name exists on Python 3,
    # hence the version guard.
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')


def spider_question(url, pdf_name="test2.pdf", timeout=60,
                    phantomjs_path='./phantomjs'):
    """Render the page at ``url`` in PhantomJS and write it out as a PDF.

    The page is loaded in a PhantomJS browser, then the function waits until
    the page title contains the text "complete". Once that happens, the
    ``innerHTML`` of the ``<html>`` element is wrapped in a doctype and
    ``<html>`` tags and handed to ``pdfkit.from_string``.

    Args:
        url: Address of the page to load.
        pdf_name: Path of the PDF file to write.
        timeout: Maximum number of seconds to wait for the title to contain
            "complete".
        phantomjs_path: Location of the PhantomJS executable; adjust it if
            the binary is not in the current working directory.

    Returns:
        The HTML string that was passed to pdfkit, or the literal string
        "response_500" if waiting for the title failed (for example because
        the timeout expired).
    """
    browser = webdriver.PhantomJS(executable_path=phantomjs_path)
    # browser.set_window_size(1024, 786)
    try:
        print('start request url %s' % datetime.datetime.now())
        browser.get(url)  # Load page
        print('end request url %s' % datetime.datetime.now())
        try:
            # Poll the title (every 500 ms with WebDriverWait's default
            # settings) and continue as soon as it contains "complete";
            # an exception is raised once ``timeout`` seconds have elapsed.
            WebDriverWait(browser, timeout).until(
                EC.title_contains("complete"))
        except Exception as e:
            # Deliberately broad: any failure while waiting is reported to
            # the caller as a sentinel string instead of propagating.
            print("http 500")
            print(e)
            return "response_500"
        print("end math load: %s" % datetime.datetime.now())
        html = browser.find_element_by_tag_name("html").get_attribute(
            "innerHTML")
    finally:
        # Always shut the PhantomJS process down, including when loading the
        # page or reading the DOM raises, so no browser process is leaked.
        browser.quit()

    html = '<!DOCTYPE html><html>' + html + "</html>"
    print('begin pdfkit: %s' % datetime.datetime.now())
    pdfkit.from_string(html, pdf_name)
    print("end pdfkit %s" % datetime.datetime.now())
    return html


def main():
    """Convert the hard-coded preview page to a PDF and print the time taken."""
    # Alternative test page:
    # url = "http://192.168.0.126/tea/test/wrongset/download/question/?day_list=20151103&user_id=849127&js=1"
    url = "http://192.168.0.126/open/math2_preview/?js=1&engine=webkit2&css=0"
    started = datetime.datetime.now()
    spider_question(url)
    finished = datetime.datetime.now()
    print(finished - started)
    # pdfkit.from_string("hello", 'test.pdf')


if __name__ == "__main__":
    main()
    print("completed")
时间: 2024-12-21 22:05:03