一、环境
项目:maven项目
数据库:mysql
二、项目介绍
我们要爬取的页面是 https://shimo.im/doc/iKYXMBsZ5x0kui8P
假设我们需要进入这个页面,爬取页面里面的所有电影百度云链接,并保存在mysql数据库里。
三、pom.xml配置
首先我们需要新建一个maven项目,并在pom.xml配置如下jar包。
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>com.jk</groupId>
  <artifactId>shimo</artifactId>
  <version>1.0-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>shimo</name>
  <!-- FIXME change it to the project's website -->
  <url>http://www.example.com</url>

  <properties>
    <!--
      Entry point written into the shaded jar's manifest. The class that defines main() is
      ShiMo2ChromeProcessor; the com.jk package is assumed from the groupId, so adjust it
      if the class lives in a different package.
    -->
    <application.class>com.jk.ShiMo2ChromeProcessor</application.class>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.7</maven.compiler.source>
    <maven.compiler.target>1.7</maven.compiler.target>
  </properties>

  <dependencies>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>

    <!-- WebMagic crawler framework -->
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-core</artifactId>
      <version>0.7.3</version>
    </dependency>
    <dependency>
      <groupId>us.codecraft</groupId>
      <artifactId>webmagic-extension</artifactId>
      <version>0.7.3</version>
    </dependency>

    <!-- Selenium, used to render the JavaScript-driven page in a real browser -->
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-java</artifactId>
      <version>3.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-chrome-driver</artifactId>
      <version>3.0.1</version>
    </dependency>
    <dependency>
      <groupId>org.seleniumhq.selenium</groupId>
      <artifactId>selenium-remote-driver</artifactId>
      <version>3.0.1</version>
    </dependency>
    <dependency>
      <groupId>com.codeborne</groupId>
      <artifactId>phantomjsdriver</artifactId>
      <version>1.2.1</version>
    </dependency>
    <dependency>
      <groupId>org.apache.commons</groupId>
      <artifactId>commons-exec</artifactId>
      <version>1.3</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
    <dependency>
      <groupId>mysql</groupId>
      <artifactId>mysql-connector-java</artifactId>
      <version>5.1.6</version>
    </dependency>
  </dependencies>

  <build>
    <plugins>
      <!-- Builds a single runnable jar that contains all dependencies -->
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.2</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <!-- Single source of truth: the application.class property above -->
                  <mainClass>${application.class}</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>
</project>
三、下载谷歌浏览器和谷歌浏览器驱动
我们这里采用selenium爬取动态网页,这也是目前比较常见的方法。如果不清楚这个,可以提前看看。我们需要用下面代码来模拟一个谷歌浏览器,其中chromebin就是你电脑下载的谷歌浏览器路径,chromedriver是谷歌浏览器驱动,userdata是你下载谷歌浏览器后的User Data文件夹路径。下载路径 链接:https://pan.baidu.com/s/1NnMdRfEXdwBo-ltpP-J4Sw 提取码:jqnx
// Start a Chrome-backed WebDriver through the TestChromeDriver helper, passing the
// Chrome executable path, the chromedriver executable path and the user-data directory.
WebDriver driver = TestChromeDriver.getChromeDriver(chromebin,chromedriver,userdata);
驱动下载之后随便安装在哪个盘里都可以,但是路径一定要记得。
谷歌浏览器下载之后点击一下,自动帮你安装在C盘,桌面也有图标显示,chromebin和userdata的路径可以通过图标属性找到。
四、将要使用的参数放在config.properties配置文件下
# Database connection settings
db_url=jdbc:mysql://localhost:3306/ziyuan?useUnicode=true&characterEncoding=utf-8
db_username=root
db_password=962464

# The three paths described above: Chrome executable, chromedriver executable
# and Chrome's "User Data" directory
chromebin=C:\\Users\\hasee\\AppData\\Local\\Google\\Chrome\\Application\\chrome.exe
chromedriver=G:\\new\\chromedriver\\chromedriver.exe
userdata=C:\\Users\\hasee\\AppData\\Local\\Google\\Chrome\\User Data

# Name of the database table the crawler writes to
db_table=shimo

# URL of the document to crawl
guochan=https://shimo.im/doc/iKYXMBsZ5x0kui8P
五、连接数据库的JavaBean
/**
 * Mutable holder for the three values needed to open a JDBC connection:
 * the JDBC URL, the user name and the password.
 */
public class DataSourceModel {

    /** JDBC connection URL. */
    private String url;

    /** Database user name. */
    private String username;

    /** Database password. */
    private String password;

    /** Creates an empty model; all fields start out as {@code null}. */
    DataSourceModel() {
    }

    /** @return the JDBC connection URL, or {@code null} if not set */
    public String getUrl() {
        return this.url;
    }

    /** @param url the JDBC connection URL */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the database user name, or {@code null} if not set */
    public String getUsername() {
        return this.username;
    }

    /** @param username the database user name */
    public void setUsername(String username) {
        this.username = username;
    }

    /** @return the database password, or {@code null} if not set */
    public String getPassword() {
        return this.password;
    }

    /** @param password the database password */
    public void setPassword(String password) {
        this.password = password;
    }
}
六、爬虫保存到数据库的JavaBean
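在mysql数据库里创建一个表格(表名与config.properties中的db_table一致,例如shimo),表中需要包含name、url、createtime、updatetime、path、rengong、type这几个字段,与下面的JavaBean一一对应。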
/**
 * One crawled entry as it is written to the database. Every property is a plain
 * string with a getter and a setter; unset properties are {@code null}.
 */
public class Shimo {

    /** Title of the entry. */
    private String name;

    /** Link text stored for the entry. */
    private String url;

    /** Creation date, kept as text. */
    private String createtime;

    /** Last update date, kept as text. */
    private String updatetime;

    /** Address of the page the entry was found on. */
    private String path;

    /** Flag column stored as text. */
    private String rengong;

    /** Category of the entry. */
    private String type;

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getUrl() {
        return this.url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public String getCreatetime() {
        return this.createtime;
    }

    public void setCreatetime(String createtime) {
        this.createtime = createtime;
    }

    public String getUpdatetime() {
        return this.updatetime;
    }

    public void setUpdatetime(String updatetime) {
        this.updatetime = updatetime;
    }

    public String getPath() {
        return this.path;
    }

    public void setPath(String path) {
        this.path = path;
    }

    public String getRengong() {
        return this.rengong;
    }

    public void setRengong(String rengong) {
        this.rengong = rengong;
    }

    public String getType() {
        return this.type;
    }

    public void setType(String type) {
        this.type = type;
    }
}
// 七、Processor类 (section 7: the Processor class follows)
public class ShiMo2ChromeProcessor implements PageProcessor { static Properties properties; static DataSourceModel dataSourceModel; static String chromebin; static String chromedriver; static String userdata; static String table; static String runTime; static String quanji; static String guochan; static String oumei; static String yingdan; static String dongmanbl; static String taiguoyuenanyindu; static String hanguo; static String riben; static{ properties=Utils.loadConfig("/config.properties"); dataSourceModel=new DataSourceModel(); dataSourceModel.setUrl(properties.getProperty("db_url")); dataSourceModel.setUsername(properties.getProperty("db_username")); dataSourceModel.setPassword(properties.getProperty("db_password")); chromebin=properties.getProperty("chromebin"); chromedriver=properties.getProperty("chromedriver"); userdata=properties.getProperty("userdata"); table=properties.getProperty("db_table"); runTime=properties.getProperty("runTime"); quanji=properties.getProperty("quanji"); guochan=properties.getProperty("guochan"); oumei=properties.getProperty("oumei"); yingdan=properties.getProperty("yingdan"); dongmanbl=properties.getProperty("dongmanbl"); taiguoyuenanyindu=properties.getProperty("taiguoyuenanyindu"); hanguo=properties.getProperty("hanguo"); riben=properties.getProperty("riben"); } private String keyWord; private Site site = Site .me() .setCharset("UTF-8") .setCycleRetryTimes(3) .setSleepTime(3 * 1000) .addHeader("Connection", "keep-alive") .addHeader("Cache-Control", "max-age=0") .addHeader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0"); public ShiMo2ChromeProcessor() { } @Override public Site getSite() { return site; } @Override public void process(Page page){ WebDriver driver = TestChromeDriver.getChromeDriver(chromebin,chromedriver,userdata); driver.manage().window().maximize();//窗口最大化 try { driver.get(page.getRequest().getUrl()); Thread.sleep(10000);//10s //查找下一个链接 ananyDetail(driver); 
driver.quit();//浏览器退出 } catch (Exception e) { e.printStackTrace(); driver.quit();//浏览器退出 } } public static void ananyDetail(WebDriver driver) throws Exception{ //类型 String type=driver.getTitle(); List<WebElement> list=driver.findElements(By.className("gutter-author-6748903")); for(WebElement webElement:list){ try { List font=webElement.findElements(By.tagName("font")); if(font.isEmpty()){ continue; } String font1=webElement.findElements(By.tagName("font")).get(0).getText().trim(); if(font1.startsWith("点")&&!font1.startsWith("点击")){ //分析页面 String text= ""; String name= ""; String pwd= ""; try { text = webElement.getText().replace("?","").replace("点","");// if(text!=null){// text=text.replace(" ","|");// } if(text.contains("密码")){ //拆分 String[] nameAndPwd=text.split("密码"); name=nameAndPwd[0]; pwd="密码"+nameAndPwd[nameAndPwd.length-1]; }else{ //不拆分 name=text; pwd=""; } } catch (Exception e) { e.printStackTrace(); } WebElement aTag= null; try { aTag = webElement.findElement(By.tagName("a")); } catch (Exception e) { e.printStackTrace(); } //分析url String url=""; try { if(aTag!=null){ url=aTag.getAttribute("href"); } } catch (Exception e) { e.printStackTrace(); } Shimo shimo=new Shimo(); shimo.setPath(driver.getCurrentUrl()); shimo.setName(name.trim()); String prefix=""; if(url.contains("pan.baidu")){ prefix="百度网盘:"; }else{ prefix="链接:"; } shimo.setUrl(prefix+url.trim()+" "+pwd.trim()); shimo.setType(type); saveDb(shimo); } } catch (Exception e) { e.printStackTrace(); continue; } } } public static void saveDb(Shimo shimo){ Connection connection=null; try { //入数据库 connection=Utils.getConnection(dataSourceModel); //先查询是否存在 SimpleDateFormat sdf=new SimpleDateFormat("yyyy-MM-dd"); String querySql="select count(1) as totalnum from "+table+" where name=‘#name‘"; querySql=querySql.replace("#name",shimo.getName()); int count=Utils.excuteCountQuery(connection,querySql); if(count<=0){ //插入 String sql="insert into "+table+" (name,url,createtime,path,rengong,type) values 
(‘#name‘,‘#url‘,‘#createtime‘,‘#path‘,‘#rengong‘,‘#type‘)"; sql=sql.replace("#name",shimo.getName()) .replace("#url",shimo.getUrl()) .replace("#createtime",sdf.format(new Date())) .replace("#path",shimo.getPath()) .replace("#type",shimo.getType()) .replace("#rengong","0"); Utils.saveDb(connection,sql); }else{ //更新 String updateSql="update "+table+" set url=‘#url‘,updatetime=‘#updatetime‘,path=‘#path‘,type=‘#type‘ where name=‘#name‘ and rengong=‘0‘"; updateSql=updateSql.replace("#name",shimo.getName()) .replace("#url",shimo.getUrl()) .replace("#updatetime",sdf.format(new Date())) .replace("#type",shimo.getType()) .replace("#path",shimo.getPath()); Utils.saveDb(connection,updateSql); } } catch (Exception e) { System.out.println("入库失败"); e.printStackTrace(); }finally { if(connection!=null){ try { connection.close(); } catch (SQLException e) { e.printStackTrace(); } } } } public static void main(String[] args){ System.out.println("++++++++系统启动中..."); Map<String,Boolean> map=new HashMap<>(); while(true){ System.out.println("++++++++系统运行中..."); SimpleDateFormat simpleDateFormat=new SimpleDateFormat("yyyy-MM-dd"); String today=simpleDateFormat.format(new Date());//今天 SimpleDateFormat sdf=new SimpleDateFormat("HH"); String nowTime=sdf.format(new Date()); //当天没有跑过,且时间到了06点。 //跑过之后,将标识改为true //if((map.get(today)==null||map.get(today)==false)&&runTime.equals(nowTime)){ if(true){ map.put(today,new Boolean(true)); System.out.println("++++++++数据抓取中..."); //早晨6点开始跑 Spider spider1=Spider.create(new ShiMo2ChromeProcessor()); spider1.addUrl(guochan) .setDownloader(new HttpClientDownloader()) .thread(1) .run(); } try { Thread.sleep(600000);//10分钟跑一次 } catch (InterruptedException e) { e.printStackTrace(); } } }} 八、TestChromeDriver
public class TestChromeDriver { public static WebDriver getChromeDriver(String chromebin,String chromedriver,String userdata ) { /* 设定 chrome启动文件的位置, 若未设定则取默认安装目录的 chrome */ System.setProperty("webdriver.chrome.bin", chromebin); /* 设定 chrome webdirver 的位置 ,若未设定则从path变量读取*/ System.setProperty("webdriver.chrome.driver", chromedriver); ChromeOptions chromeOption=new ChromeOptions(); chromeOption.addArguments("--user-data-dir="+userdata);// chromeOption.addArguments("--headless"); chromeOption.addArguments("--no-sandbox"); WebDriver driver = new ChromeDriver(chromeOption); return driver; } } 九、工具类
/**
 * Small helpers for loading the configuration file and for running plain SQL over JDBC.
 * None of the methods throws on I/O or SQL failure; problems are printed instead.
 */
public class Utils {

    /**
     * Loads a properties file from the classpath.
     *
     * @param configFile resource path as understood by {@code Class.getResourceAsStream},
     *                   e.g. {@code "/config.properties"}
     * @return the loaded properties; empty when the resource is missing or unreadable
     */
    public static Properties loadConfig(String configFile) {
        Properties properties = new Properties();
        if (configFile == null) {
            System.out.println("配置文件加载失败");
            return properties;
        }
        try (InputStream input = Utils.class.getResourceAsStream(configFile)) {
            if (input == null) {
                // getResourceAsStream signals "not found" with null, not with an exception.
                System.out.println("配置文件加载失败");
            } else {
                properties.load(input);
            }
        } catch (IOException | IllegalArgumentException e) {
            // IllegalArgumentException: malformed \\uXXXX escape inside the file.
            System.out.println("配置文件加载失败");
            e.printStackTrace();
        }
        return properties;
    }

    /**
     * Opens a MySQL connection with the settings in the given model.
     *
     * @param dataSourceModel URL, user name and password to connect with
     * @return an open connection, or {@code null} when the driver class is missing or
     *         the connection cannot be established
     */
    public static Connection getConnection(DataSourceModel dataSourceModel) {
        try {
            Class.forName("com.mysql.jdbc.Driver");
            return DriverManager.getConnection(
                    dataSourceModel.getUrl(),
                    dataSourceModel.getUsername(),
                    dataSourceModel.getPassword());
        } catch (ClassNotFoundException | SQLException e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Executes an INSERT/UPDATE/DELETE statement. Does nothing when the connection is
     * {@code null}; failures are printed and not rethrown.
     *
     * @param connection open connection, or {@code null}
     * @param sql the complete SQL text to execute
     */
    public static void saveDb(Connection connection, String sql) {
        if (connection == null) {
            return;
        }
        try (Statement statement = connection.createStatement()) {
            statement.executeUpdate(sql);
        } catch (Exception e) {
            // Broad on purpose: callers rely on this method never throwing.
            e.printStackTrace();
        }
    }

    /**
     * Runs a count query and returns the value of its {@code totalnum} column.
     *
     * @param connection open connection, or {@code null}
     * @param sql query whose result exposes an integer column named {@code totalnum}
     * @return the count (taken from the last row if there are several), or 0 when the
     *         connection is {@code null}, the result is empty or the query fails
     */
    public static int excuteCountQuery(Connection connection, String sql) {
        int rowCount = 0;
        if (connection == null) {
            return rowCount;
        }
        try (Statement statement = connection.createStatement();
             ResultSet resultSet = statement.executeQuery(sql)) {
            while (resultSet.next()) {
                rowCount = resultSet.getInt("totalnum");
            }
        } catch (Exception e) {
            // Broad on purpose: callers rely on this method never throwing.
            e.printStackTrace();
        }
        return rowCount;
    }
}
原文地址:https://www.cnblogs.com/fangyunchen/p/10260518.html
时间: 2024-10-08 18:52:47