This is the first version of my crawler. It is fairly limited, and the algorithm behind it is quite simple, but building "Crawler No.1" taught me a lot about how crawlers and search engines work, so it was well worth it ^_^. It's a rough sketch; I don't know how other people implement theirs, so I just wrote it the way I imagined it. It's written in Java, and you're welcome to take a look if you're interested.
The project is split into three classes: Entry (com.rgy.reptile) is the entry point, PageInfo (com.rgy.entity) holds the data extracted from a single page, and PageUtils (com.rgy.utils) does the fetching, parsing (via jsoup), and crawling. The full code follows:
package com.rgy.reptile;

import com.rgy.utils.PageUtils;

public class Entry {
    public static void main(String[] args) {
        String url = "http://www.youku.com";
        // Seed the visited list and the parent stack with the start URL
        PageUtils.history_list.add(url);
        PageUtils.parent_stack.push(url);
        PageUtils.searchUrl(url);
        //PageUtils.hrefShow(url);
    }
}
package com.rgy.entity;

import java.util.ArrayList;

public class PageInfo {
    private String url;
    private String title;
    private String keywords;
    private ArrayList<String> href_list;

    public PageInfo() {
        this.url = "";
        this.title = "";
        this.keywords = "";
        this.href_list = null;
    }

    public void setUrl(String url) { this.url = url; }
    public void setTitle(String title) { this.title = title; }
    public void setKeywords(String keywords) { this.keywords = keywords; }
    public void setHref_list(ArrayList<String> href_list) { this.href_list = href_list; }

    public String getUrl() { return url; }
    public String getTitle() { return title; }
    public String getKeywords() { return keywords; }
    public ArrayList<String> getHref_list() { return href_list; }
}
package com.rgy.utils;

import java.util.ArrayList;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.rgy.entity.PageInfo;

public class PageUtils {

    public static PageInfo getPageInfo(String url) {
        PageInfo info = new PageInfo();
        if (url.endsWith("/")) {
            url = url.substring(0, url.length() - 1);
        }
        info.setUrl(url);
        try {
            Document doc = Jsoup.connect(url).timeout(30000).get();
            info.setTitle(doc.title());
            String keywords = doc.getElementsByTag("meta").select("[name=keywords]").attr("content");
            info.setKeywords(keywords);
            Elements links = doc.getElementsByTag("a");
            ArrayList<String> href_list = new ArrayList<String>();
            for (Element link : links) {
                String linkHref = link.attr("href");
                if (linkHref.endsWith("/")) {
                    linkHref = linkHref.substring(0, linkHref.length() - 1);
                }
                // Skip links we cannot crawl and links already collected for this page
                if (linkIsAvailable(linkHref) && !href_list.contains(linkHref)) {
                    href_list.add(linkHref);
                }
            }
            // Leave href_list null when no usable links were found,
            // so searchUrl can treat this page as a dead end
            if (!href_list.isEmpty()) {
                info.setHref_list(href_list);
            }
        } catch (Exception ex) {
            ex.printStackTrace();
        }
        return info;
    }

    public static boolean linkIsAvailable(String url) {
        if (url.startsWith("http://")) {
            // Reject direct file downloads; dots are escaped so they match literally
            String regex = ".*\\.exe|.*\\.apk|.*\\.zip|.*\\.rar|.*\\.pdf|.*\\.doc";
            Pattern pattern = Pattern.compile(regex);
            Matcher matcher = pattern.matcher(url);
            return !matcher.matches();
        }
        return false;
    }

    public static boolean keywordsIsAvailable(String keywords) {
        // Match pages whose meta keywords mention youth, comedy, micro-films,
        // shorts, mini-series, etc.
        String regex = ".*青春.*|.*搞笑.*|.*微电影.*|.*短片.*|.*迷你剧.*|.*喜剧.*";
        Pattern pattern = Pattern.compile(regex);
        Matcher matcher = pattern.matcher(keywords);
        return matcher.matches();
    }

    // URLs that have already been visited
    public static ArrayList<String> history_list = new ArrayList<String>();
    // Parent nodes along the current path, used for backtracking
    public static Stack<String> parent_stack = new Stack<String>();

    public static void searchUrl(String url) {
        PageInfo info = getPageInfo(url);
        String keywords = info.getKeywords();
        int hlist_size = history_list.size();
        System.out.println(hlist_size + "-->" + history_list.get(hlist_size - 1));
        // if (keywordsIsAvailable(keywords)) { // keywords matched
        //     System.out.println(url + "===>" + keywords);
        // }
        ArrayList<String> href_list = info.getHref_list();
        if (href_list == null) { // dead end: backtrack to the parent node
            parent_stack.pop();
            if (!parent_stack.empty()) { // stack not empty: resume from the parent
                searchUrl(parent_stack.peek());
            } else { // stack empty: the whole crawl is done
                System.out.println("Crawler No.1 has finished its job!!!");
            }
        } else { // this node has outgoing links
            int size = href_list.size();
            for (int i = 0; i < size; i++) {
                String strUrl = href_list.get(i);
                if (history_list.contains(strUrl)) { // already visited
                    continue;
                }
                history_list.add(strUrl);
                parent_stack.push(strUrl);
                searchUrl(strUrl);
            }
        }
    }

    public static void hrefShow(String url) {
        PageInfo info = getPageInfo(url);
        ArrayList<String> href_list = info.getHref_list();
        int size = href_list.size();
        for (int i = 0; i < size; i++) {
            System.out.println(href_list.get(i));
        }
    }
}
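One of the limitations mentioned above is that searchUrl recurses once per page, so a deep walk over a large site will eventually blow the JVM call stack with a StackOverflowError. Below is a minimal sketch (not part of the original project) of how the same crawl could be driven iteratively with a queue, i.e. breadth-first instead of depth-first, reusing the getPageInfo method above. The class name BfsCrawler and the MAX_PAGES cap are my own assumptions, added just to keep the example bounded:

package com.rgy.utils;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;

import com.rgy.entity.PageInfo;

public class BfsCrawler {

    // Hypothetical cap so the sketch always terminates; not in the original code
    private static final int MAX_PAGES = 500;

    public static void crawl(String seedUrl) {
        Queue<String> frontier = new LinkedList<String>();
        // HashSet gives O(1) membership checks, unlike ArrayList.contains
        Set<String> visited = new HashSet<String>();
        frontier.add(seedUrl);
        visited.add(seedUrl);
        int fetched = 0;

        while (!frontier.isEmpty() && fetched < MAX_PAGES) {
            String url = frontier.poll();
            fetched++;
            PageInfo info = PageUtils.getPageInfo(url);
            System.out.println(fetched + "-->" + url);

            ArrayList<String> hrefList = info.getHref_list();
            if (hrefList == null) {
                continue; // dead end: with a queue there is nothing to backtrack
            }
            for (String href : hrefList) {
                if (!visited.contains(href)) {
                    visited.add(href);
                    frontier.add(href);
                }
            }
        }
        System.out.println("Crawl finished after fetching " + fetched + " pages.");
    }
}

Because the frontier lives on the heap instead of the call stack, crawl depth is no longer limited by the JVM stack size, and the explicit parent_stack bookkeeping disappears entirely.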
If you're interested, you can download the project code here:
http://download.csdn.net/detail/u011700203/8410597