1.0 Example Study: Web Crawler
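This example performs a simple breadth-first crawl. Starting from a seed URL, the program keeps a list of pending URLs (used as a queue) and a list of traversed URLs (pages already visited). Each iteration removes the first pending URL, records it as traversed, and appends every link found on that page to the pending list, until 100 pages have been crawled or no pending URLs remain. Links are extracted by getSubURLs, which scans each line of the fetched page for substrings that begin with "http:" and end at the next double quote.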
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Scanner;

public class WebCrawler {
    // Seed URL
    private static String url = "http://www.cnblogs.com/";

    public static void main(String[] args) {
        ArrayList<String> list = crawler(url);
        System.out.println("Length of listOfPendingURLs: " + list.size());
    }

    /**
     * Crawls up to 100 pages starting from the seed URL and
     * returns the URLs still waiting to be crawled.
     */
    public static ArrayList<String> crawler(String startingURL) {
        ArrayList<String> listOfPendingURLs = new ArrayList<>();   // URLs waiting to be crawled
        ArrayList<String> listOfTraversedURLs = new ArrayList<>(); // URLs already crawled
        listOfPendingURLs.add(startingURL);
        while (!listOfPendingURLs.isEmpty() && listOfTraversedURLs.size() < 100) {
            String urlString = listOfPendingURLs.remove(0); // take the first pending URL each time
            if (!listOfTraversedURLs.contains(urlString)) {
                listOfTraversedURLs.add(urlString);
                System.out.println("Crawl " + urlString);
                // Collect every link on this page and append it to the pending list
                for (String s : getSubURLs(urlString)) {
                    if (!listOfTraversedURLs.contains(s)) {
                        listOfPendingURLs.add(s);
                    }
                }
            }
        }
        return listOfPendingURLs;
    }

    /**
     * Extracts all http links from the given page and returns them in an ArrayList.
     */
    public static ArrayList<String> getSubURLs(String urlString) {
        ArrayList<String> list = new ArrayList<>();
        try (Scanner input = new Scanner(new URL(urlString).openStream())) {
            while (input.hasNextLine()) {
                String line = input.nextLine();
                int current = line.indexOf("http:"); // reset for each new line
                while (current >= 0) { // >= 0, so a link at the very start of a line is not missed
                    int end = line.indexOf("\"", current);
                    if (end > current) {
                        list.add(line.substring(current, end)); // a link ends at the next double quote
                        current = line.indexOf("http:", end);
                    } else {
                        current = -1; // no closing quote on this line; stop scanning it
                    }
                }
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
        return list;
    }
}
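One design note: both membership tests above go through ArrayList.contains(), which is a linear scan, so the crawl slows down as the traversed list grows. Below is a minimal sketch of the same breadth-first traversal using a HashSet for visited pages and a Queue for pending ones, making each lookup O(1). The class name WebCrawlerWithSet is hypothetical, and the sketch assumes it sits alongside the WebCrawler class above so it can reuse its getSubURLs method.

import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;

// Hypothetical variant of the crawler above: same traversal order,
// but a Queue holds the pending URLs and a HashSet the visited ones.
public class WebCrawlerWithSet {
    public static Set<String> crawler(String startingURL) {
        Queue<String> pending = new ArrayDeque<>();
        Set<String> traversed = new HashSet<>();
        pending.add(startingURL);
        while (!pending.isEmpty() && traversed.size() < 100) {
            String urlString = pending.remove();
            // Set.add() returns false when the URL was already visited,
            // replacing the contains()-then-add() pair in the original
            if (traversed.add(urlString)) {
                System.out.println("Crawl " + urlString);
                for (String s : WebCrawler.getSubURLs(urlString)) {
                    if (!traversed.contains(s)) {
                        pending.add(s);
                    }
                }
            }
        }
        return traversed;
    }

    public static void main(String[] args) {
        Set<String> visited = crawler("http://www.cnblogs.com/");
        System.out.println("Pages crawled: " + visited.size());
    }
}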