Java 爬虫

import java.awt.BorderLayout;
import java.awt.Cursor;
import java.awt.Font;
import java.awt.GridBagConstraints;
import java.awt.GridBagLayout;
import java.awt.Insets;
import java.awt.event.ActionEvent;
import java.awt.event.ActionListener;
import java.awt.event.KeyEvent;
import java.awt.event.WindowAdapter;
import java.awt.event.WindowEvent;
import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.swing.BorderFactory;
import javax.swing.JButton;
import javax.swing.JCheckBox;
import javax.swing.JComboBox;
import javax.swing.JFrame;
import javax.swing.JLabel;
import javax.swing.JMenu;
import javax.swing.JMenuBar;
import javax.swing.JMenuItem;
import javax.swing.JOptionPane;
import javax.swing.JPanel;
import javax.swing.JProgressBar;
import javax.swing.JScrollPane;
import javax.swing.JSeparator;
import javax.swing.JTable;
import javax.swing.JTextField;
import javax.swing.table.DefaultTableModel;

public class SearchCrawler extends JFrame {
// Max URLs drop-down values.
private static final String[] MAX_URLS = { "50", "100", "500", "1000" };

// Cache of robot disallow lists, keyed by lower-cased host name. Each value
// is the list of disallowed path prefixes collected for that host.
private HashMap disallowListCache = new HashMap();

// Search GUI controls.
private JTextField startTextField;

private JComboBox maxComboBox;

private JCheckBox limitCheckBox;

private JTextField logTextField;

private JTextField searchTextField;

private JCheckBox caseCheckBox;

private JButton searchButton;

// Search stats GUI controls.
private JLabel crawlingLabel2;

private JLabel crawledLabel2;

private JLabel toCrawlLabel2;

private JProgressBar progressBar;

private JLabel matchesLabel2;

// Table listing search matches.
private JTable table;

// Flag for whether or not crawling is underway. The Search/Stop button
// handler clears it while the crawl loop polls it from the worker thread,
// so it is volatile to guarantee the stop request becomes visible there.
private volatile boolean crawling;

// Matches log file print writer.
private PrintWriter logFileWriter;

/**
 * Builds the Search Crawler window: title and size, a File menu, the search
 * form with its crawl statistics at the top, and the matches table filling
 * the rest of the frame. The window is not made visible here.
 */
public SearchCrawler() {
    // Set application title and window size.
    setTitle("Search Crawler");
    setSize(600, 600);

    // Closing the window exits the program.
    addWindowListener(new WindowAdapter() {
        public void windowClosing(WindowEvent e) {
            actionExit();
        }
    });

    setJMenuBar(buildMenuBar());

    // Add panels to display.
    getContentPane().setLayout(new BorderLayout());
    getContentPane().add(buildSearchPanel(), BorderLayout.NORTH);
    getContentPane().add(buildMatchesPanel(), BorderLayout.CENTER);
}

/** Creates the menu bar holding the File menu with its single Exit item. */
private JMenuBar buildMenuBar() {
    JMenuItem fileExitMenuItem = new JMenuItem("Exit", KeyEvent.VK_X);
    fileExitMenuItem.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent e) {
            actionExit();
        }
    });

    JMenu fileMenu = new JMenu("File");
    fileMenu.setMnemonic(KeyEvent.VK_F);
    fileMenu.add(fileExitMenuItem);

    JMenuBar menuBar = new JMenuBar();
    menuBar.add(fileMenu);
    return menuBar;
}

/**
 * Creates the grid-bag panel containing the search inputs, the Search
 * button and the crawl statistics. As a side effect it assigns every GUI
 * control field of this class except the matches table.
 */
private JPanel buildSearchPanel() {
    JPanel searchPanel = new JPanel(new GridBagLayout());
    GridBagConstraints constraints;

    // Row: start URL.
    searchPanel.add(new JLabel("Start URL:"), captionConstraints(0));
    startTextField = new JTextField();
    searchPanel.add(startTextField, rowEndConstraints(0));

    // Row: maximum URL count plus the "limit to start site" option.
    searchPanel.add(new JLabel("Max URLs to Crawl:"), captionConstraints(0));

    maxComboBox = new JComboBox(MAX_URLS);
    maxComboBox.setEditable(true);
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 0);
    searchPanel.add(maxComboBox, constraints);

    limitCheckBox = new JCheckBox("Limit crawling to Start URL site");
    constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.WEST;
    constraints.insets = new Insets(0, 10, 0, 0);
    searchPanel.add(limitCheckBox, constraints);

    // Empty label whose only job is to terminate the grid row.
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    searchPanel.add(new JLabel(), constraints);

    // Row: matches log file, defaulting to crawler.log in the working directory.
    searchPanel.add(new JLabel("Matches Log File:"), captionConstraints(0));
    String file = System.getProperty("user.dir")
            + System.getProperty("file.separator") + "crawler.log";
    logTextField = new JTextField(file);
    searchPanel.add(logTextField, rowEndConstraints(0));

    // Row: search string plus case-sensitivity option.
    searchPanel.add(new JLabel("Search String:"), captionConstraints(0));

    searchTextField = new JTextField();
    constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.insets = new Insets(5, 5, 0, 0);
    constraints.gridwidth = 2;
    constraints.weightx = 1.0d;
    searchPanel.add(searchTextField, constraints);

    caseCheckBox = new JCheckBox("Case Sensitive");
    constraints = new GridBagConstraints();
    constraints.insets = new Insets(5, 5, 0, 5);
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    searchPanel.add(caseCheckBox, constraints);

    // Row: Search button (doubles as Stop while a crawl is running).
    searchButton = new JButton("Search");
    searchButton.addActionListener(new ActionListener() {
        public void actionPerformed(ActionEvent e) {
            actionSearch();
        }
    });
    constraints = new GridBagConstraints();
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, 5, 5);
    searchPanel.add(searchButton, constraints);

    // Separator between the search form and the statistics.
    searchPanel.add(new JSeparator(), rowEndConstraints(5));

    // Statistics rows: a caption followed by a plain-font value label.
    searchPanel.add(new JLabel("Crawling:"), captionConstraints(0));
    crawlingLabel2 = newPlainLabel();
    searchPanel.add(crawlingLabel2, rowEndConstraints(0));

    searchPanel.add(new JLabel("Crawled URLs:"), captionConstraints(0));
    crawledLabel2 = newPlainLabel();
    searchPanel.add(crawledLabel2, rowEndConstraints(0));

    searchPanel.add(new JLabel("URLs to Crawl:"), captionConstraints(0));
    toCrawlLabel2 = newPlainLabel();
    searchPanel.add(toCrawlLabel2, rowEndConstraints(0));

    searchPanel.add(new JLabel("Crawling Progress:"), captionConstraints(0));
    progressBar = new JProgressBar();
    progressBar.setMinimum(0);
    progressBar.setStringPainted(true);
    searchPanel.add(progressBar, rowEndConstraints(0));

    // The last row carries a larger bottom inset to pad the panel.
    searchPanel.add(new JLabel("Search Matches:"), captionConstraints(10));
    matchesLabel2 = newPlainLabel();
    searchPanel.add(matchesLabel2, rowEndConstraints(10));

    return searchPanel;
}

/**
 * Creates the read-only, single-column ("URL") matches table and wraps it
 * in a titled, scrollable panel. Assigns the {@code table} field.
 */
private JPanel buildMatchesPanel() {
    // Set up matches table.
    table = new JTable(new DefaultTableModel(new Object[][] {},
            new String[] { "URL" }) {
        public boolean isCellEditable(int row, int column) {
            return false;
        }
    });

    // Set up Matches panel.
    JPanel matchesPanel = new JPanel();
    matchesPanel.setBorder(BorderFactory.createTitledBorder("Matches"));
    matchesPanel.setLayout(new BorderLayout());
    matchesPanel.add(new JScrollPane(table), BorderLayout.CENTER);
    return matchesPanel;
}

/** Constraints for a right-aligned caption label with insets (5, 5, bottomInset, 0). */
private static GridBagConstraints captionConstraints(int bottomInset) {
    GridBagConstraints constraints = new GridBagConstraints();
    constraints.anchor = GridBagConstraints.EAST;
    constraints.insets = new Insets(5, 5, bottomInset, 0);
    return constraints;
}

/**
 * Constraints for a component that stretches horizontally and ends its grid
 * row, with insets (5, 5, bottomInset, 5).
 */
private static GridBagConstraints rowEndConstraints(int bottomInset) {
    GridBagConstraints constraints = new GridBagConstraints();
    constraints.fill = GridBagConstraints.HORIZONTAL;
    constraints.gridwidth = GridBagConstraints.REMAINDER;
    constraints.insets = new Insets(5, 5, bottomInset, 5);
    return constraints;
}

/** Creates an empty label whose font is the default label font in plain style. */
private static JLabel newPlainLabel() {
    JLabel label = new JLabel();
    label.setFont(label.getFont().deriveFont(Font.PLAIN));
    return label;
}

/**
 * Terminates the JVM with exit status 0. Invoked by the File &gt; Exit menu
 * item and when the window is closed.
 */
private void actionExit() {
    final int successStatus = 0;
    System.exit(successStatus);
}

/**
 * Handles a click on the Search/Stop button.
 *
 * While a crawl is running the click is a stop request: the crawling flag is
 * cleared and nothing else happens. Otherwise the form is validated; all
 * problems are shown together in one error dialog, and if there are none a
 * new search is started.
 */
private void actionSearch() {
    // If stop button clicked, turn crawling flag off.
    if (crawling) {
        crawling = false;
        return;
    }

    ArrayList errorList = new ArrayList();

    // Validate that a well-formed start URL has been entered.
    String startUrl = startTextField.getText().trim();
    if (startUrl.length() < 1) {
        errorList.add("Missing Start URL.");
    } else if (verifyUrl(startUrl) == null) {
        errorList.add("Invalid Start URL.");
    }

    // Validate that Max URLs is either empty or a positive number.
    // -1 is the "no limit" sentinel that crawl() and updateStats() test
    // for, so an empty field must leave maxUrls at -1 (starting from 0
    // would make the crawl stop before visiting a single page).
    int maxUrls = -1;
    Object selectedMax = maxComboBox.getSelectedItem();
    String max = (selectedMax == null) ? "" : selectedMax.toString().trim();
    if (max.length() > 0) {
        try {
            maxUrls = Integer.parseInt(max);
        } catch (NumberFormatException e) {
            // Not a number: maxUrls stays at -1 and is rejected below.
        }
        if (maxUrls < 1) {
            errorList.add("Invalid Max URLs value.");
        }
    }

    // Validate that matches log file has been entered.
    String logFile = logTextField.getText().trim();
    if (logFile.length() < 1) {
        errorList.add("Missing Matches Log File.");
    }

    // Validate that search string has been entered.
    String searchString = searchTextField.getText().trim();
    if (searchString.length() < 1) {
        errorList.add("Missing Search String.");
    }

    // Show errors, if any, and return.
    if (errorList.size() > 0) {
        StringBuffer message = new StringBuffer();

        // Concatenate errors into a single message, one per line.
        for (int i = 0; i < errorList.size(); i++) {
            message.append(errorList.get(i));
            if (i + 1 < errorList.size()) {
                message.append("\n");
            }
        }

        showError(message.toString());
        return;
    }

    // Remove "www" from start URL if present.
    startUrl = removeWwwFromUrl(startUrl);

    // Start the Search Crawler.
    search(logFile, startUrl, maxUrls, searchString);
}

/**
 * Runs a search on a new background thread.
 *
 * The thread puts the window into its "searching" state (wait cursor, inputs
 * disabled, button labelled "Stop", empty matches table), opens the matches
 * log file, crawls, and finally restores the window. The window is restored
 * on every exit path, including when the log file cannot be opened, so the
 * form never stays locked. A warning dialog is shown when a completed crawl
 * produced no matches.
 *
 * NOTE(review): Swing components are updated from the worker thread here,
 * as in the original code; confirm whether these updates should be moved to
 * the event dispatch thread.
 *
 * @param logFile      path of the matches log file to (over)write
 * @param startUrl     URL at which crawling starts
 * @param maxUrls      maximum number of URLs to crawl, or -1 for no limit
 * @param searchString whitespace-separated terms to look for
 */
private void search(final String logFile, final String startUrl,
        final int maxUrls, final String searchString) {
    // Turn crawling flag on before the thread starts so that a second click
    // on the button is treated as "Stop" rather than starting another crawl.
    crawling = true;

    Thread thread = new Thread(new Runnable() {
        public void run() {
            // Show hour glass cursor while crawling is under way.
            setCursor(Cursor.getPredefinedCursor(Cursor.WAIT_CURSOR));

            // Disable search controls and switch Search button to "Stop."
            setSearchInputsEnabled(false);
            searchButton.setText("Stop");

            try {
                // Reset stats.
                table.setModel(new DefaultTableModel(new Object[][] {},
                        new String[] { "URL" }) {
                    public boolean isCellEditable(int row, int column) {
                        return false;
                    }
                });
                updateStats(startUrl, 0, 0, maxUrls);

                // Open matches log file.
                try {
                    logFileWriter = new PrintWriter(new FileWriter(logFile));
                } catch (Exception e) {
                    showError("Unable to open matches log file.");
                    return; // the finally block below restores the window
                }

                try {
                    // Perform the actual crawling.
                    crawl(startUrl, maxUrls, limitCheckBox.isSelected(),
                            searchString, caseCheckBox.isSelected());
                } finally {
                    // Close matches log file.
                    try {
                        logFileWriter.close();
                    } catch (Exception e) {
                        showError("Unable to close matches log file.");
                    }
                }

                // Mark search as done.
                crawlingLabel2.setText("Done");
            } finally {
                // Turn crawling flag off.
                crawling = false;

                // Enable search controls, switch the button back to
                // "Search" and return to the default cursor.
                setSearchInputsEnabled(true);
                searchButton.setText("Search");
                setCursor(Cursor.getDefaultCursor());
            }

            // Show message if search string not found.
            if (table.getRowCount() == 0) {
                JOptionPane.showMessageDialog(
                        SearchCrawler.this,
                        "Your Search String was not found. Please try another.",
                        "Search String Not Found",
                        JOptionPane.WARNING_MESSAGE);
            }
        }
    });
    thread.start();
}

/** Enables or disables every search input control (not the Search/Stop button). */
private void setSearchInputsEnabled(boolean enabled) {
    startTextField.setEnabled(enabled);
    maxComboBox.setEnabled(enabled);
    limitCheckBox.setEnabled(enabled);
    logTextField.setEnabled(enabled);
    searchTextField.setEnabled(enabled);
    caseCheckBox.setEnabled(enabled);
}

/**
 * Pops up a modal error dialog, owned by this window and titled "Error".
 *
 * @param message text to display in the dialog
 */
private void showError(String message) {
    final String dialogTitle = "Error";
    final int dialogKind = JOptionPane.ERROR_MESSAGE;
    JOptionPane.showMessageDialog(this, message, dialogTitle, dialogKind);
}

/**
 * Refreshes the statistics labels and the progress bar.
 *
 * @param crawling text for the "Crawling" label (the URL being processed)
 * @param crawled  number of URLs crawled so far; also the progress value
 * @param toCrawl  number of URLs still queued
 * @param maxUrls  progress-bar maximum, or -1 to use crawled + toCrawl
 */
private void updateStats(String crawling, int crawled, int toCrawl,
        int maxUrls) {
    // Without a URL limit the total is only known as "done + still queued".
    final int progressMaximum = (maxUrls == -1) ? crawled + toCrawl : maxUrls;

    crawlingLabel2.setText(crawling);
    crawledLabel2.setText(String.valueOf(crawled));
    toCrawlLabel2.setText(String.valueOf(toCrawl));

    progressBar.setMaximum(progressMaximum);
    progressBar.setValue(crawled);

    // The match count is simply the number of rows in the matches table.
    matchesLabel2.setText(String.valueOf(table.getRowCount()));
}

/**
 * Records a matching URL: appends it as a new row of the matches table and
 * writes it as one line to the matches log file. An error dialog is shown
 * when the URL cannot be logged.
 *
 * @param url the matching URL
 */
private void addMatch(String url) {
    // Add URL to matches table.
    DefaultTableModel model = (DefaultTableModel) table.getModel();
    model.addRow(new Object[] { url });

    // Add URL to matches log file.
    PrintWriter writer = logFileWriter;
    if (writer == null) {
        showError("Unable to log match.");
        return;
    }
    writer.println(url);

    // PrintWriter never throws on I/O failure; it only records the error,
    // so the failure has to be queried explicitly. checkError() also
    // flushes the line to the file.
    if (writer.checkError()) {
        showError("Unable to log match.");
    }
}

/**
 * Checks that a string is an acceptable URL for the crawler.
 *
 * Only URLs whose text starts with "http://" (compared case-insensitively)
 * are accepted, and the text must additionally be parseable by
 * {@link URL}.
 *
 * @param url the URL text to check
 * @return the parsed URL, or null if the text is rejected
 */
private URL verifyUrl(String url) {
    boolean usesHttp = url.toLowerCase().startsWith("http://");
    if (usesHttp) {
        try {
            return new URL(url);
        } catch (Exception e) {
            // Malformed URL text: rejected by the null return below.
        }
    }
    return null;
}

/**
 * Checks whether the robots.txt file of the URL's host permits crawling the
 * URL.
 *
 * The host's disallow list is downloaded from http://HOST/robots.txt the
 * first time the host is seen and cached afterwards. Every Disallow rule in
 * the file is honoured regardless of the User-agent section it appears in.
 * If the file is missing or cannot be read, the host is treated as having
 * no restrictions, and that result is cached too so the host is not probed
 * again for every URL.
 *
 * @param urlToCheck the URL about to be crawled
 * @return false if the URL's file part starts with a disallowed path,
 *         true otherwise
 */
private boolean isRobotAllowed(URL urlToCheck) {
    String host = urlToCheck.getHost().toLowerCase();

    // Retrieve host's disallow list from cache.
    ArrayList disallowList = (ArrayList) disallowListCache.get(host);

    // If list is not in the cache, download and cache it.
    if (disallowList == null) {
        disallowList = new ArrayList();
        final String directive = "Disallow:";
        BufferedReader reader = null;

        try {
            URL robotsFileUrl = new URL("http://" + host + "/robots.txt");

            // Open connection to robot file URL for reading.
            reader = new BufferedReader(
                    new InputStreamReader(robotsFileUrl.openStream()));

            // Read robot file, creating list of disallowed paths.
            String line;
            while ((line = reader.readLine()) != null) {
                String rule = line.trim();

                // Field names in robots.txt are case-insensitive.
                if (!rule.regionMatches(true, 0, directive, 0,
                        directive.length())) {
                    continue;
                }

                String disallowPath = rule.substring(directive.length());

                // Check disallow path for comments and remove if present.
                int commentIndex = disallowPath.indexOf("#");
                if (commentIndex != -1) {
                    disallowPath = disallowPath.substring(0, commentIndex);
                }

                // Remove leading or trailing spaces from disallow path.
                disallowPath = disallowPath.trim();

                // An empty Disallow value means "nothing is disallowed".
                // Storing "" would block every URL, because every file
                // name starts with the empty string.
                if (disallowPath.length() > 0) {
                    disallowList.add(disallowPath);
                }
            }
        } catch (Exception e) {
            /*
             * Assume robot is allowed since an exception is thrown if the
             * robot file doesn't exist. Discard any partially read rules.
             */
            disallowList.clear();
        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (Exception e) {
                    // Nothing useful can be done if closing fails.
                }
            }
        }

        // Add new disallow list to cache.
        disallowListCache.put(host, disallowList);
    }

    /*
     * Loop through disallow list to see if crawling is allowed for the
     * given URL.
     */
    String file = urlToCheck.getFile();
    for (int i = 0; i < disallowList.size(); i++) {
        String disallow = (String) disallowList.get(i);
        if (file.startsWith(disallow)) {
            return false;
        }
    }

    return true;
}

/**
 * Downloads the page at the given URL as text.
 *
 * The page is read line by line using the platform default charset, and the
 * lines are re-joined with a newline so that text on adjacent lines is not
 * fused together (for example an anchor tag whose href attribute starts on
 * the next line). The connection is always closed.
 *
 * @param pageUrl the page to download
 * @return the page text, or null if the page could not be downloaded
 */
private String downloadPage(URL pageUrl) {
    BufferedReader reader = null;
    try {
        // Open connection to URL for reading.
        reader = new BufferedReader(new InputStreamReader(
                pageUrl.openStream()));

        // Read page into buffer.
        String line;
        StringBuffer pageBuffer = new StringBuffer();
        while ((line = reader.readLine()) != null) {
            pageBuffer.append(line).append('\n');
        }

        return pageBuffer.toString();
    } catch (Exception e) {
        // Best effort: an unreachable or unreadable page is reported as null.
        return null;
    } finally {
        if (reader != null) {
            try {
                reader.close();
            } catch (Exception e) {
                // Nothing useful can be done if closing fails.
            }
        }
    }
}

/**
 * Removes a leading "www." from the host of a URL, if present.
 *
 * Only the text directly after the first "://" (the scheme separator) is
 * examined, so a "://www." that merely appears later in the URL, for
 * example inside a query string, is left untouched. The "www." prefix is
 * matched case-insensitively.
 *
 * @param url the URL text
 * @return the URL without the leading "www." in its host, or the URL
 *         unchanged if there is none
 */
private String removeWwwFromUrl(String url) {
    final String separator = "://";
    final String prefix = "www.";

    int hostStart = url.indexOf(separator);
    if (hostStart == -1) {
        return url;
    }
    hostStart += separator.length();

    if (url.regionMatches(true, hostStart, prefix, 0, prefix.length())) {
        return url.substring(0, hostStart)
                + url.substring(hostStart + prefix.length());
    }

    return url;
}

// Parse through page contents and retrieve links.
private ArrayList retrieveLinks(URL pageUrl, String pageContents,
HashSet crawledList, boolean limitHost) {
// Compile link matching pattern.
Pattern p = Pattern.compile("<a\\s+href\\s*=\\s*\"?(.*?)[\" |>]",
Pattern.CASE_INSENSITIVE);
Matcher m = p.matcher(pageContents);

// Create list of link matches.
ArrayList linkList = new ArrayList();
while (m.find()) {
String link = m.group(1).trim();

// Skip empty links.
if (link.length() < 1) {
continue;
}

// Skip links that are just page anchors.
if (link.charAt(0) == ‘#‘) {
continue;
}

// Skip mailto links.
if (link.indexOf("mailto:") != -1) {
continue;
}

// Skip JavaScript links.
if (link.toLowerCase().indexOf("javascript") != -1) {
continue;
}

// Prefix absolute and relative URLs if necessary.
if (link.indexOf("://") == -1) {
// Handle absolute URLs.
if (link.charAt(0) == ‘/‘) {
link = "http://" + pageUrl.getHost() + link;
// Handle relative URLs.
} else {
String file = pageUrl.getFile();
if (file.indexOf(‘/‘) == -1) {
link = "http://" + pageUrl.getHost() + "/" + link;
} else {
String path = file.substring(0,
file.lastIndexOf(‘/‘) + 1);
link = "http://" + pageUrl.getHost() + path + link;
}
}
}

// Remove anchors from link.
int index = link.indexOf(‘#‘);
if (index != -1) {
link = link.substring(0, index);
}

// Remove leading "www" from URL‘s host if present.
link = removeWwwFromUrl(link);

// Verify link and skip if invalid.
URL verifiedLink = verifyUrl(link);
if (verifiedLink == null) {
continue;
}

/*
* If specified, limit links to those having the same host as the
* start URL.
*/
if (limitHost
&& !pageUrl.getHost().toLowerCase().equals(
verifiedLink.getHost().toLowerCase())) {
continue;
}

// Skip link if it has already been crawled.
if (crawledList.contains(link)) {
continue;
}

// Add link to list.
linkList.add(link);
}

return (linkList);
}

/**
 * Determines whether every term of the search string occurs somewhere in
 * the given page contents.
 *
 * The search string is split into terms on runs of whitespace. For a
 * case-insensitive search both the page contents and each term are
 * lower-cased before they are compared.
 *
 * @param pageContents  text of the page
 * @param searchString  whitespace-separated search terms
 * @param caseSensitive whether the comparison respects letter case
 * @return true if all terms are found, false as soon as one is missing
 */
private boolean searchStringMatches(String pageContents,
        String searchString, boolean caseSensitive) {
    final String haystack = caseSensitive
            ? pageContents
            : pageContents.toLowerCase();

    // Split search string into individual terms.
    final String[] terms = Pattern.compile("[\\s]+").split(searchString);

    // Every single term has to be present.
    for (int t = 0; t < terms.length; t++) {
        String needle = caseSensitive ? terms[t] : terms[t].toLowerCase();
        if (haystack.indexOf(needle) == -1) {
            return false;
        }
    }

    return true;
}

/**
 * Performs the actual crawling, searching each page for the search string.
 *
 * URLs are processed in the order they were discovered, starting with
 * startUrl. The loop ends when the queue is empty, when the crawling flag
 * is cleared (Stop button) or when maxUrls pages have been crawled. Pages
 * that robots.txt disallows and URLs that fail verification are skipped.
 * Every page containing all search terms is recorded as a match.
 *
 * @param startUrl      URL at which crawling starts
 * @param maxUrls       maximum number of URLs to crawl, or -1 for no limit
 * @param limitHost     if true, only follow links on the host of the page
 *                      they were found on
 * @param searchString  whitespace-separated search terms
 * @param caseSensitive whether the search respects letter case
 */
public void crawl(String startUrl, int maxUrls, boolean limitHost,
        String searchString, boolean caseSensitive) {
    // Set up crawl lists.
    HashSet crawledList = new HashSet();
    LinkedHashSet toCrawlList = new LinkedHashSet();

    // Add start URL to the to crawl list.
    toCrawlList.add(startUrl);

    /*
     * Perform actual crawling by looping through the To Crawl list.
     */
    while (crawling && toCrawlList.size() > 0) {
        /*
         * Check to see if the max URL count has been reached, if it was
         * specified.
         */
        if (maxUrls != -1 && crawledList.size() == maxUrls) {
            break;
        }

        // Get the first (oldest) URL in the To Crawl list.
        String url = (String) toCrawlList.iterator().next();
        System.out.println(url);

        // Remove URL from the To Crawl list.
        toCrawlList.remove(url);

        // Convert string url to URL object.
        URL verifiedUrl = verifyUrl(url);

        // Skip anything that is not a valid URL instead of failing with a
        // NullPointerException further down (this method is public, so the
        // start URL may not have been validated by the caller).
        if (verifiedUrl == null) {
            continue;
        }

        // Skip URL if robots are not allowed to access it.
        if (!isRobotAllowed(verifiedUrl)) {
            continue;
        }

        // Update crawling stats.
        updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);

        // Add page to the crawled list.
        crawledList.add(url);

        // Download the page at the given URL.
        String pageContents = downloadPage(verifiedUrl);

        /*
         * If the page was downloaded successfully, retrieve all its links
         * and then see if it contains the search string.
         */
        if (pageContents != null && pageContents.length() > 0) {
            // Retrieve list of valid links from page.
            ArrayList links = retrieveLinks(verifiedUrl, pageContents,
                    crawledList, limitHost);

            // Add links to the To Crawl list.
            toCrawlList.addAll(links);

            /*
             * Check if search string is present in page, and if so, record
             * a match.
             */
            if (searchStringMatches(pageContents, searchString,
                    caseSensitive)) {
                addMatch(url);
            }
        }

        // Update crawling stats.
        updateStats(url, crawledList.size(), toCrawlList.size(), maxUrls);
    }
}

/**
 * Runs the Search Crawler: creates the window and makes it visible.
 *
 * @param args command-line arguments (not used)
 */
public static void main(String[] args) {
    // Swing components should be created and shown on the event dispatch
    // thread, so the start-up work is handed over to it.
    javax.swing.SwingUtilities.invokeLater(new Runnable() {
        public void run() {
            SearchCrawler crawler = new SearchCrawler();
            // setVisible(true) replaces the deprecated show().
            crawler.setVisible(true);
        }
    });
}
}

Java 爬虫

时间: 2024-10-06 22:32:51

Java 爬虫的相关文章

JAVA爬虫 WebCollector

爬虫简介: WebCollector是一个无须配置、便于二次开发的JAVA爬虫框架(内核),它提供精简的API,只需少量代码即可实现一个功能强大的爬虫。 爬虫内核: WebCollector致力于维护一个稳定、可扩展的爬虫内核,便于开发者进行灵活的二次开发。内核具有很强的扩展性,用户可以在内核基础上开发自己想要的爬虫。源码中集成了Jsoup,可进行精准的网页解析。 量级: WebCollector最常用的爬取器BreadthCrawler使用2^24的布隆过滤器进行URL管理,可处理2^24量级

福利贴——爬取美女图片的Java爬虫小程序代码

自己做的一个Java爬虫小程序 废话不多说,先上图. 文件夹命名是用标签缩写,如果大家看得不顺眼可以等下载完成后手动改一下,比如像有强迫症的我一样... 这是挂了一个晚上下载的总大小,不过还有很多因为一些问题没有遍历下载到,而且会产生很多空文件,最下面我附带了一个递归删除空文件夹的小程序代码. 接下来是文件夹内部~ 图片存放位置默认为d:\picture,可在程序中更改,main函数的开头就是,有注释.爬取的网站为http://www.mmonly.cc/,大家有更好的资源网站可以私我. 爬虫源

Java爬虫

1.昨天复习了Java基础(I/O流)和正则表达式 今天不讲Java中的 I/O 主要用一个实例来爬取网站中的邮箱 代码如下: 1 package com.miao.baba.pacong; 2 3 import java.io.BufferedReader; 4 import java.io.IOException; 5 import java.io.InputStream; 6 import java.io.InputStreamReader; 7 import java.net.URL;

Java爬虫项目实战(一)

目的: 通过网络爬虫爬取中国最小粒度的区域维度信息,包括省(Province) .市(City).县(County).镇(town).村委会(village) 主网站链接: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html 主要jar包: http://jsoup.org/packages/jsoup-1.8.1.jar 之前一节我们说过java爬虫从网络上利用jsoup获取网页文本,也就是说我们可以有三种方法获取h

Java爬虫框架SeimiCrawler的工程自动打包工具使用

为了便于开发者对使用SeimiCrawler框架开发的爬虫工程的打包部署,SeimiCrawler现已推出maven-seimicrawler-plugin,一个maven工程的自动化打包插件。下面简要介绍下它的使用。 开始 pom添加plugin <plugin> <groupId>cn.wanghaomiao</groupId> <artifactId>maven-seimicrawler-plugin</artifactId> <

webmagic的设计机制及原理-如何开发一个Java爬虫 转

此文章是webmagic 0.1.0版的设计手册,后续版本的入门及用户手册请看这里:https://github.com/code4craft/webmagic/blob/master/user-manual.md 之前就有网友在博客里留言,觉得webmagic的实现比较有意思,想要借此研究一下爬虫.最近终于集中精力,花了三天时间,终于写完了这篇文章.之前垂直爬虫写了一年多,webmagic框架写了一个多月,这方面倒是有一些心得,希望对读者有帮助. webmagic的目标 一般来说,一个爬虫包括

Java爬虫实战(二):抓取一个视频网站上2015年所有电影的下载链接

前言:这是Java爬虫实战的第二篇文章,在第一篇文章仅仅只是抓取目标网站的链接的基础上,进一步提高难度,抓取目标页面上我们所需要的内容并保存在数据库中.这里的测试案例选用了一个我常用的电影下载网站(http://www.80s.la/).本来是想抓取网站上的所有电影的下载链接,后来感觉需要的时间太长,因此改成了抓取2015年电影的下载链接. 注:文末有我抓取到的整个列表的下载链接(包括:电影名称和迅雷下载链接) 一 原理简介 其实原理都跟第一篇文章差不多,不同的是鉴于这个网站的分类列表实在太多,

关于Java爬虫的研究

起因 最近突然发了羊癫疯,对爬虫十分感兴趣,开始想写几个爬虫练练手,于是,洗手开搞。 像我这种懒人,对爬虫了解个大概之后就开始偷懒了,开始找框架了,Google关键字“Java 爬虫”,第一个搜索结果就是 高票回答推荐的几款爬虫框架:nutch、Heritrix、crawler4j、WebCollector和WebMagic,果断选择了WebMagic,支持国人作品嘛(肯定是中文文档啊) 下手 使用Maven添加框架到项目中,在pom.xml文件中添加以下依赖。国内的Maven库居然没有Web

超简单的java爬虫

最简单的爬虫,不需要设定代理服务器,不需要设定cookie,不需要http连接池,使用httpget方法,只是为了获取html代码... 好吧,满足这个要求的爬虫应该是最基本的爬虫了.当然这也是做复杂的爬虫的基础. 使用的是httpclient4的相关API.不要跟我讲网上好多都是httpclient3的代码该怎么兼容的问题,它们差不太多,但是我们应该选择新的能用的接口! 当然,还是有很多细节可以去关注一下,比如编码问题(我一般都是强制用UTF-8的) 放码过来: 1 package chris

Java 爬虫工具Jsoup解析

Jsoup是一款 Java 的 HTML 解析器,可直接解析某个 URL 地址.HTML 文本内容.它提供了一套非常省力的 API,可通过 DOM,CSS 以及类似于 jQuery 的操作方法来取出和操作数据. jsoup 的主要功能如下: 1. 从一个 URL,文件或字符串中解析 HTML: 2. 使用 DOM 或 CSS 选择器来查找.取出数据: 3. 可操作 HTML 元素.属性.文本: jsoup 是基于 MIT 协议发布的,可放心使用于商业项目. jsoup 可以从包括字符串.URL