As a student, you may have thought the term "web crawler" sounded rather impressive, yet a simple implementation is something even a student can readily understand.
A web crawler treats the entire Internet as exactly that: a web, like a spider's web, with the application as a spider crawling across it according to a set of rules.
HTTP(S) is by far the most widely used protocol on the Internet today, so the example in this article is based on HTTP(S). It is intended only as a demonstration and does not cover the sophisticated algorithms that, in practice, matter most.
Design idea:
The program starts from one or more entry URLs, fetches each URL's content over HTTP(S), processes that content to extract both the information we want to crawl and the URL links it contains, and then repeats these steps for the new links. In outline, that loop looks like the sketch below.
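A minimal single-threaded sketch of that loop (the fetch/extractData/extractLinks helpers are hypothetical placeholders, not part of this article's code; the real multi-threaded classes follow):

// Illustrative crawl loop only -- not the article's implementation.
// (uses java.util.Queue, LinkedList, Set, HashSet)
Queue<String> frontier = new LinkedList<String>();
frontier.add("http://example.com/");           // one or more entry URLs
Set<String> visited = new HashSet<String>();
while (!frontier.isEmpty()) {
    String url = frontier.poll();
    if (!visited.add(url)) continue;           // skip URLs we have already crawled
    String html = fetch(url);                  // hypothetical: GET the page over HTTP(S)
    extractData(html);                         // hypothetical: pull out the wanted information
    for (String link : extractLinks(html)) {   // hypothetical: collect <a href> targets
        frontier.add(link);                    // repeat the steps for each new link
    }
}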
Without further ado, the details are in the commented code:
import java.util.concurrent.LinkedBlockingDeque;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

/**
 * Overview: main program
 *
 * @author hwz
 */
public class MainApp {

    private Integer corePoolSize = 10;

    private Integer maxPoolSize = 20;

    private ThreadPoolExecutor executor;

    /** work queue */
    private SpiderQueue workQueue;

    public void start(String url) throws Exception {
        // initialize the thread pool
        LinkedBlockingDeque<Runnable> executorQueue = new LinkedBlockingDeque<Runnable>(maxPoolSize);
        executor = new ThreadPoolExecutor(corePoolSize, maxPoolSize, 60L, TimeUnit.SECONDS,
                executorQueue);

        workQueue = new SpiderQueue(1024);
        SpiderUrl spiderUrl = new SpiderUrl(url, 0);
        try {
            workQueue.add(spiderUrl);
        }
        catch (Exception e) {
            System.out.println("insert url into workQueue error,url=" + url);
            e.printStackTrace();
        }

        // submit the first task
        executor.submit(new SimpleSpider(workQueue, "thread-" + "main"));
        int i = 0;
        int idle = 0;
        while (true) {
            // decide whether to add more worker threads
            if (workQueue.size() > 20 && executor.getActiveCount() < maxPoolSize) {
                idle = 0;
                System.out.println("submit new thread,workQueue.size=" + workQueue.size() +
                        ",executorQueue.activeCount=" + executor.getActiveCount() + ",i=" + i);
                executor.submit(new SimpleSpider(workQueue, "thread-" + i++));
                Thread.sleep(500);
            }
            else if (workQueue.size() == 0) {
                idle++;
                System.out.println("main method, idle times=" + idle);
                // if the main thread has been idle 20 times in a row, stop
                if (idle > 20) {
                    System.out.println("main method, idle times=" + idle + ",end!");
                    break;
                }
                Thread.sleep(1000);
            }
            else {
                Thread.sleep(2000);
            }
        }
        System.out.println("End!,workQueue.size=" + workQueue.size() +
                ",executorQueue.activeCount=" + executor.getActiveCount() + ",executorQueue.CompletedTaskCount=" +
                executor.getCompletedTaskCount() + ",i=" + i);
        workQueue.printAll();
        executor.shutdown();
        System.exit(0);
    }

    public static void main(String[] args) throws Exception {
        MainApp app = new MainApp();
        app.start("http://www.csdn.net/");
    }
}
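One caveat about this setup: the executor's work queue is bounded at maxPoolSize, so a burst of submit() calls beyond pool-plus-queue capacity would throw a RejectedExecutionException. One way to degrade gracefully (my variant, not part of the original code) is to install the standard CallerRunsPolicy, which runs the overflow task on the submitting thread:

// Assumed variant of the pool construction in start(): a full executor
// queue makes submit() run the task in the caller's thread instead of
// throwing RejectedExecutionException.
executor = new ThreadPoolExecutor(corePoolSize, maxPoolSize, 60L, TimeUnit.SECONDS,
        new LinkedBlockingDeque<Runnable>(maxPoolSize),
        new ThreadPoolExecutor.CallerRunsPolicy());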
import java.util.ArrayList;
import java.util.List;

/**
 * Overview: custom synchronized work queue for the crawler, implemented with an ArrayList
 *
 * @author hwz
 */
public class SpiderQueue {

    /** backing store */
    private List<SpiderUrl> queue;

    public SpiderQueue(int size) {
        queue = new ArrayList<SpiderUrl>(size);
    }

    public synchronized void add(SpiderUrl spiderUrl) {
        queue.add(spiderUrl);
    }

    public synchronized SpiderUrl poll() {
        if (queue.isEmpty()) {
            return null;
        }
        // print to the console for easier inspection
        SpiderUrl spiderUrl = queue.remove(0);
        System.out.println("SpiderQueue,poll,SpiderUrl=" + spiderUrl.toString() + ",remain size=" + queue.size());
        return spiderUrl;
    }

    public synchronized SpiderUrl peek() {
        if (queue.isEmpty()) {
            return null;
        }
        return queue.get(0);
    }

    public synchronized boolean isExist(SpiderUrl spiderUrl) {
        return queue.contains(spiderUrl);
    }

    public synchronized int size() {
        return queue.size();
    }

    public void printAll() {
        System.out.println("Enter printAll.");
        for (SpiderUrl spiderUrl : queue) {
            System.out.println(spiderUrl);
        }
    }
}
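Note that isExist only checks URLs still waiting in the queue; once a URL has been polled it is forgotten, so two pages linking to the same address can both get it crawled. A minimal sketch of one way to close that gap (the seen set below is my addition, not part of the original design; it would live inside SpiderQueue and needs java.util.HashSet/Set imports):

// Assumed extension: remember every URL ever enqueued, so URLs that have
// already been polled and crawled are never re-added.
private Set<SpiderUrl> seen = new HashSet<SpiderUrl>();

public synchronized boolean addIfNew(SpiderUrl spiderUrl) {
    if (!seen.add(spiderUrl)) {   // relies on SpiderUrl.equals/hashCode (url only)
        return false;             // already enqueued or crawled before
    }
    queue.add(spiderUrl);
    return true;
}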
/**
 * Overview: a URL for the crawler to work on
 *
 * @author hwz
 */
public class SpiderUrl {

    /** http(s) url */
    private String url;

    /** how many levels deep this url is relative to the entry url */
    private int deep;

    public SpiderUrl(String url, int deep) {
        this.url = url;
        this.deep = deep;
    }

    public String getUrl() {
        return url;
    }

    public void setUrl(String url) {
        this.url = url;
    }

    public int getDeep() {
        return deep;
    }

    public void setDeep(int deep) {
        this.deep = deep;
    }

    @Override
    public boolean equals(Object obj) {
        if (!(obj instanceof SpiderUrl)) {
            return false;
        }
        SpiderUrl oth = (SpiderUrl) obj;
        return this.url.equals(oth.getUrl());
    }

    @Override
    public int hashCode() {
        return url.hashCode();
    }

    @Override
    public String toString() {
        return getClass().toString() + "[url:" + url + ",deep:" + deep + "]";
    }
}
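Note that equals and hashCode compare only the url field, so deduplication ignores the depth at which a link was discovered. For example (illustrative snippet):

// Two SpiderUrls with the same address are equal regardless of depth,
// which is exactly what SpiderQueue.isExist relies on.
SpiderUrl a = new SpiderUrl("http://www.csdn.net/", 0);
SpiderUrl b = new SpiderUrl("http://www.csdn.net/", 2);
System.out.println(a.equals(b));   // prints: true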
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Overview: the crawler worker, where the main logic lives
 *
 * @author hwz
 */
public class SimpleSpider implements Runnable {

    private String threadName;

    private SpiderUrl url;

    private SpiderQueue workQueue;

    public SimpleSpider(SpiderQueue workQueue, String threadName) {
        this.workQueue = workQueue;
        this.threadName = threadName;
    }

    @Override
    public void run() {
        System.out.println(threadName + " start run");
        // after 10 consecutive idle loops, finish the task
        int idle = 0;
        while (idle < 10) {
            url = workQueue.poll();
            if (url != null) {
                // parse the url
                parseUrl(url);
                idle = 0;
            }
            else {
                System.out.println(threadName + " idle...,times=" + idle++);
                try {
                    Thread.sleep(1000);
                }
                catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
        }
        System.out.println(threadName + " end run...");
    }

    /**
     * Parse one url.
     * @param url
     */
    private void parseUrl(SpiderUrl url) {
        if (url == null) {
            return;
        }
        try {
            int deep = url.getDeep() + 1;
            URL netUrl = new URL(url.getUrl());
            URLConnection connection = netUrl.openConnection();
            String contentType = connection.getContentType();
            // fetch the content
            String resource = getResource(connection);
            // extract the title
            String title = getTitle(resource);
            // extract the links
            List<String> urls = getUrls(resource);
            System.out.println(threadName + ",parseUrl url=" + url + ",contentType=" + contentType + ",title=" + title + ",urls=" + urls);
            // limit the crawl depth: if every discovered url were added to the work
            // queue, it would grow exponentially and the program would eventually die
            if (deep < 3) {
                SpiderUrl newUrl;
                for (String u : urls) {
                    newUrl = new SpiderUrl(u, deep);
                    if (!workQueue.isExist(newUrl)) {
                        workQueue.add(newUrl);
                    }
                }
            }
        }
        catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Read the content behind an http url.
     * @param connection
     * @return the body as a String
     */
    private String getResource(URLConnection connection) {
        if (connection == null) {
            return null;
        }
        StringBuilder sb = new StringBuilder();
        try {
            InputStream inputStream = connection.getInputStream();
            InputStreamReader isr = new InputStreamReader(inputStream, "UTF-8");
            try {
                int input;
                while ((input = isr.read()) != -1) {
                    sb.append((char) input);
                }
            }
            finally {
                isr.close();
            }
        }
        catch (IOException e) {
            System.out.println(threadName + ",get resource error,connection=" + connection);
        }
        return sb.toString();
    }

    /**
     * Extract the title from the page content.
     * @param content
     * @return the title, or null if none was found
     */
    private String getTitle(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<title>.{1,}</title>)");
        Matcher matcher = pattern.matcher(content);
        String title = null;
        if (matcher.find()) {
            title = matcher.group(0).replaceAll("<title>", "").replaceAll("</title>", "");
        }
        return title;
    }

    /**
     * Extract the url links present in the page content.
     * @param content
     * @return the list of links
     */
    private List<String> getUrls(String content) {
        if (content == null) {
            return null;
        }
        Pattern pattern = Pattern.compile("(<a.{1,}?href=['\"]?[a-zA-Z]+:\\/\\/[^\\s]*?[\\s>]{1})");
        Matcher matcher = pattern.matcher(content);
        String a;
        String lastChar;
        List<String> links = new ArrayList<String>();
        while (matcher.find()) {
            a = matcher.group(0).replaceAll("<a.{1,}?href=['\"]?", "");
            a = a.trim();
            lastChar = a.substring(a.length() - 1);
            if (lastChar.equals("'") || lastChar.equals("\"") || lastChar.equals(">")) {
                a = a.substring(0, a.length() - 1);
            }
            links.add(a);
        }
        return links;
    }
}
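Regex-based extraction like getUrls breaks easily on real-world HTML (unquoted attributes, relative links, line breaks inside tags). If pulling in a dependency is acceptable, an HTML parser is far more robust. Here is a minimal sketch, assuming the jsoup library is on the classpath (this is my alternative, not the article's approach):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import java.util.ArrayList;
import java.util.List;

public class JsoupLinkExtractor {
    // Parse the page and return absolute link targets. "baseUrl" lets jsoup
    // resolve relative hrefs, which the regex version cannot handle.
    public static List<String> extractLinks(String html, String baseUrl) {
        List<String> links = new ArrayList<String>();
        Document doc = Jsoup.parse(html, baseUrl);
        for (Element a : doc.select("a[href]")) {
            String href = a.attr("abs:href"); // absolute URL, empty if unresolvable
            if (!href.isEmpty()) {
                links.add(href);
            }
        }
        return links;
    }
}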
This code sample is only meant to illustrate a simple crawler; the multi-threading and HTTP handling have not been considered in much depth, so if you spot any mistakes, please point them out.