转自:http://blog.csdn.net/huangxy10/article/details/8120106 备注:把项目属性中的字符集改成多字节集合? 1 // 网络爬虫.cpp : 定义控制台应用程序的入口点。 2 // 3 4 #include "stdafx.h" 5 /* 6 7 int _tmain(int argc, _TCHAR* argv[]) 8 { 9 return 0; 10 } 11 12 */ 13 14 //#include <Windows.h> 15 #include <string> 16 #include <iostream> 17 #include <fstream> 18 #include <vector> 19 #include "winsock2.h" 20 #include <time.h> 21 #include <queue> 22 #include <hash_set> 23 24 #pragma comment(lib, "ws2_32.lib") 25 using namespace std; 26 27 #define DEFAULT_PAGE_BUF_SIZE 1048576 28 29 queue<string> hrefUrl; 30 hash_set<string> visitedUrl; 31 hash_set<string> visitedImg; 32 int depth=0; 33 int g_ImgCnt=1; 34 35 //解析URL,解析出主机名,资源名 36 bool ParseURL( const string & url, string & host, string & resource){ 37 if ( strlen(url.c_str()) > 2000 ) { 38 return false; 39 } 40 41 const char * pos = strstr( url.c_str(), "http://" ); 42 if( pos==NULL ) pos = url.c_str(); 43 else pos += strlen("http://"); 44 if( strstr( pos, "/")==0 ) 45 return false; 46 char pHost[100]; 47 char pResource[2000]; 48 sscanf( pos, "%[^/]%s", pHost, pResource ); 49 host = pHost; 50 resource = pResource; 51 return true; 52 } 53 54 //使用Get请求,得到响应 55 bool GetHttpResponse( const string & url, char * &response, int &bytesRead ){ 56 string host, resource; 57 if(!ParseURL( url, host, resource )){ 58 cout << "Can not parse the url"<<endl; 59 return false; 60 } 61 62 //建立socket 63 struct hostent * hp= gethostbyname( host.c_str() ); 64 if( hp==NULL ){ 65 cout<< "Can not find host address"<<endl; 66 return false; 67 } 68 69 SOCKET sock = socket( AF_INET, SOCK_STREAM, IPPROTO_TCP); 70 if( sock == -1 || sock == -2 ){ 71 cout << "Can not create sock."<<endl; 72 return false; 73 } 74 75 //建立服务器地址 76 SOCKADDR_IN sa; 77 sa.sin_family = AF_INET; 78 sa.sin_port = htons( 80 ); 79 //char addr[5]; 80 //memcpy( addr, hp->h_addr, 4 ); 81 //sa.sin_addr.s_addr = inet_addr(hp->h_addr); 82 memcpy( &sa.sin_addr, hp->h_addr, 4 ); 83 84 //建立连接 85 if( 0!= connect( sock, (SOCKADDR*)&sa, 
sizeof(sa) ) ){ 86 cout << "Can not connect: "<< url <<endl; 87 closesocket(sock); 88 return false; 89 }; 90 91 //准备发送数据 92 string request = "GET " + resource + " HTTP/1.1\r\nHost:" + host + "\r\nConnection:Close\r\n\r\n"; 93 94 //发送数据 95 if( SOCKET_ERROR ==send( sock, request.c_str(), request.size(), 0 ) ){ 96 cout << "send error" <<endl; 97 closesocket( sock ); 98 return false; 99 } 100 101 //接收数据 102 int m_nContentLength = DEFAULT_PAGE_BUF_SIZE; 103 char *pageBuf = (char *)malloc(m_nContentLength); 104 memset(pageBuf, 0, m_nContentLength); 105 106 bytesRead = 0; 107 int ret = 1; 108 cout <<"Read: "; 109 while(ret > 0){ 110 ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0); 111 112 if(ret > 0) 113 { 114 bytesRead += ret; 115 } 116 117 if( m_nContentLength - bytesRead<100){ 118 cout << "\nRealloc memorry"<<endl; 119 m_nContentLength *=2; 120 pageBuf = (char*)realloc( pageBuf, m_nContentLength); //重新分配内存 121 } 122 cout << ret <<" "; 123 } 124 cout <<endl; 125 126 pageBuf[bytesRead] = ‘\0‘; 127 response = pageBuf; 128 closesocket( sock ); 129 return true; 130 //cout<< response <<endl; 131 } 132 133 //提取所有的URL以及图片URL 134 void HTMLParse ( string & htmlResponse, vector<string> & imgurls, const string & host ){ 135 //找所有连接,加入queue中 136 const char *p= htmlResponse.c_str(); 137 char *tag="href=\""; 138 const char *pos = strstr( p, tag ); 139 ofstream ofile("url.txt", ios::app); 140 while( pos ){ 141 pos +=strlen(tag); 142 const char * nextQ = strstr( pos, "\"" ); 143 if( nextQ ){ 144 char * url = new char[ nextQ-pos+1 ]; 145 //char url[100]; //固定大小的会发生缓冲区溢出的危险 146 sscanf( pos, "%[^\"]", url); 147 string surl = url; // 转换成string类型,可以自动释放内存 148 if( visitedUrl.find( surl ) == visitedUrl.end() ){ 149 visitedUrl.insert( surl ); 150 ofile << surl<<endl; 151 hrefUrl.push( surl ); 152 } 153 pos = strstr(pos, tag ); 154 delete [] url; // 释放掉申请的内存 155 } 156 } 157 ofile << endl << endl; 158 ofile.close(); 159 160 tag ="<img "; 161 const char* att1= "src=\""; 162 
const char* att2="lazy-src=\""; 163 const char *pos0 = strstr( p, tag ); 164 while( pos0 ){ 165 pos0 += strlen( tag ); 166 const char* pos2 = strstr( pos0, att2 ); 167 if( !pos2 || pos2 > strstr( pos0, ">") ) { 168 pos = strstr( pos0, att1); 169 if(!pos) { 170 pos0 = strstr(att1, tag ); 171 continue; 172 } else { 173 pos = pos + strlen(att1); 174 } 175 } 176 else { 177 pos = pos2 + strlen(att2); 178 } 179 180 const char * nextQ = strstr( pos, "\""); 181 if( nextQ ){ 182 char * url = new char[nextQ-pos+1]; 183 sscanf( pos, "%[^\"]", url); 184 cout << url<<endl; 185 string imgUrl = url; 186 if( visitedImg.find( imgUrl ) == visitedImg.end() ){ 187 visitedImg.insert( imgUrl ); 188 imgurls.push_back( imgUrl ); 189 } 190 pos0 = strstr(pos0, tag ); 191 delete [] url; 192 } 193 } 194 cout << "end of Parse this html"<<endl; 195 } 196 197 //把URL转化为文件名 198 string ToFileName( const string &url ){ 199 string fileName; 200 fileName.resize( url.size()); 201 int k=0; 202 for( int i=0; i<(int)url.size(); i++){ 203 char ch = url[i]; 204 if( ch!=‘\\‘&&ch!=‘/‘&&ch!=‘:‘&&ch!=‘*‘&&ch!=‘?‘&&ch!=‘"‘&&ch!=‘<‘&&ch!=‘>‘&&ch!=‘|‘) 205 fileName[k++]=ch; 206 } 207 return fileName.substr(0,k) + ".txt"; 208 } 209 210 //下载图片到img文件夹 211 void DownLoadImg( vector<string> & imgurls, const string &url ){ 212 213 //生成保存该url下图片的文件夹 214 string foldname = ToFileName( url ); 215 foldname = "./img/"+foldname; 216 if(!CreateDirectory( (LPCSTR)foldname.c_str(),NULL )) 217 cout << "Can not create directory:"<< foldname<<endl; 218 char *image; 219 int byteRead; 220 for( int i=0; i<imgurls.size(); i++){ 221 //判断是否为图片,bmp,jgp,jpeg,gif 222 string str = imgurls[i]; 223 int pos = str.find_last_of("."); 224 if( pos == string::npos ) 225 continue; 226 else{ 227 string ext = str.substr( pos+1, str.size()-pos-1 ); 228 if( ext!="bmp"&& ext!="jpg" && ext!="jpeg"&& ext!="gif"&&ext!="png") 229 continue; 230 } 231 //下载其中的内容 232 if( GetHttpResponse(imgurls[i], image, byteRead)){ 233 if ( strlen(image) ==0 ) { 234 continue; 235 
} 236 const char *p=image; 237 const char * pos = strstr(p,"\r\n\r\n")+strlen("\r\n\r\n"); 238 int index = imgurls[i].find_last_of("/"); 239 if( index!=string::npos ){ 240 string imgname = imgurls[i].substr( index , imgurls[i].size() ); 241 ofstream ofile( foldname+imgname, ios::binary ); 242 if( !ofile.is_open() ) 243 continue; 244 cout <<g_ImgCnt++<< foldname+imgname<<endl; 245 ofile.write( pos, byteRead- (pos-p) ); 246 ofile.close(); 247 } 248 free(image); 249 } 250 } 251 } 252 253 254 255 //广度遍历 256 void BFS( const string & url ){ 257 char * response; 258 int bytes; 259 // 获取网页的相应,放入response中。 260 if( !GetHttpResponse( url, response, bytes ) ){ 261 cout << "The url is wrong! ignore." << endl; 262 return; 263 } 264 string httpResponse=response; 265 free( response ); 266 string filename = ToFileName( url ); 267 ofstream ofile( "./html/"+filename ); 268 if( ofile.is_open() ){ 269 // 保存该网页的文本内容 270 ofile << httpResponse << endl; 271 ofile.close(); 272 } 273 vector<string> imgurls; 274 //解析该网页的所有图片链接,放入imgurls里面 275 HTMLParse( httpResponse, imgurls, url ); 276 277 //下载所有的图片资源 278 DownLoadImg( imgurls, url ); 279 } 280 281 void main() 282 { 283 //初始化socket,用于tcp网络连接 284 WSADATA wsaData; 285 if( WSAStartup(MAKEWORD(2,2), &wsaData) != 0 ){ 286 return; 287 } 288 289 // 创建文件夹,保存图片和网页文本文件 290 CreateDirectory((LPCSTR) "./img",0); 291 CreateDirectory((LPCSTR)"./html",0); 292 //string urlStart = "http://hao.360.cn/meinvdaohang.html"; 293 294 // 遍历的起始地址 295 // string urlStart = "http://www.wmpic.me/tupian"; 296 string urlStart = "http://item.taobao.com/item.htm?spm=a230r.1.14.19.sBBNbz&id=36366887850&ns=1#detail"; 297 298 // 使用广度遍历 299 // 提取网页中的超链接放入hrefUrl中,提取图片链接,下载图片。 300 BFS( urlStart ); 301 302 // 访问过的网址保存起来 303 visitedUrl.insert( urlStart ); 304 305 while( hrefUrl.size()!=0 ){ 306 string url = hrefUrl.front(); // 从队列的最开始取出一个网址 307 cout << url << endl; 308 BFS( url ); // 遍历提取出来的那个网页,找它里面的超链接网页放入hrefUrl,下载它里面的文本,图片 309 hrefUrl.pop(); // 遍历完之后,删除这个网址 310 } 311 
WSACleanup(); 312 return; 313 }
// Time: 2024-10-25 13:05:03