#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
#from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO
from HTMLParser import HTMLParser

'''The next three lines set the default encoding to utf8. If we do not do
this, Python falls back to the ascii codec and raises an error as soon as it
meets Unicode text. We import sys and then reload(sys) because the default
import of sys removes the setdefaultencoding function, so reload is needed
to bring it back.'''
import sys
reload(sys)
sys.setdefaultencoding('utf8')
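# A quick illustration of the problem the block above works around (a sketch
# of Python 2 behaviour, not part of the crawler itself): with the default
# 'ascii' codec, implicitly converting non-ASCII unicode text fails, e.g.
#
#   str(u'\u4e2d\u6587')   # UnicodeEncodeError under 'ascii';
#                          # succeeds after setdefaultencoding('utf8')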
class RetrieveURL(HTMLParser):      # derive a new parser class from HTMLParser
    def __init__(self):
        HTMLParser.__init__(self)
        # __init__ is overridden only to give each instance an anchorlist
        self.anchorlist = []

    def handle_starttag(self, tag, attrs):
        # override handle_starttag so that every <A> tag's href attribute
        # (the hyperlink) is recorded in anchorlist
        if tag == 'a' or tag == 'A':
            for t in attrs:
                if t[0] == 'href' or t[0] == 'HREF':
                    self.anchorlist.append(t[1])
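# A minimal sketch of RetrieveURL used on its own (the HTML snippet below is
# purely illustrative):
#
#   p = RetrieveURL()
#   p.feed('<a href="http://example.com/">link</a>')
#   p.close()
#   print p.anchorlist      # -> ['http://example.com/']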
class Retriever(object):            # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   ## parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':            # no file extension, use default
                                    # (e.g. a URL such as https://www.baidu.com/file1)
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)        # local directory
        if sep != '/':              # os-indep. path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):         # create archive dir if nec.
            if exists(ldir): unlink(ldir)
            print 'ldir is ', ldir
            makedirs(ldir)
        return path

    def download(self):             # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    '''def parseAndGetLinks(self):  # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist'''

    def parseAndGetLinks(self):     # parse HTML, save links
        self.parser = RetrieveURL()
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist
class Crawler(object):              # manage entire crawling process
    count = 0                       # static downloaded page counter

    def __init__(self, url):
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0][0] == '*':     # error situation (message starts with '*'), do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                   # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)
def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()


if __name__ == '__main__':
    main()
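# Example invocation (assuming the listing is saved as crawl.py; the URL is
# hypothetical). Pages are saved under a local directory tree named after the
# host, and only links within that host's domain are followed:
#
#   $ python crawl.py http://www.example.com/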