


我原本希望能只下载感兴趣的文章。网页上每篇文章都对应一个勾选框,勾选后对应的文章就会高亮,说实话我不知道网站用这个来干什么。也许我可以先勾选感兴趣的文章,然后再下载。勾选后,这个元素的 class 会从 noselectrow 变为 selectedrow。相关的代码如下:

function hightlightrowaction(rowid) {
    var thisrow = $("#"+rowid);
    if ($(thisrow).hasClass("selectedrow")) {
    } else {



# -*- coding: utf-8 -*-
"""
This script automatically downloads articles from 《物理》 (http://www.wuli.ac.cn/CN/volumn/home.shtml).

Example usage:

    downloadFiles(u'f:\\物理\\', "http://www.wuli.ac.cn/CN/volumn/volumn_1696.shtml")
"""
 8 import requests
 9 from bs4 import BeautifulSoup
10 import urllib
11 import re
12 import os
def hasDownloadLink(tag):
    """
    Tell whether a tag carries the site's article-download click handler.

    Meant to be passed as a filter function to BeautifulSoup's ``find``.

    :param tag: element offering ``has_attr`` and item access (e.g. a bs4 Tag)
    :return: True if the tag has an ``onclick`` attribute whose value starts
             with ``showArticleFile``, False otherwise
    """
    return tag.has_attr('onclick') and tag['onclick'].startswith('showArticleFile')
def getFileTypeAndID(fileInfo):
    """
    Extract the attachment type and file ID from a download ``onclick`` value.

    The pattern skips the handler's first argument, takes the quoted second
    argument as the file type and everything up to the closing parenthesis of
    the third argument as the file ID.

    :param fileInfo: text of the ``onclick`` attribute
    :return: tuple ``(file type, file ID)``, both as strings
    :raises ValueError: if ``fileInfo`` does not have the expected shape
    """
    m = re.match(r'[^,]*,\s*[\'\"](.*)[\'\"][^,]*,\s*([^\)]*).*', fileInfo)
    if m is None:
        # Fail with a message that names the offending text instead of an
        # AttributeError on None.
        raise ValueError("unrecognised download handler: %r" % (fileInfo,))
    fileType, fileID = m.groups()
    return fileType, fileID
def getPublicationYearMonth(tag):
    """
    Read the publication date out of a tag's text.

    :param tag: element offering ``get_text()`` (e.g. a bs4 Tag)
    :return: publication year and month in the form YYYY-MM
    :raises ValueError: if the text contains nothing shaped like YYYY-MM
    """
    text = tag.get_text()
    # First try the anchored greedy match (last date on the first line).
    # "." does not cross a newline, so fall back to a search over the whole
    # text when the date sits on a later line.
    m = re.match(r'.*(\d{4}-\d{2})', text) or re.search(r'(\d{4}-\d{2})', text)
    if m is None:
        raise ValueError("no YYYY-MM date found in %r" % (text,))
    return m.group(1)
def modifyFileName(fname):
    """
    Remove characters that Windows does not allow in a file name.

    Strips ``\\ / : * ? " < > |`` as well as control characters (which show up
    as line breaks or tabs inside scraped titles).

    :param fname: proposed file name (not a path; separators are removed)
    :return: the name with every forbidden character deleted
    """
    return re.sub(r'[\\/:*?"<>|\x00-\x1f]', '', fname)
def writeLog(saveDirectory, errMsg):
    """
    Write the collected error messages to ``download log.txt``.

    Any existing log in the directory is overwritten. Each message is written
    on its own line, encoded as UTF-8.

    :param saveDirectory: directory that receives the log (a trailing path
                          separator is accepted but not required)
    :param errMsg: iterable of text messages
    :return: None
    """
    logPath = os.path.join(saveDirectory, "download log.txt")
    # Binary mode: the messages are encoded by hand, which behaves the same on
    # Python 2 and Python 3.
    with open(logPath, 'wb') as fhandle:
        for msg in errMsg:
            fhandle.write((msg + os.linesep).encode('utf-8'))
def downloadFiles(saveDirectory, url, onlyDownloadSeleted = False):
    """
    Download the articles listed on one issue page.

    Files are stored in ``<saveDirectory>/<YYYY>/<YYYY-MM>/``, named after the
    article title. Articles whose file already exists are skipped. Failures
    are collected and written to ``download log.txt`` in that same directory.

    :param saveDirectory: directory to store the downloaded files
    :param url: url of the download page
    :param onlyDownloadSeleted: not implemented yet. Ideally, it should allow
        one to download only interested instead of all files. When True, only
        rows with class ``selectedrow`` are processed.
        NOTE(review): that class appears to be set by client-side script, so
        the fetched page presumably never contains such rows - confirm.
    :return: None
    :raises ValueError: if the page has no volume/date header to derive the
        target directory from
    """
    # Fetch the listing with requests (already used for the attachments):
    # urllib.urlopen exists on Python 2 only.
    page = requests.get(url, timeout=60)
    page.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(page.content, "html.parser")

    volumeAndDateTag = soup.find(class_="STYLE5")
    if volumeAndDateTag is None:
        raise ValueError("no volume/date header (class STYLE5) found at %s" % url)
    yearMonth = getPublicationYearMonth(volumeAndDateTag)
    year = yearMonth[:4]
    # The empty last component keeps a trailing separator on the path.
    absolutePath = os.path.join(saveDirectory, year, yearMonth, "")
    if not os.path.isdir(absolutePath):
        os.makedirs(absolutePath)

    articleMark = "selectedrow" if onlyDownloadSeleted else "noselectrow"
    articles = soup.find_all(class_=articleMark)
    total = len(articles)
    errMsg = []
    for index, article in enumerate(articles, 1):
        print('Downloading the %d th file, %d left.' % (index, total - index))
        # Placeholder label in case the title itself cannot be read.
        title = u"article #%d" % index
        try:
            # the title of one article is contained in the first anchor
            title = modifyFileName(article.find('a').get_text())
            _downloadArticle(article, title, absolutePath)
        except Exception as exc:
            # One broken article must not stop the rest; Ctrl-C still works
            # because KeyboardInterrupt is not an Exception.
            errMsg.append(u"%s download failed (%r)" % (title, exc))

    if errMsg:
        writeLog(absolutePath, errMsg)


def _downloadArticle(article, title, targetDirectory):
    """Fetch one article's attachment into targetDirectory unless the file already exists."""
    downloadAnchor = article.find(hasDownloadLink)
    if downloadAnchor is None:
        raise ValueError("no download link found")
    fileType, fileID = getFileTypeAndID(downloadAnchor['onclick'])
    filePath = os.path.join(targetDirectory, title + '.' + fileType.lower())
    if os.path.exists(filePath):
        return
    param = {"attachType": fileType, "id": fileID}
    articleFile = requests.get("http://www.wuli.ac.cn/CN/article/downloadArticleFile.do",
                               params=param, timeout=60)
    # Do not save an HTTP error page under the article's name.
    articleFile.raise_for_status()
    with open(filePath, "wb") as fhandle:
        fhandle.write(articleFile.content)
if __name__ == "__main__":
    # Example run: fetch one issue into f:\物理\ (Windows path, trailing
    # separator included).
    downloadFiles(u'f:\\物理\\', "http://www.wuli.ac.cn/CN/volumn/volumn_921.shtml")

