因为工作需要,要用深度学习识别恶意二进制文件,所以写了个爬虫抓取一些样本资源。
# -*- coding: utf-8 -*-
"""Download the sample archives listed on malwaredb.malekal.com.

The script fetches an index page of the site, extracts every
``./files.php?file=<hash>`` link from the HTML, and saves each linked
file as ``<hash>.zip`` under ``DOWNLOAD_DIR``.  Progress and failures
are written to ``LOG_PATH``.

NOTE: the downloaded files are malware samples; run this only on an
isolated analysis machine.
"""
import logging
import os
import re
import sys

import requests

if sys.version_info[0] < 3:
    # Python 2 only: force the implicit str<->unicode codec to UTF-8.
    # ``reload`` and ``setdefaultencoding`` do not exist on Python 3,
    # where text is already unicode, so the hack is skipped there.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

BASE_URL = 'http://malwaredb.malekal.com'
FILE_URL_PREFIX = BASE_URL + '/files.php?file='
DOWNLOAD_DIR = '/home/rrjia/TestData/'
LOG_PATH = '/home/rrjia/Python/test.log'
REQUEST_TIMEOUT = 120  # seconds, applied to every HTTP request

# Matches index-page anchors such as
#   <a href="./files.php?file=25e8bf41343bda75a9170aad44094647">
# and captures the file hash.
FILE_LINK_RE = re.compile(r'<a href="\./files\.php\?file=(\w+)">')

logger = logging.getLogger("rrjia")
formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
file_handler = logging.FileHandler(LOG_PATH)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.setLevel(logging.INFO)


def _index_url(page):
    """Return the index URL for *page*; page 0 is the site root."""
    if page == 0:
        return BASE_URL
    return "%s/index.php?page=%d" % (BASE_URL, page)


def _extract_file_hashes(html):
    """Return the file hashes of all download links found in *html*."""
    return FILE_LINK_RE.findall(html)


def _download_sample(file_hash):
    """Fetch one sample and store it as ``<file_hash>.zip``.

    Raises ``requests.RequestException`` on network or HTTP errors and
    ``IOError``/``OSError`` when the file cannot be written.
    """
    response = requests.get(FILE_URL_PREFIX + file_hash,
                            timeout=REQUEST_TIMEOUT)
    # Without this check an HTTP error page would be saved as a ".zip".
    response.raise_for_status()
    target = os.path.join(DOWNLOAD_DIR, file_hash + ".zip")
    with open(target, "wb") as sample:
        sample.write(response.content)


def crawl_page(page, count=1):
    """Download every sample linked from index page *page*.

    *count* is the running 1-based file counter used in the log
    messages; it advances for every link, whether or not the download
    succeeded.

    Returns a ``(next_count, error_count)`` tuple: the counter value to
    pass to the next call and the number of downloads that failed on
    this page.  A failure to fetch the index page itself is raised to
    the caller.
    """
    response = requests.get(_index_url(page), timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    file_hashes = _extract_file_hashes(response.text)
    logger.info("%d page contains %d virus file", page, len(file_hashes))

    error_count = 0
    for file_hash in file_hashes:
        try:
            _download_sample(file_hash)
        except Exception:
            # Best-effort crawl: one bad sample must not stop the run,
            # but the URL and traceback are recorded for follow-up.
            error_count += 1
            logger.exception("this url error download failed: %s",
                             FILE_URL_PREFIX + file_hash)
        else:
            logger.info("success download %d page %d file %s.zip",
                        page, count, file_hash)
        count += 1
    return count, error_count


def main():
    """Crawl the first index page (page 0) and log a summary.

    The original script carried a disabled loop over index pages
    1..827; to crawl those as well, call ``crawl_page(page, count)``
    for ``page in range(1, 828)``, threading ``count`` through.
    """
    next_count, error_count = crawl_page(0)
    logger.info("finished: %d file(s) attempted, %d failed",
                next_count - 1, error_count)


if __name__ == '__main__':
    main()
时间: 2024-11-09 12:03:12