最近发生了一次因 HDFS 坏块导致的 Hive 计算问题，因此写了个脚本用来监控 HDFS 的坏块，脚本如下：
#!/usr/bin/python
# -*- coding: utf8 -*-
# edit by ericni
# 20140724
# monitor hdfs corrupt
"""Monitor HDFS for corrupt blocks and e-mail an HTML report.

The script runs ``hadoop fsck -list-corruptfileblocks``, keeps every output
line that mentions a block id (``blk_``) and is not located under ``.Trash``,
and, when at least one such line exists, sends an HTML table of the affected
blocks through ``sendmail.send_mail_withoutSSL``.
"""
import sys
import property
import sendmail
import re
import os

# Python 2 only: make implicit str<->unicode conversions use UTF-8 so that
# ``mailcontent.encode('utf-8')`` works on a byte string holding Chinese text.
# ``reload`` is not a builtin on Python 3, where this hack is unnecessary.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf-8')

FSCK_COMMAND = "hadoop fsck -list-corruptfileblocks"

# The original subject read "Haddop"; corrected to match the report title.
MAIL_SUBJECT = "Hadoop集群坏块监控"

# NOTE(review): this address looks like a redacted placeholder in this copy of
# the script -- replace it with the real recipient list before deploying.
MAIL_RECIPIENTS = ['[email protected]']

# Static head of the report. The meta tag previously closed the ``content``
# attribute before ``charset`` and the CSS padding value was quoted (invalid).
MAIL_HEADER = """
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Hadoop集群坏块监控</title>
<style type="text/css">
body { font-size: 14px; color: #333;background-color: #fff;}
td { border: 1px solid #C1DAD7;padding: 4px 10px; line-height: 24px;}
table {border-collapse: collapse; width: 96%;}
.divtd {color:#E28E40;}
.divcss5{ color:#F00;}
</style>
"""

# ``%s`` receives the number of corrupt blocks; ``%%`` is a literal percent.
MAIL_TABLE_HEAD = """
<p style="margin: 0; padding: 5px 0; line-height: 28px; font: bold 18px/1.5;">坏块数量 %s,具体信息如下:</p>
<div>
<table style="border-collapse: collapse; width: 96%%;">
<tbody>
<tr>
<td><div class="divtd">序号</div></td>
<td><div class="divtd">块号</div></td>
<td><div class="divtd">文件信息</div></td>
</tr>
"""

MAIL_TABLE_ROW = """
<tr>
<td><div>%s </div></td>
<td><div>%s </div></td>
<td><div>%s </div></td>
</tr>
"""

MAIL_TABLE_TAIL = "</tbody></table></div>"


def run_fsck(cmd=FSCK_COMMAND):
    """Run *cmd* through the shell and return its stdout as a list of lines.

    NOTE(review): the exit status is not inspected, so a command that fails
    without printing anything is indistinguishable from "no corrupt blocks".
    """
    pipe = os.popen(cmd)
    try:
        return pipe.readlines()
    finally:
        # The original never closed the pipe.
        pipe.close()


def find_corrupt_blocks(lines):
    """Return ``(block_id, file_info)`` pairs for the corrupt-block lines.

    A line qualifies when it contains ``blk_`` and does not contain
    ``.Trash``. The first whitespace-separated token is the block id; the
    remainder of the line (which may itself contain spaces) is the file
    information, or an empty string when the line has only one token.
    """
    blocks = []
    for line in lines:
        if "blk_" not in line or ".Trash" in line:
            continue
        fields = line.split(None, 1)
        file_info = fields[1].strip() if len(fields) > 1 else ""
        blocks.append((fields[0], file_info))
    return blocks


def build_mail_content(blocks):
    """Render the HTML report for the given ``(block_id, file_info)`` pairs."""
    parts = [MAIL_HEADER, MAIL_TABLE_HEAD % len(blocks)]
    for seq, (block_id, file_info) in enumerate(blocks, 1):
        parts.append(MAIL_TABLE_ROW % (seq, block_id, file_info))
    parts.append(MAIL_TABLE_TAIL)
    return "".join(parts)


def main():
    """Check HDFS for corrupt blocks and mail a report if any are found."""
    result = run_fsck()
    print(result)

    blocks = find_corrupt_blocks(result)
    if not blocks:
        return

    for block_id, file_info in blocks:
        print("blkid is " + block_id + " file is " + file_info)

    mailcontent = build_mail_content(blocks)
    print(mailcontent)
    sendmail.send_mail_withoutSSL(
        MAIL_SUBJECT, mailcontent.encode('utf-8'), MAIL_RECIPIENTS)


if __name__ == "__main__":
    main()
时间: 2024-10-13 12:35:56