之前写了一个检查mysql从机的脚本(http://www.simonzhang.net/?p=1823),但是在使用中发现一个问题。如果数据库被重启了,但是同步线程没有启动,此脚本的检查结果仍然是正常,不会进行报警,而数据实际上已经不再同步。
我做了个调整,脚本通过crontab定时调用,每次检查时记录从机读取到的主机同步位置(Read_Master_Log_Pos),并与上一次记录的值比较,如果连续多次都没有变化则进行告警。例如10分钟调用一次,次数设为3次,就是半个小时内没有更新则告警。
crontab配置如下:
*/10 * * * * /bin/bash /script/check_mysql_slave/check_mysql_slave.sh start >/dev/null 2>&1
部分代码如下:
#!/usr/local/bin/python
# -*- coding:utf-8 -*-
# -------------------------------------------------------------------------------
# Filename:    check_nagios.py
# Revision:    1.1
# Date:        2013-06-24
# Author:      simonzhang
# Email:       simon-zzm@163.com
# -------------------------------------------------------------------------------
"""Mail an alert when the MySQL slave stops reading from its master.

Each run reads ``Read_Master_Log_Pos`` from ``show slave status`` and compares
it with the value stored by the previous run in ``state_file``.  When the
position has not advanced for ``max_error`` consecutive runs, a warning mail
is sent.  The script is meant to be called periodically (e.g. from cron).

The code runs unchanged on Python 2.6+ and Python 3.
"""
import os
import smtplib
import socket
import subprocess
import time
from email.mime.text import MIMEText

try:
    # Not used by the code below; kept because the original script imports it
    # and other parts of the script may rely on it.  Its absence must not stop
    # the replication check itself.
    import pexpect
except ImportError:
    pexpect = None

#### base settings
mysql_bin = '/program/mysql5/bin/mysql'
mysql_user = 'checkslavestatus'
mysql_pass = 'xxxxxxxxxx'
# Number of consecutive checks without progress after which an alert is sent.
max_error = 3
mail_host = 'smtp.exmail.qq.com'
mail_user = 'warning@xxx.net'
mail_pwd = 'xxxxxxxxx'
mail_cc = "simon-zzm@163.com"
# The original script used ``mail_to`` without ever defining it (NameError).
# NOTE(review): defaulting to the only address configured -- confirm recipient.
mail_to = mail_cc
# "<position>:<consecutive stalled checks>" as written by the previous run.
# Relative path: resolved against the working directory of the caller.
state_file = 'MasterPos.txt'
####


def mail_warn(error_ip):
    """Send the warning mail for the host identified by *error_ip*.

    Delivery is best effort: any failure is printed and swallowed so that a
    mail problem never aborts the check.
    """
    content = 'IP %s mysql slave is error!' % error_ip
    msg = MIMEText(content)
    msg['From'] = mail_user
    msg['Subject'] = 'mysql warnning %s' % error_ip
    msg['To'] = mail_to
    try:
        server = smtplib.SMTP()
        try:
            server.connect(mail_host)
            server.login(mail_user, mail_pwd)
            server.sendmail(mail_user, [mail_to], msg.as_string())
        finally:
            # Release the socket even when login/sendmail fails.
            server.close()
    except Exception as e:
        print(e)


def parse_state(text):
    """Return ``(position, stalled_count)`` parsed from ``"<pos>:<count>"``.

    Anything that does not match that layout yields ``(0, 0)``, the same
    state as on the very first run.
    """
    try:
        pos, count = text.strip().split(':')[:2]
        return int(pos), int(count)
    except ValueError:
        # Covers both a missing field and a non-numeric field.
        return 0, 0


def read_state(path):
    """Load the state of the previous run; ``(0, 0)`` if it is unreadable."""
    try:
        with open(path, 'r') as f:
            return parse_state(f.read())
    except (IOError, OSError):
        return 0, 0


def write_state(path, pos, count):
    """Persist the position and the stalled-check counter for the next run."""
    with open(path, 'w') as f:
        f.write('%s:%s' % (pos, count))


def query_slave_status():
    """Run ``show slave status\\G`` and return its output as a list of lines.

    The command is passed as an argument list, so the credentials are never
    interpreted by a shell.  Returns an empty list when the client cannot be
    started.
    """
    cmd = [mysql_bin, '-u%s' % mysql_user, '-p%s' % mysql_pass,
           '-e', 'show slave status\\G']
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                                universal_newlines=True)
        output = proc.communicate()[0]
    except OSError as e:
        print(e)
        return []
    return output.splitlines()


def parse_master_pos(status_lines):
    """Extract ``Read_Master_Log_Pos`` from the status output.

    Returns the position as an int, ``0`` when the field is present but empty
    or not numeric, and ``None`` when the field is missing altogether.
    """
    marker = 'Read_Master_Log_Pos: '
    for line in status_lines:
        if marker in line:
            try:
                return int(line.split(': ')[1])
            except (IndexError, ValueError):
                return 0
    return None


def next_error_count(old_pos, new_pos, error_count):
    """Return the stalled-check counter after observing *new_pos*.

    The counter grows while the position is unchanged or unreadable (0) and
    is reset as soon as the position moves.  The original condition
    ``new == old or int(old)`` was true for every non-zero old position, so
    the counter could never reset.
    """
    if new_pos == old_pos or new_pos == 0:
        return error_count + 1
    return 0


def extract_ip(ifconfig_out):
    """Return the first address from lines shaped like ``addr:1.2.3.4``.

    Returns an empty string when *ifconfig_out* contains no address.
    """
    first_line = ifconfig_out.strip().split('\n')[0].strip()
    return first_line.split(':')[-1]


def local_ip():
    """Return the first local IPv4 address, or the host name as a fallback."""
    raw = os.popen("/sbin/ifconfig|grep 'inet addr'|awk '{print $2}'").read()
    return extract_ip(raw) or socket.gethostname()


def main():
    """Compare the current read position with the stored one and alert."""
    error_context = ''
    # State recorded by the previous check.
    old_master_pos, error_count = read_state(state_file)
    # Current replication position reported by the slave.
    new_master_pos = parse_master_pos(query_slave_status())
    # When the status output has no position line the stored state is left
    # untouched, as in the original script.
    if new_master_pos is not None:
        error_count = next_error_count(old_master_pos, new_master_pos,
                                       error_count)
        write_state(state_file, new_master_pos, error_count)
        # Test the updated counter so that a recovered slave does not trigger
        # one more alert, and so that max_error stalled checks are enough.
        if error_count >= max_error:
            error_context += 'slave error!'
    # Decide whether to alert.
    print(error_context)
    if error_context:
        mail_warn("%s" % local_ip())


if __name__ == "__main__":
    main()