之前写了一个检查mysql从机的脚本(http://www.simonzhang.net/?p=1823),但是在使用中发现一个问题:如果数据库被重启了,但是同步没有启动,此脚本的检查结果仍然正常,不会进行报警,而数据实际上并没有同步。
我做了个调整:每次检查时记录同步主机的pos,通过crontab定时调用,如果连续多次都没有变化则进行告警。例如每10分钟调用一次,阈值设为3次,就是半个小时内没有更新则报警。
crontab配置如下:
*/10 * * * * /bin/bash /script/check_mysql_slave/check_mysql_slave.sh start >/dev/null 2>&1
部分代码如下:
#!/usr/local/bin/python
# -*- coding:utf-8 -*-
# -------------------------------------------------------------------------------
# Filename: check_nagios.py
# Revision: 1.1
# Date: 2013-06-24
# Author: simonzhang
# Email: simon-zzm@163.com
# -------------------------------------------------------------------------------
import os
import pexpect
import time
import smtplib
from email.mime.text import MIMEText
#### base settings
# mysql client binary and the account used to run `show slave status`
mysql_bin = '/program/mysql5/bin/mysql'
mysql_user = 'checkslavestatus'
mysql_pass = 'xxxxxxxxxx'
# Threshold for the failed-check counter kept in MasterPos.txt; main() starts
# alerting once the counter exceeds this value.
max_error = 3
# SMTP server and sender account used for warning mails
mail_host = 'smtp.exmail.qq.com'
mail_user = 'warning@xxx.net'
mail_pwd = 'xxxxxxxxx'
mail_cc = "simon-zzm@163.com"
# mail_warn() addresses its message to `mail_to`, which was not defined in
# this section; default it to the mail_cc address so the call does not fail
# with a NameError.
# NOTE(review): confirm the intended primary recipient.
mail_to = mail_cc
####
def mail_warn(error_ip):
    """Send a plain-text warning mail about a broken MySQL slave.

    The message is sent from ``mail_user`` to ``mail_to`` through
    ``mail_host``; all three (plus ``mail_pwd``) are expected to be defined
    at module level.

    Args:
        error_ip: Address (or any label) identifying the affected host. It is
            interpolated into both the subject and the body.

    Returns:
        None. Delivery is best-effort: any SMTP failure is printed and
        swallowed so a mail problem never aborts the check itself.
    """
    # Collapse whitespace so a multi-line value (e.g. several addresses
    # separated by newlines) cannot break the Subject header.
    error_ip = ' '.join(str(error_ip).split())

    content = 'IP %s mysql slave is error!' % error_ip
    msg = MIMEText(content)
    msg['From'] = mail_user
    msg['Subject'] = 'mysql warnning %s' % error_ip
    msg['To'] = mail_to
    try:
        s = smtplib.SMTP()
        try:
            s.connect(mail_host)
            s.login(mail_user, mail_pwd)
            s.sendmail(mail_user, [mail_to], msg.as_string())
        finally:
            # Release the socket even when connect/login/sendmail fails.
            s.close()
    except Exception as e:
        print(e)
def _read_last_state(path):
    """Return ``(position, unchanged_count)`` saved by the previous run.

    The state file holds ``<pos>:<count>``. A missing, unreadable or
    malformed file yields ``(0, 0)``.
    """
    try:
        with open(path, 'r') as state_file:
            fields = state_file.read().split(':')
        return int(fields[0]), int(fields[1])
    except (IOError, OSError, IndexError, ValueError):
        return 0, 0


def _parse_master_pos(status_line):
    """Return the integer that follows ``': '`` in *status_line*, or 0."""
    try:
        return int(status_line.split(': ')[1])
    except (IndexError, ValueError):
        # Guard against an empty or non-numeric value.
        return 0


def _first_local_ip():
    """Return the first address reported by ifconfig's 'inet addr' lines.

    Returns an empty string when the command prints nothing.
    """
    pipe = os.popen("/sbin/ifconfig|grep 'inet addr'|awk '{print $2}'")
    try:
        output = pipe.read()
    finally:
        pipe.close()
    lines = output.splitlines()
    if not lines:
        return ''
    token = lines[0].strip()
    # awk yields tokens such as "addr:10.0.0.1"; keep what follows the colon.
    return token.partition(':')[2] if ':' in token else token


def main():
    """Check whether the slave's Read_Master_Log_Pos has advanced.

    Reads the position and unchanged-check counter stored in
    ``MasterPos.txt`` by the previous run, queries ``show slave status``,
    and rewrites the file with the current position. The counter is
    incremented when the position is unchanged and reset to 0 when it has
    moved. Once the counter exceeds ``max_error`` a warning mail is sent via
    ``mail_warn`` with the local IP address.

    Returns:
        None.
    """
    state_path = 'MasterPos.txt'
    error_context = ''

    # Load the master position recorded by the previous check.
    old_master_pos, error_count = _read_last_state(state_path)

    # Fetch the replication status ("\\G" is passed to mysql as \G).
    pipe = os.popen("%s -u%s -p%s -e 'show slave status\\G'" %
                    (mysql_bin, mysql_user, mysql_pass))
    try:
        status = pipe.readlines()
    finally:
        pipe.close()

    # NOTE(review): when the output has no Read_Master_Log_Pos line at all
    # (e.g. the mysql client fails), nothing is recorded and no alert is
    # raised -- same as before; confirm whether that case should alert too.
    for status_l in status:
        if 'Read_Master_Log_Pos: ' not in status_l:
            continue
        new_master_pos = _parse_master_pos(status_l)

        # Only an unchanged position counts as a failed check; any movement
        # resets the counter.
        if new_master_pos == old_master_pos:
            unchanged_count = error_count + 1
        else:
            unchanged_count = 0

        # Parse first, then write, so the state file is never left truncated.
        with open(state_path, 'w') as state_file:
            state_file.write('%s:%s' % (new_master_pos, unchanged_count))

        if unchanged_count > max_error:
            error_context += 'slave error!'

    # Decide whether to alert.
    print(error_context)
    if error_context:
        mail_warn("%s" % _first_local_ip())
# Run the check only when executed as a script (e.g. from cron), not on import.
if __name__ == "__main__":
    main()
源代码