Python | Monitoring, Alarms

之前项目上,写过一个小功能:

  • 周期性监测设备连通性
  • 如果down,发送告警邮件(一次)
  • 24小时后一直没恢复,再发一次
  • 凡是有up-down状态切换,都发
  • 发告警的同时,自动干点其它操作,例如重启该设备

这种监控需求非常合理,但作为一个 Python 爱好者,自己动手实现时才发现没有一开始想象的那么容易。因为这是一个“有状态”的需求,需要程序记住之前的情况:发现设备 down 了,发一次告警即可;如果超过 24 小时仍然没有恢复,才再发一次告警;如果设备状态反复在 up 和 down 之间切换,则无视时间限制,每次切换都发。

最终还是引入数据库,设计表结构,字段,用来保存需要的信息,最后通过程序来逻辑判断,完成这个需求后,我对一个设计良好的数据库表,能够简化程序的复杂性这点有了一点实际的体会。

记录在这,方便下次Control + CV

表结构如下:

| manage_ip (string) | online_or_not (bool) | monitor_or_not (bool) | sent_or_not (bool) | last_sent_time (datetime) |
| --- | --- | --- | --- | --- |
| 设备管理IP | 设备状态 | 是否监控 | 是否已发送告警 | 最后一次告警时间 |
| 1.1.1.1 | 1 | 1 | 0 | 2022-08-05 20:34:09.153643 |

然后分为三个模块:

  • 监控模块 纯粹的监控设备,周期性监控,然后更新状态到数据库
  • 邮件模块 查询状态,做判断,更新对应的表格条目
  • 其他操作模块 同时对告警设备操作,方便扩展功能

关于邮件模块的判断

def to_be_notice(self, db):
    """Queue an alert for the device row ``db`` when its stored state requires one.

    Two cases queue an alert, and both only apply to a device that is down
    and has monitoring enabled:

    * no alert has been sent yet: mark the row as alerted;
    * an alert was already sent, but more than 24 hours ago: send a reminder.

    In either case the row's ``last_sent_time`` is refreshed, the row is
    saved, and ``{'manage_ip': ...}`` is appended to ``self.email_context``.
    Every other state is ignored.
    """
    def _down_and_monitored():
        return db.online_or_not == False and db.monitor_or_not == True

    def _record_alert():
        # Stamp the alert time, persist the row, and queue it for the e-mail.
        db.last_sent_time = datetime.now()
        db.save()
        self.email_context.append({'manage_ip': db.manage_ip})

    # First alert for this outage.
    if _down_and_monitored() and db.sent_or_not == False:
        db.sent_or_not = True
        _record_alert()

    # Reminder: already alerted, still down, last alert older than one day.
    if _down_and_monitored() and db.sent_or_not == True:
        one_day_ago = datetime.now() - timedelta(days=1)
        if one_day_ago > db.last_sent_time:
            _record_alert()
        其他一概不发送,另外周期性监控模块,一旦检测设备为UP,则重置设备状态:sent_or_not = False

完整代码如下:

数据库模块相关脚本,使用的python ORM库 peewee

from peewee import *
import datetime


# SQLite database file that backs the model below.
db = SqliteDatabase('ip_monitor_db.db')

class ip_monitor_db(Model):
    """One row per monitored device, holding its state and alert bookkeeping."""

    # Management address of the device; unique, and used as the lookup key.
    manage_ip = CharField(unique=True)
    # Last observed reachability: True = up, False = down.
    online_or_not = BooleanField(default=True)
    # Whether alerts are enabled for this device.
    monitor_or_not = BooleanField(default=True)
    # Whether an alert has already been sent for the current outage.
    sent_or_not = BooleanField(default=False)
    # Time of the most recent alert; the default is the callable, not a fixed value.
    last_sent_time = DateTimeField(default=datetime.datetime.now)
    

    class Meta:
        database = db # bind this model to the SQLite database defined above ('ip_monitor_db.db')






if __name__ == '__main__':
    # Scratchpad of peewee usage examples. Everything below is commented out,
    # so running this module directly does nothing; uncomment a section to use it.

    # ---- create the database and tables
    # db.connect()
    # db.create_tables([ip_monitor_db,])
 
    # ---- insert rows
    # ip1 = ip_monitor_db(manage_ip = 'baidu.com')
    # ip1.save()
    #ip2 = ip_monitor_db(manage_ip = '114.114.114.114', monitor_or_not = False)
    #ip2.save()
    # ip3 = ip_monitor_db.create(manage_ip = '100.64.0.1')

    # ---- query rows
    # all = ip_monitor_db.select()
    # print(all)
    
    # all1 = ip_monitor_db.get(ip_monitor_db.manage_ip == 'baidu.com')
    # print(all1)
    # all2 = ip_monitor_db.select().where(ip_monitor_db.online_or_not == True)
    # all2 = ip_monitor_db.select().where(ip_monitor_db.online_or_not == True).get() # returns only the first match when several rows qualify
    # print(all2)

    # ---- delete rows
    # all3 = ip_monitor_db.delete().where(ip_monitor_db.monitor_or_not == False).execute()
    # print(all3)


    # ---- update rows
    # all4 = ip_monitor_db.get(ip_monitor_db.manage_ip == 'baidu.com')
    # all4.manage_ip = '114.114.114.114'
    # all4.monitor_or_not = False
    # all4.save()
    # print(all4.manage_ip)
    # print(all4.monitor_or_not)

    # ---- references
    # http://docs.peewee-orm.com/en/latest/peewee/query_examples.html
    # https://github.com/coleifer/peewee
    pass

监控模块脚本:

import threading
import time
import subprocess
from queue import Queue
from sql_orm_peewee import ip_monitor_db

# Number of worker threads started by the __main__ block of this script.
WORD_THREAD = 50
# Query over all device rows; it is iterated once below to fill the queue.
IP_monitor_list = ip_monitor_db.select()
# print(IP_monitor_list)
# Work queue shared by the worker threads; it holds one manage_ip per device.
IP_QUEUE = Queue()
for i in IP_monitor_list:
	IP_QUEUE.put(i.manage_ip)


def live_monitor():
    """Worker loop: ping each address taken from ``IP_QUEUE`` and store the result.

    For every address the matching ``ip_monitor_db`` row is loaded and
    ``online_or_not`` is set from the ping exit code (0 means reachable).
    When the device is reachable, ``sent_or_not`` is also reset to False so
    that the next outage triggers a fresh alert. The function returns once
    the queue is empty.
    """
    # Local import: this script only imports ``Queue`` from the queue module.
    from queue import Empty

    while True:
        try:
            # get_nowait() rather than "empty() then get()": with several
            # workers, another thread can take the last item between the two
            # calls, leaving this thread blocked in get() forever.
            ip = IP_QUEUE.get_nowait()
        except Empty:
            return

        # NOTE(review): "-n 2 -w 5" looks like Windows ping syntax (count /
        # timeout) - confirm before running on another platform.
        # Output is discarded via DEVNULL; an unread PIPE can stall the child.
        res = subprocess.call(
            ['ping', '-n', '2', '-w', '5', ip],
            stdout=subprocess.DEVNULL,
        )
        is_up = res == 0
        print(f'{ip }: {res} {is_up}')

        ip_monitor_item = ip_monitor_db.get(ip_monitor_db.manage_ip == ip)
        ip_monitor_item.online_or_not = is_up
        if is_up:
            # Device is back: re-arm alerting for the next outage.
            ip_monitor_item.sent_or_not = False
        ip_monitor_item.save()



if __name__ == '__main__':
    # Run WORD_THREAD workers over the shared queue and report the elapsed time.
    started_at = time.perf_counter()
    workers = [threading.Thread(target=live_monitor) for _ in range(WORD_THREAD)]
    for worker in workers:
        worker.start()

    # Wait for every worker to drain the queue before reporting.
    for worker in workers:
        worker.join()

    elapsed = time.perf_counter() - started_at
    print()
    print("all done: time", elapsed, "\n")
    

邮件模块脚本

from datetime import datetime, timedelta
import threading
import time
import zmail
from sql_orm_peewee import ip_monitor_db

class notice:
    """Collect devices that need an alert and e-mail one summary message.

    Typical use is ``notice().my_send_mail()``, which scans every
    ``ip_monitor_db`` row, decides which devices need an alert, and sends a
    single e-mail listing them.

    ``email_context`` is never cleared by this class, so use a fresh
    instance for each run.
    """

    # NOTE(review): mail credentials should not live in source control; they
    # are kept here only as backward-compatible defaults. Prefer passing them
    # to the constructor from configuration.
    DEFAULT_SENDER = 'xqw3e2dqw6@qq.com'
    DEFAULT_PASSWORD = 'cigbf231wr3bdde'
    DEFAULT_RECIPIENT = 'xsdd22dd@163.com'
    # Minimum time between two alerts for a device that stays down.
    DEFAULT_RESEND_INTERVAL = timedelta(days=1)

    def __init__(self, sender=DEFAULT_SENDER, password=DEFAULT_PASSWORD,
                 recipient=DEFAULT_RECIPIENT,
                 resend_interval=DEFAULT_RESEND_INTERVAL):
        """Create a notifier.

        Args:
            sender: Account used to log in to the mail server.
            password: Password (or authorization code) for ``sender``.
            recipient: Address that receives the alert e-mail.
            resend_interval: ``timedelta`` after which a still-down device is
                alerted again.
        """
        self.sender = sender
        self.password = password
        self.recipient = recipient
        self.resend_interval = resend_interval
        # One {'manage_ip': ...} dict per device to be listed in the e-mail.
        self.email_context = []

    def to_be_notice(self, db):
        """Queue ``db`` for alerting if its stored state requires it.

        A device is queued when it is down, monitored, and either has not
        been alerted yet or was last alerted more than ``resend_interval``
        ago. The row is updated and saved here, i.e. before the e-mail is
        actually sent by ``my_send_mail``.

        Args:
            db: A device row exposing ``manage_ip``, ``online_or_not``,
                ``monitor_or_not``, ``sent_or_not``, ``last_sent_time`` and
                ``save()``.
        """
        # Devices that are up, or not monitored, never raise an alert.
        if db.online_or_not or not db.monitor_or_not:
            return

        now = datetime.now()
        if not db.sent_or_not:
            # First alert for this outage.
            db.sent_or_not = True
        elif now - self.resend_interval <= db.last_sent_time:
            # Already alerted recently enough; stay quiet.
            return

        db.last_sent_time = now
        db.save()
        self.email_context.append({'manage_ip': db.manage_ip})

    def my_send_mail(self):
        """Scan all devices and send one alert e-mail if any were queued."""
        self.threading_tbn()
        if not self.email_context:
            return

        device_lines = [
            'manage_ip' + '\t' + entry['manage_ip'] + '\n'
            for entry in self.email_context
        ]
        msg = 'Important notice:\n\n\nPlease check below devices\n' + ''.join(device_lines)
        print(msg)
        mail = {
            'subject': 'Important notice: MY Homelab Monitor alert',
            'content_text': msg,
        }
        server = zmail.server(self.sender, self.password)
        server.send_mail(self.recipient, mail)

    def threading_tbn(self):
        """Run ``to_be_notice`` for every device row, one thread per row."""
        start = time.perf_counter()
        ip_monitor_base = ip_monitor_db.select()
        thread_list = [
            threading.Thread(target=self.to_be_notice, args=(item,))
            for item in ip_monitor_base
        ]
        for t in thread_list:
            t.start()
        # join() returns immediately for a finished thread, so no liveness
        # check is needed first.
        for t in thread_list:
            t.join()

        print()
        print("all done: time", time.perf_counter() - start, "\n")


if __name__ == '__main__':
    my_notice = notice()
    my_notice.my_send_mail()
    # my_notice.threading_tbn()

其他操作模块,例如PVE 环境下虚拟机操作,参考单独这个

项目完整代码仓库

https://github.com/sshuangliu/Proxmox-VE-api

没了。

Related Posts

Leave a Reply

Your email address will not be published.