之前项目上,写过一个小功能:
- 周期性监测设备连通性
- 如果down,发送告警邮件(一次)
- 24小时后一直没恢复,再发一次
- 凡是有up-down状态切换,都发
- 发告警的同时,自动干点其它操作,例如重启该设备
这种监控需求非常合理,但自己作为一个 Python 爱好者去实现时,发现没有一开始想象的那么容易。因为这是一个“状态化”的需求,需要程序记住之前的状态:如果发现设备 down 了,发一次告警即可;如果超过24小时仍然没恢复,才再发一次告警;如果设备状态在 up 和 down 之间反复切换,则无视时间限制,每次从 up 变为 down 都会发告警。
最终还是引入数据库,设计表结构,字段,用来保存需要的信息,最后通过程序来逻辑判断,完成这个需求后,我对一个设计良好的数据库表,能够简化程序的复杂性这点有了一点实际的体会。
记录在这,方便下次直接复制粘贴(Ctrl+C / Ctrl+V)。
表结构如下:
manage_ip (string) | online_or_not (bool) | monitor_or_not (bool) | sent_or_not (bool) | last_sent_time (datetime) |
设备管理IP | 设备状态 | 是否监控 | 是否已发送告警 | 最后一次告警时间 |
1.1.1.1 | 1 | 1 | 0 | 2022-08-05 20:34:09.153643 |
然后分为三个模块:
- 监控模块 纯粹的监控设备,周期性监控,然后更新状态到数据库
- 邮件模块 查询状态,做判断,更新对应的表格条目
- 其他操作模块 同时对告警设备操作,方便扩展功能
关于邮件模块的判断
def to_be_notice(self, db):
    """Queue an alert for ``db`` when its stored state calls for one.

    Both alerting cases require the device to be down with monitoring
    enabled:

    * no alert has been sent yet for the current outage, or
    * an alert was already sent, but more than 24 hours ago.

    In either case the row's ``last_sent_time`` is refreshed, the row is
    saved, and ``{'manage_ip': ...}`` is appended to ``self.email_context``.
    Every other combination leaves the row and the list untouched.

    Args:
        db: A device row exposing ``manage_ip``, ``online_or_not``,
            ``monitor_or_not``, ``sent_or_not``, ``last_sent_time`` and
            ``save()``.
    """
    # Devices that are up, or excluded from monitoring, never alert.
    if db.online_or_not or not db.monitor_or_not:
        return

    # Read the clock once so the 24h test and the stored timestamp agree.
    now = datetime.now()
    if not db.sent_or_not:
        # First alert for this outage: remember that it has been sent.
        db.sent_or_not = True
    elif now - timedelta(days=1) <= db.last_sent_time:
        # Already alerted within the last 24 hours: stay quiet.
        return

    db.last_sent_time = now
    db.save()
    self.email_context.append({'manage_ip': db.manage_ip})
其他一概不发送,另外周期性监控模块,一旦检测设备为UP,则重置设备状态:sent_or_not = False
完整代码如下:
数据库模块相关脚本,使用的python ORM库 peewee
from peewee import *
import datetime
# SQLite file that holds the monitoring state of every device.
db = SqliteDatabase('ip_monitor_db.db')


class ip_monitor_db(Model):
    """One row per device, holding its connectivity and alerting state."""

    # Management address of the device; no two rows may share it.
    manage_ip = CharField(unique=True)
    # Latest known connectivity state; new rows start as online.
    online_or_not = BooleanField(default=True)
    # Whether alerts are wanted for this device; enabled by default.
    monitor_or_not = BooleanField(default=True)
    # Whether an alert has already been sent; new rows start as not sent.
    sent_or_not = BooleanField(default=False)
    # Time of the most recent alert; defaults to the row's creation time.
    last_sent_time = DateTimeField(default=datetime.datetime.now)

    class Meta:
        # Bind the model to the SQLite database declared above.
        database = db
if __name__ == '__main__':
    # Usage cheat sheet for the model above. Every example is commented out,
    # so this block currently executes nothing but ``pass``; uncomment the
    # section you need.

    # ---- create the database and its tables
    # db.connect()
    # db.create_tables([ip_monitor_db,])

    # ---- insert rows
    # ip1 = ip_monitor_db(manage_ip = 'baidu.com')
    # ip1.save()
    # ip2 = ip_monitor_db(manage_ip = '114.114.114.114', monitor_or_not = False)
    # ip2.save()
    # ip3 = ip_monitor_db.create(manage_ip = '100.64.0.1')

    # ---- query rows
    # all = ip_monitor_db.select()
    # print(all)
    # all1 = ip_monitor_db.get(ip_monitor_db.manage_ip == 'baidu.com')
    # print(all1)
    # all2 = ip_monitor_db.select().where(ip_monitor_db.online_or_not == True)
    # .get() returns only the first element when several rows match:
    # all2 = ip_monitor_db.select().where(ip_monitor_db.online_or_not == True).get()
    # print(all2)

    # ---- delete rows
    # all3 = ip_monitor_db.delete().where(ip_monitor_db.monitor_or_not == False).execute()
    # print(all3)

    # ---- update rows
    # all4 = ip_monitor_db.get(ip_monitor_db.manage_ip == 'baidu.com')
    # all4.manage_ip = '114.114.114.114'
    # all4.monitor_or_not = False
    # all4.save()
    # print(all4.manage_ip)
    # print(all4.monitor_or_not)

    # ---- reference
    # http://docs.peewee-orm.com/en/latest/peewee/query_examples.html
    # https://github.com/coleifer/peewee
    pass
监控模块脚本:
import threading
import time
import subprocess
from queue import Queue
from sql_orm_peewee import ip_monitor_db
# Number of worker threads the script starts.
WORD_THREAD = 50

# Shared work queue: one entry per row of the table, holding its address.
IP_QUEUE = Queue()
IP_monitor_list = ip_monitor_db.select()
for device in IP_monitor_list:
    IP_QUEUE.put(device.manage_ip)
def live_monitor():
    """Drain ``IP_QUEUE``, ping every address and store the result.

    Meant to run in several threads at once. For each address taken from the
    queue the matching ``ip_monitor_db`` row is loaded and saved with
    ``online_or_not`` set to the ping result. When the device answers,
    ``sent_or_not`` is also reset to ``False``. The function returns once the
    queue is empty.
    """
    # Local import so this function brings its own dependency into scope.
    from queue import Empty

    while True:
        # get_nowait() replaces the empty()/get() pair: with several workers
        # another thread can take the last item between those two calls,
        # leaving this worker blocked in get() forever.
        try:
            ip = IP_QUEUE.get_nowait()
        except Empty:
            return

        # The exit status is 0 when the host answered, non-zero otherwise.
        # Output goes to DEVNULL because nothing ever reads it; an unread
        # PIPE can fill up and stall the child process.
        # NOTE(review): "-n 2 -w 5" looks like Windows ping syntax - confirm
        # the target platform and the intended timeout.
        res = subprocess.call(
            ['ping', '-n', '2', '-w', '5', ip],
            stdout=subprocess.DEVNULL,
        )
        is_up = res == 0
        print(f'{ip}: {res} {is_up}')

        ip_monitor_item = ip_monitor_db.get(ip_monitor_db.manage_ip == ip)
        ip_monitor_item.online_or_not = is_up
        if is_up:
            # Device is reachable: clear the "alert already sent" marker.
            ip_monitor_item.sent_or_not = False
        ip_monitor_item.save()
if __name__ == '__main__':
    # Fan WORD_THREAD workers out over the shared queue and time the run.
    started_at = time.perf_counter()
    workers = []
    for _ in range(WORD_THREAD):
        worker = threading.Thread(target=live_monitor)
        worker.start()
        workers.append(worker)

    # Wait until every worker has finished.
    for worker in workers:
        worker.join()

    print()
    print("all done: time", time.perf_counter() - started_at, "\n")
邮件模块脚本
from datetime import datetime, timedelta
import threading
import time
import zmail
from sql_orm_peewee import ip_monitor_db
class notice:
    """Collect the devices that need an alert and e-mail a summary of them."""

    def __init__(self):
        # One {'manage_ip': ...} dict per device that needs an alert.
        self.email_context = []

    def to_be_notice(self, db):
        """Queue an alert for ``db`` when its stored state calls for one.

        Both alerting cases require the device to be down with monitoring
        enabled:

        * no alert has been sent yet for the current outage, or
        * an alert was already sent, but more than 24 hours ago.

        In either case the row's ``last_sent_time`` is refreshed, the row is
        saved, and ``{'manage_ip': ...}`` is appended to
        ``self.email_context``. Every other combination changes nothing.

        Args:
            db: A device row exposing ``manage_ip``, ``online_or_not``,
                ``monitor_or_not``, ``sent_or_not``, ``last_sent_time`` and
                ``save()``.
        """
        # Devices that are up, or excluded from monitoring, never alert.
        if db.online_or_not or not db.monitor_or_not:
            return

        # Read the clock once so the 24h test and the stored timestamp agree.
        now = datetime.now()
        if not db.sent_or_not:
            # First alert for this outage: remember that it has been sent.
            db.sent_or_not = True
        elif now - timedelta(days=1) <= db.last_sent_time:
            # Already alerted within the last 24 hours: stay quiet.
            return

        db.last_sent_time = now
        db.save()
        self.email_context.append({'manage_ip': db.manage_ip})

    def my_send_mail(self):
        """Evaluate every device, then send one e-mail listing those to check.

        Nothing is printed or sent when no device qualified for an alert.
        """
        self.threading_tbn()
        if not self.email_context:
            return

        device_lines = ''.join(
            f"manage_ip\t{entry['manage_ip']}\n" for entry in self.email_context
        )
        msg = 'Important notice:\n\n\nPlease check below devices\n' + device_lines
        print(msg)

        mail = {
            'subject': 'Important notice: MY Homelab Monitor alert',
            'content_text': msg,
            # Attachments can be added too; absolute paths work best:
            # 'attachments': ['/Users/zyh/Documents/example.zip', '/root/1.jpg'],
        }
        # NOTE(review): the account and password are hard-coded in the source;
        # consider loading them from configuration or the environment.
        server = zmail.server('xqw3e2dqw6@qq.com', 'cigbf231wr3bdde')
        server.send_mail('xsdd22dd@163.com', mail)
        # Several recipients plus cc:
        # server.send_mail(['foo@163.com', 'foo@126.com'], mail, cc=['bar@163.com'])

    def threading_tbn(self):
        """Run ``to_be_notice`` for every ``ip_monitor_db`` row, one thread per row.

        Blocks until all threads have finished, then prints the elapsed time.
        """
        start = time.perf_counter()
        workers = [
            threading.Thread(target=self.to_be_notice, args=(item,))
            for item in ip_monitor_db.select()
        ]
        for worker in workers:
            worker.start()
        # join() returns immediately for a thread that has already finished,
        # so no is_alive() check is needed first.
        for worker in workers:
            worker.join()
        print()
        print("all done: time", time.perf_counter() - start, "\n")
if __name__ == '__main__':
    # Script entry point: evaluate every device once and mail any alerts.
    # (Call threading_tbn() on its own to evaluate without sending mail.)
    notifier = notice()
    notifier.my_send_mail()
其他操作模块,例如PVE 环境下虚拟机操作,参考单独这个
项目完整代码仓库
https://github.com/sshuangliu/Proxmox-VE-api
没了。