为什么需要自动化运维
# 大批量机器靠人工运维有以下弊端(以配置更新为例):
# 运维时间长: 每一台都需要远程上去更改
# 容易出错: 人工输入命令极其容易出错
# 结果反馈不明显: 需要靠人工自己判断
# 回退麻烦: 出错回退无法保证
# ps: 离职交接的时候极其繁琐
为什么选用Elves
# 安装简单
# 界面管理
# 扩展性极其强悍(使用编程)
# 上手快
# 运维范围广
# 安全管理
Elves安装
# 没有安装docker可以安装(默认的docker版本会比较低),已经有docker跳过这一步
centos:
yum install -y yum-utils device-mapper-persistent-data lvm2
yum-config-manager \
--add-repo \
https://download.docker.com/linux/centos/docker-ce.repo
yum makecache fast
yum install -y --setopt=obsoletes=0 \
docker-ce-18.06.1.ce-3.el7
# 没有安装docker-compose的可以安装,已经有docker-compose跳过这一步
centos: yum install -y docker-compose
# docker安装 Elves
git clone https://github.com/elves-project/docker.git
cd docker
chmod u+x ./control
./control build //下载Base镜像并构建新镜像。 心细的小伙伴可以自己把镜像tag成自己的,再安装避免rebuild。
./control start //调用docker-compose启动各容器,也可以docker-compose up -d.
./control insertsql //插入Elves 数据表结构. 失败了可以手动执行,手动查看标注
./control restart //组件依赖mysql,重启容器刷新程序.
# 为了正常执行,更改ftp目录权限
docker exec -it vsftp bash # 进入容器
chown -R ftpuser:ftpuser /data/ # 修改权限
# 标注
Elves-Dashboard页面端口: 8004
Elves-supervisor页面端口:9092 ; user/password: admin@gyyx.cn/admin
Rabbitmq 页面端口:15672 ; user/password: admin/1q2w3e4r
Nginx 页面端口:80
Ftp 端口:21 ; user/passwd: ftpuser/1q2w3e4r
使用Elves
注册主机
git clone https://github.com/elves-project/agent.git
cd agent
cp conf/cfg.example.json conf/cfg.json
vi conf/cfg.json # 更改配置ip,asset以及服务器的配置
chmod u+x ./control
./control start # 在服务界面上查看添加结果,可能需要几分钟同步心跳。
访问 Elves-supervisor: 本机: 127.0.0.1:9092

elves-supervisor
# 如上图:
# agent列表: 注册主机的列表
# app管理: 我们运维逻辑,zip包管理
# auth管理: app 密钥,使用app的时候需要验证
上传app

app
# 如图:
# 点击右上角 + 添加
# 添加完成后点app右边的编辑图标(橙色小笔)上传app包(zip,制作参考 “app 制作”)
# 上传成功后点击app中间(淡蓝色)选择版本启用
# 启用成功后点击app最右边(深蓝色)选择需要运维的主机
# 之后到auth管理设置app的密钥
# 之后通过命令启用即可自动运维(启动命令参看 “app 启动”)
app (以machineCheck,开发语言选python2 为例)
- machineCheck 检测主机进程的线程,且这里不能使用python3,因为其他机器大概率用的是python2,不然会有版本问题
# 需要特定的目录结构(这里选择实时反馈型)
tree machineCheck
├── appcfg.json
├── app-worker.py
└── machineCheck.py
appcfg.json 配置文件
{
"Processor":{
"Commnet" : "This Is Processor CFG , Do Not Use For Other",
"Addr" : "127.0.0.1",
"Port" : 10010,
"Timeout" : 0
}
}
app-worker.py app的入口
#!/usr/bin/python
# coding=utf-8
# Author: toryzen
#
# app worker入口
import sys
import json
import base64
import os
import traceback
# Put the directory that contains this script on the module search path.
# os.path.abspath(__file__) alone is the script *file*, which is not a valid
# sys.path entry, so take its dirname.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
def agentExec(app,func,jsonParam=""):
    """Import the app module, call one of its methods and print the outcome.

    Args:
        app: name of the module to import; the class of the same name
            inside that module is instantiated.
        func: name of the method to call on that instance.
        jsonParam: optional base64-encoded JSON document; when given it is
            decoded and passed to the method, otherwise "" is passed.

    The method must accept one argument and return a ``(flag, result)``
    pair. Any exception is reported as flag "false" with the traceback as
    the result. Nothing is returned; the outcome goes through elvesPrint.
    """
    flag = "false"
    try:
        param = ""
        if jsonParam != "":
            # Parse the decoded payload directly. Going through repr()
            # escapes quotes, backslashes and non-ASCII bytes, which yields
            # text that json.loads rejects.
            param = json.loads(base64.b64decode(jsonParam).decode("utf-8"))
        agentObj = __import__(app)
        agentClass = getattr(agentObj, app)  # class holding the app logic
        obj = agentClass()
        mtd = getattr(obj, func)  # method requested by the caller
        flag, result = mtd(param)  # the app method therefore needs a param argument
    except Exception:
        flag, result = "false", traceback.format_exc()
    elvesPrint(flag, result)
def elvesPrint(flag,result):
    """Write the flag and result to stdout wrapped in Elves marker tags.

    Both values are converted with str(). The call form of print with a
    single argument produces the same output on Python 2 and Python 3.
    """
    print("<ElvesWFlag>"+str(flag)+"</ElvesWFlag> <ElvesWResult>"+str(result)+"</ElvesWResult>")
if __name__ == '__main__':
    # Expected command line: <app> <func> [base64-encoded JSON param]
    cli_args = sys.argv[1:]
    if len(cli_args) in (2, 3):
        agentExec(*cli_args)
    else:
        elvesPrint("false","param error")
machineCheck.py 实现逻辑
#!/usr/bin/python
# coding: utf-8
import logging
import socket
import traceback
import commands
import os
import json
# The file handler below writes into this directory, so create it first.
if not os.path.isdir('/var/log/elves/machinecheck'):
    os.makedirs('/var/log/elves/machinecheck')

# Handler: timestamped INFO records appended to info.log.
formatter = logging.Formatter('%(asctime)s - %(message)s')
handler = logging.FileHandler('/var/log/elves/machinecheck/info.log')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)

# Module logger used by machineCheck.write_log.
logger = logging.getLogger(__name__)
logger.addHandler(handler)
logger.setLevel(level=logging.INFO)
class machineCheck:
    """Elves app that reports processes running more than 1500 threads.

    The class name has to match the prefix of the uploaded zip package,
    because Elves uses that same name.
    """

    @staticmethod
    def threads_over():
        """Return the ids of processes that own more than 1500 threads.

        Takes the first column of ``ps -xH`` output and counts how often
        each value occurs.
        """
        pids = commands.getoutput("ps -xH|awk '{ print $1}'").split('\n')
        # Single counting pass instead of one list.count() scan per pid.
        thread_counts = {}
        for pid in pids:
            thread_counts[pid] = thread_counts.get(pid, 0) + 1
        return [pid for pid, count in thread_counts.items() if count > 1500]

    @staticmethod
    def write_log(flag, result):
        """Record the outcome in the app log file.

        Any flag other than 'success' or 'error' is logged as a failed
        execution. (Per the original author, the server also stores
        results in its MySQL database.)
        """
        message = 'status: %s, message: %s' % (flag, result)
        if flag not in ('success', 'error'):
            message = 'status: error, message: Function exec failed!'
        logger.info(message)

    def check(self, params=""):
        """Run the thread check and return ``(flag, message)``.

        Args:
            params: unused, but required because the worker always passes
                one argument to the app method.

        Returns:
            ("true", message) when no process exceeds the limit, otherwise
            ("false", message). When a process exceeds the limit its
            command line is appended to /var/log/elves/machinecheck/error
            and a DingTalk alert is sent via curl.
        """
        return_flag, return_result = ('error', 'Internal Error!')
        try:
            over_threads = machineCheck.threads_over()
            if over_threads:
                return_flag, return_result = ('error', 'have process\' threads more than 1500! Check /var/log/elves/machinecheck/error')
                # Only the alert needs the host address, so resolve it here
                # rather than on every (healthy) run.
                ip = socket.gethostbyname(socket.getfqdn(socket.gethostname()))
                over_threads_command = ["pid: " + pid + commands.getoutput('cat /proc/%s/cmdline' % pid) + '\n' for pid in over_threads]
                with open('/var/log/elves/machinecheck/error', 'a') as f:
                    json.dump(over_threads_command, f)  # record process details
                    f.write('\n')  # keep successive appended records separable
                os.system("""curl 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxx' -H 'Content-Type: application/json' -d '{"msgtype": "text","text": {"content": "threads too mush in %s"}}'""" % ip)  # send the DingTalk alert
            else:
                return_flag, return_result = ('success', 'host is healthy')
        except Exception:
            return_flag, return_result = ('error', traceback.format_exc())
        # Log and build the reply after the try block instead of returning
        # from a finally clause, so every path returns a tuple and
        # non-Exception interrupts are not swallowed.
        machineCheck.write_log(return_flag, return_result)
        return_result = 'status: %s, message: %s!' % (return_flag, return_result)
        if return_flag == "success":
            return_flag = "true"
        else:
            return_flag = "false"
        return (return_flag, return_result)
if __name__ == '__main__':
    # Running this file directly does nothing.
    pass
制作zip包
cd machineCheck
zip ../machineCheck_1.0.zip * # 这里1.0是版本,elves会自己获取。
chown 1000:1000 machineCheck_1.0.zip # 让浏览器可以加载。
app 启动
这里还是以machineCheck为例,我写成了python3脚本调用。启动需要访问openapi(统一入口,8080端口),api详情查看 “Elves Api”
签名
# 调用前需要普及个概念,Elves交互时的签名认证
# Elves 签名使用md5签名.
# 拼凑签名字段: 请求路径 + ? + 参数(按字母排序,并且不带sign_type和sign) + auth_key(在服务端9092端口的auth管理界面)
# 签名: hashlib.md5(签名字段.encode('utf-8')).hexdigest() # python中,下面案例详细讲解
运行app进行测试
machineCheck.py
#!/usr/bin/env python3
# coding: utf-8
import os
import json
import time
import hashlib
import requests

# Request settings for the real-time exec API.
server_url = "http://127.0.0.1:8080"
ip = "10.1.9.173"  # IP of the managed host
path = "/api/v2/rt/exec"
func = "check"
param = ""
app = "machineCheck"
auth_id = "0906DDE6518477A8"  # id shown on the auth management page
authkey = "FF6DB1AB43393D3F"  # key shown on the auth management page
sign_type = 'MD5'
def get_sign():
    """Build the query string for the exec request and sign it.

    Returns:
        (params, sign): ``params`` is the query string with its fields in
        alphabetical order and without sign_type/sign; ``sign`` is the hex
        MD5 digest of ``path + '?' + params + authkey``.
    """
    timestamp = int(time.time())
    # Every field, including param and timestamp, is joined with '&'.
    query = "app=%s&auth_id=%s&func=%s&ip=%s&param=%s&timestamp=%s" % (
        app, auth_id, func, ip, json.dumps(param), timestamp)
    sign = hashlib.md5((path + '?' + query + authkey).encode('utf-8')).hexdigest()
    return query, sign
def send_bytes(sign, params):
    """POST the signed query to the exec endpoint and return the response text."""
    signed_query = "?%s&sign_type=MD5&sign=%s" % (params, sign)
    reply = requests.post(server_url + path + signed_query)
    return reply.text
if __name__ == "__main__":
    # Sign the request, send it, and show the server's reply.
    query, signature = get_sign()
    reply_text = send_bytes(signature, query)
    print(reply_text)
使用定时任务来启动
add-cron.py
#!/usr/bin/env python3
import os
import json
import time
import hashlib
import requests
import sys
import json
import subprocess

server_url = "http://127.0.0.1:8080"
ip = None
try:
    ip = sys.argv[1]  # target host IP, supplied on the command line
except IndexError:
    # No argument given: show the usage hint and stop.
    print('Error: exp. python x.py $IP')
    sys.exit(-1)
path = "/api/v2/cron/add"  # API path
func = "check"  # method to run
app = "machineCheck"  # app name
rule = "0 0 */1 * * ?"  # cron rule
mode = "NP"
auth_id = "0906DDE6518477A8"  # id shown on the auth management page
authkey = "FF6DB1AB43393D3F"  # key shown on the auth management page
timestamp = int(time.time())  # request timestamp
sign_type = 'MD5'
def get_sign():
    """Build the query string for the cron-add request and sign it.

    Returns:
        (params, sign): ``params`` is the query string with its fields in
        alphabetical order and without sign_type/sign; ``sign`` is the hex
        MD5 digest of ``path + '?' + params + authkey``.
    """
    # Every field, including timestamp, is joined with '&'.
    query = "app=%s&auth_id=%s&func=%s&ip=%s&mode=%s&rule=%s&timestamp=%s" % (
        app, auth_id, func, ip, mode, rule, timestamp)
    sign = hashlib.md5((path + '?' + query + authkey).encode('utf-8')).hexdigest()
    return query, sign
def send_bytes(sign, params):
    """POST the signed query to the cron-add endpoint and return the response text."""
    return requests.post(
        server_url + path + "?%s&sign_type=MD5&sign=%s" % (params, sign)
    ).text
if __name__ == "__main__":
    params, sign = get_sign()
    message = send_bytes(sign, params)
    print('Add cron:', message)

    # The reply is read as JSON with the new job id under result -> id.
    result = json.loads(message).get('result')
    cron_id = result.get('id') if isinstance(result, dict) else None
    if cron_id is None:
        # Stop here rather than recording and starting a job id of "None".
        print('Error: no cron id in response, cron not started')
        sys.exit(-1)

    # Keep a local record of the cron id and host (per the original author,
    # the server's MySQL database holds one too).
    with open('./cron_id', 'a') as record:
        record.write('%s %s\n' % (cron_id, ip))

    # Start the job with an argument list so nothing is parsed by a shell.
    started = subprocess.run(
        [sys.executable, './start-cron.py', str(cron_id)],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
    )
    print('Start cron:', started.stdout.rstrip('\n'))
start-cron.py
#!/usr/bin/env python3
import os
import json
import time
import hashlib
import requests
import sys

server_url = "http://127.0.0.1:8080"
path = "/api/v2/cron/start"
cron_id = None
try:
    cron_id = sys.argv[1]  # id of the cron job to start, from the command line
except IndexError:
    # No argument given: show the usage hint and stop.
    print('Error: exp. python x.py $cron_id')
    sys.exit(-1)
auth_id = "0906DDE6518477A8"  # id shown on the auth management page
authkey = "FF6DB1AB43393D3F"  # key shown on the auth management page
timestamp = int(time.time())
sign_type = 'MD5'
def get_sign():
    """Build the query string for the cron-start request and sign it.

    Returns:
        (params, sign): ``params`` is the query string with its fields in
        alphabetical order and without sign_type/sign; ``sign`` is the hex
        MD5 digest of ``path + '?' + params + authkey``.
    """
    # Every field, including timestamp, is joined with '&'.
    query = "auth_id=%s&cron_id=%s&timestamp=%s" % (auth_id, cron_id, timestamp)
    sign = hashlib.md5((path + '?' + query + authkey).encode('utf-8')).hexdigest()
    return query, sign
def send_bytes(sign, params):
    """POST the signed query to the cron-start endpoint and return the response text."""
    signed_query = "?%s&sign_type=MD5&sign=%s" % (params, sign)
    return requests.post(server_url + path + signed_query).text
if __name__ == "__main__":
    # Sign the request, send it, and show the server's reply.
    query, signature = get_sign()
    reply_text = send_bytes(signature, query)
    print(reply_text)
标注
# 签名的时候一定要保证参数齐全且顺序排列正确,不然签名验证不通过
# cron规则如果报mysql字段范围错误,可以手动更改mysql字段
alter table task_cron modify column mode varchar(6) not null;
# cron 规则遵循quartz cron,和linux cron有区别,周那里使用? 代替
参考文献
Elves 官网: [https://gy-games.gitbooks.io/elves/module/elves-app.html](https://gy-games.gitbooks.io/elves/module/elves-app.html)
Elves-Api: [https://gy-games.gitbooks.io/elves/api.html](https://gy-games.gitbooks.io/elves/api.html)
Elves-docker: [https://github.com/elves-project/docker](https://github.com/elves-project/docker)