'''
This is an example script for performing a rolling upgrade of a search head cluster
and should not be applied to a production instance without editing to suit
your environment and testing extensively.
Usage of this script:
python shc_upgrade_template.py -u uri_of_member -d directory_of_splunk_home -t timeout_before_shutdown -n new_splunk_package -r remote_ssh_user -s new_splunk_version_number --deployer yes/no --auth user:password
There are some preconditions to run this script:
1. The upgrade is happening from post-NightLight (including NightLight) to a higher version.
2. All the bits/binaries that are needed during the upgrade should already be in place on the machines running Splunk.
3. The user running this script should have set up keyless ssh login to the machines running Splunk.
Workflow of this script:
1. check SHC status through REST "/services/shcluster/status"
2. if SHC status is not healthy, exit the script; otherwise
3. put SHC in upgrade state through REST "/services/shcluster/captain/control/default/upgrade-init"
4. for each node in the SHC
4.1 put the node in manual detention through REST "/services/shcluster/member/control/control/set_manual_detention"
4.2 check the existing search jobs' status through REST "/services/shcluster/member/info"
4.3 if there are no existing historical search jobs, or the timeout (configurable, default 180 seconds) expires, start the upgrade of the node
4.3.1 stop the node by "splunk stop"
4.3.2 back up the existing splunk installation (optional)
4.3.3 untar the new splunk package
4.3.4 start splunk by "splunk start"
4.4 turn off the manual detention through REST "/services/shcluster/member/control/control/set_manual_detention"
5. finalize the upgrade through REST "/services/shcluster/captain/control/default/upgrade-finalize"
'''
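
# For reference: the first REST call this script automates (workflow step 1 above) can be
# exercised by hand before running the script, assuming default admin credentials and a
# member listening on https://localhost:8089 (both are assumptions, adjust for your setup):
#
#   curl -k -u admin:changeme "https://localhost:8089/services/shcluster/status?advanced=1&output_mode=json"
#
# The only third-party dependency of this script is the 'requests' package (pip install requests).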
import logging as logger
import sys
import os
import requests
import time
import argparse
import subprocess
if sys.version_info >= (3, 0):
    import urllib.parse
    urlparse = urllib.parse.urlparse
else:
    from urlparse import urlparse
from distutils.version import StrictVersion
import distutils.util
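
# All REST calls below pass verify=False, so the requests library will emit an
# InsecureRequestWarning for every call. If that noise is unwanted, one way to silence
# it (optional, not required for the upgrade itself) is:
#   import urllib3
#   urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)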

def log_status_exit(shc_logger, status, message):
    shc_logger.error(message)
    if status == 401:
        shc_logger.error("Authentication failure: must pass valid credentials with request.")
    elif status == 500:
        shc_logger.error("Internal server error.")
    sys.exit(message)

if __name__ == '__main__':
    # default settings
    USERNAME = "admin"
    PASSWORD = "changeme"
    SSHUSER = "root"
    # rest api used
    SHCLUSTER_STATUS_REST = "/services/shcluster/status?output_mode=json"
    UPGRADE_INIT_REST = "/services/shcluster/captain/control/default/upgrade-init?output_mode=json"
    UPGRADE_FINALIZE_REST = "/services/shcluster/captain/control/default/upgrade-finalize?output_mode=json"
    MANUAL_DETENTION_REST = "/services/shcluster/member/control/control/set_manual_detention?output_mode=json"
    MEMBER_INFO_REST = "/services/shcluster/member/info?output_mode=json"
    SHCLUSTER_CONFIG_REST = "/services/shcluster/config?output_mode=json"
    KVSTORE_STATUS_REST = "/services/kvstore/status?output_mode=json"
    TIMEOUT = 180
    TIMEOUT_INTERVAL = 5
    SHC_UPGRADE_BASE_VERSION = "7.1.0"
    # configure the logger
    logger.basicConfig(filename='shc_upgrade.log', level=logger.INFO)
    example_text = '''example:
python shc_upgrade_template.py -u https://example.com:8089 -d /home/user/splunk -t 180 -n /opt/newsplunk.tar.gz -r splunk -s 7.2.2 --auth admin:changed
'''
    parser = argparse.ArgumentParser(description='SHC upgrade script', epilog=example_text,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('-u', '--uri_of_member', required=True, action="store", type=str, help="Specify the mgmt_uri of any member in SHC")
    parser.add_argument('-d', '--directory_of_splunk_home', required=True, action="store", type=str, help="Specify the directory of splunk home")
    parser.add_argument('-n', '--new_splunk_package', required=True, action="store", type=str, help="Specify the full path for the new splunk package")
    parser.add_argument('-t', '--timeout_before_shutdown', action="store", type=int, help="Specify the timeout in seconds this script uses before shutting down splunk. If -1 is given, the script will wait for all non-realtime searches to be completed before shutting down splunk")
    parser.add_argument('-r', '--remote_ssh_user', action="store", type=str, help="Specify the user name used to access the remote machines running SHC members through ssh")
    parser.add_argument('-s', '--splunk_new_version', required=True, action="store", type=str, help="Specify the version of the new splunk package")
    parser.add_argument('-b', '--backup_directory', action="store", type=str, help="Specify the backup directory if the user wants to back up the existing splunk before the upgrade happens")
    parser.add_argument('--deployer', action="store", type=str, help="Specify if the deployer needs to be upgraded")
    parser.add_argument('-a', '--auth', action="store", type=str, help="Specify the username and password for the splunk account")
    argList = parser.parse_args()
    # check for username and password
    if argList.auth:
        newauth = argList.auth.split(':')
        if len(newauth) != 2:
            logger.error("Expected argument in 'username:password' format")
            sys.exit("Expected argument in 'username:password' format")
        USERNAME = newauth[0]
        PASSWORD = newauth[1]
    # check ssh login name
    if argList.remote_ssh_user:
        SSHUSER = argList.remote_ssh_user
    # get shc status
    statusUri = argList.uri_of_member + SHCLUSTER_STATUS_REST
    logger.info('calling shc status at: %s', statusUri)
    rStatus = requests.get(
        statusUri, params={'advanced': 1},
        auth=(USERNAME, PASSWORD), verify=False)
    if rStatus.status_code != 200:
        message = "Error during getting SHC status"
        log_status_exit(logger, rStatus.status_code, message)
    rStatusJson = rStatus.json()
    # check shc status
    captainInfo = {}
    peerDictOrig = {}
    cluster_master_version = None
    try:
        captainInfo = rStatusJson['entry'][0]['content']['captain']
        if not captainInfo["dynamic_captain"]:
            raise ValueError("SHC does not have a dynamic captain, "
                             "please fix this before proceeding with rolling upgrade")
        if not captainInfo["stable_captain"]:
            raise ValueError("SHC does not have a stable captain, "
                             "please fix this before proceeding with rolling upgrade")
        if not captainInfo["service_ready_flag"]:
            raise ValueError("SHC captain is not ready to provide service, "
                             "please fix this before proceeding with rolling upgrade")
        if captainInfo["rolling_restart_flag"]:
            raise ValueError("SHC is in rolling restart, "
                             "please fix this before proceeding with rolling upgrade")
        if captainInfo["rolling_upgrade_flag"]:
            raise ValueError("SHC is already in rolling upgrade: either an existing rolling "
                             "upgrade is still in progress, so please wait for it to finish, or a "
                             "previous run of this script failed midway, so finalize that upgrade. "
                             "Please fix this issue before proceeding with rolling upgrade")
        if captainInfo["max_failures_to_keep_majority"] <= 0:
            raise ValueError("max_failures_to_keep_majority should be larger than 0. "
                             "Run 'splunk show shcluster-status' to see which search head does not have the status Up. "
                             "Please fix this before proceeding with rolling upgrade")
        # version checking
        if StrictVersion(argList.splunk_new_version) <= StrictVersion(SHC_UPGRADE_BASE_VERSION):
            raise ValueError("the new splunk version number should be larger than %s" % (SHC_UPGRADE_BASE_VERSION))
        cluster_master = rStatusJson['entry'][0]['content']['cluster_master']
        if cluster_master:
            for master in cluster_master:
                version = cluster_master[master]['splunk_version']
                if cluster_master_version is not None:
                    if StrictVersion(version) < StrictVersion(cluster_master_version):
                        cluster_master_version = version
                else:
                    cluster_master_version = version
            if StrictVersion(cluster_master_version) < StrictVersion(argList.splunk_new_version):
                raise ValueError("cluster_master version %s is lower than the new SHC version %s" % (cluster_master_version, argList.splunk_new_version))
        # gather the nodes that need to be upgraded
        peerDictOrig = rStatusJson['entry'][0]['content']['peers']
        if len(peerDictOrig) == 0:
            raise ValueError("SHC has no members")
        delete_list = []
        for peer in peerDictOrig:
            if peerDictOrig[peer]['out_of_sync_node']:
                raise ValueError("SHC member %s out_of_sync_node is true" % peerDictOrig[peer]['mgmt_uri'])
            kvstore_status_uri = peerDictOrig[peer]['mgmt_uri'] + KVSTORE_STATUS_REST
            kvstore_status = requests.get(kvstore_status_uri, auth=(USERNAME, PASSWORD), verify=False)
            if kvstore_status.status_code != 200:
                raise ValueError("Can't get KVStore status for SHC member %s" % peerDictOrig[peer]['mgmt_uri'])
            if kvstore_status.json()['entry'][0]['content']['current']['status'] != "ready":
                raise ValueError("KVStore on SHC member %s is not ready, please fix "
                                 "this before proceeding with rolling upgrade" % peerDictOrig[peer]['mgmt_uri'])
            if "splunk_version" in peerDictOrig[peer]:
                if StrictVersion(peerDictOrig[peer]["splunk_version"]) >= StrictVersion(argList.splunk_new_version):
                    delete_list.append(peer)
            else:
                raise ValueError("SHC member %s version number is less than %s" % (peerDictOrig[peer]['mgmt_uri'], SHC_UPGRADE_BASE_VERSION))
        for peer in delete_list:
            peerDictOrig.pop(peer, None)
    except ValueError as err:
        logger.error(err.args)
        sys.exit(err.args)
    peerDictPreferedCaptain = {}
    for peer in peerDictOrig:
        if peerDictOrig[peer]["preferred_captain"]:
            peerDictPreferedCaptain[peer] = peerDictOrig[peer]
    logger.info('The complete member list in shc: %s', peerDictOrig)
    logger.info('The list of members who have preferred_captain set: %s', peerDictPreferedCaptain)
    # signal the start of upgrade
    logger.info("Starting upgrade of the search head cluster")
    initUri = argList.uri_of_member + UPGRADE_INIT_REST
    logger.info("initialize the start of upgrade: %s", initUri)
    rInit = requests.post(
        initUri,
        auth=(USERNAME, PASSWORD), verify=False)
    if rInit.status_code != 200:
        message = "Error during upgrade-init"
        logger.error(message)
        sys.exit(message)
    # default timeout is 180 seconds, user can override it with "-t timeout_before_shutdown"
    if argList.timeout_before_shutdown:
        TIMEOUT = argList.timeout_before_shutdown
    first = True
    try:
        while len(peerDictOrig):
            # get one peer, avoid the captain
            candidate = ""
            selected = False
            # try to pick a member with preferred_captain set
            for peer in peerDictPreferedCaptain:
                if peerDictPreferedCaptain[peer]["mgmt_uri"] == captainInfo["mgmt_uri"]:
                    continue
                candidate = peer
                selected = True
                break
            if not selected:
                for peer in peerDictOrig:
                    if peerDictOrig[peer]["mgmt_uri"] == captainInfo["mgmt_uri"]:
                        continue
                    candidate = peer
                    selected = True
                    break
            if not selected:
                errorMessage = ("Upgrade script can't pick a member to upgrade while there are still some upgrade candidates available.\n"
                                "This usually happens when the candidate is holding the captaincy and not transferring it to an upgraded member.\n"
                                "The root reason might be that the captain is already running a higher version of Splunk, or the SHC is in an unhealthy state.")
                raise ValueError(errorMessage)
            logger.info("selected member %s to upgrade", peerDictOrig[candidate]["label"])
            peer_mgmt_uri = peerDictOrig[candidate]['mgmt_uri']
            detentionUri = peer_mgmt_uri + MANUAL_DETENTION_REST
            logger.info("set %s to manual detention", peer_mgmt_uri)
            rDetention = requests.post(
                detentionUri,
                params={'manual_detention': 'on'}, auth=(USERNAME, PASSWORD), verify=False)
            if rDetention.status_code != 200:
                raise ValueError("Error during setting manual detention")
            infoUri = peer_mgmt_uri + MEMBER_INFO_REST
            timeOut = TIMEOUT
            while True:
                # query status of the node
                logger.info("get member information from %s", infoUri)
                rInfo = requests.get(
                    infoUri,
                    auth=(USERNAME, PASSWORD), verify=False)
                if rInfo.status_code != 200:
                    raise ValueError("Error during getting the member information")
                rInfoJson = rInfo.json()
                status = rInfoJson['entry'][0]['content']['status']
                active_historical_search_count = rInfoJson['entry'][0]['content']['active_historical_search_count']
                if status == 'ManualDetention' and active_historical_search_count == 0:
                    break
                time.sleep(TIMEOUT_INTERVAL)
                timeOut = timeOut - TIMEOUT_INTERVAL
                if TIMEOUT != -1 and timeOut < 0:
                    break
            # check KVStore status on the member being upgraded
            kvstore_status_uri = peer_mgmt_uri + KVSTORE_STATUS_REST
            start = time.time()
            kvstorestatusInfo = ''
            while kvstorestatusInfo != 'ready' and time.time() - start < 240:
                kvstore_status = requests.get(kvstore_status_uri,
                                              auth=(USERNAME, PASSWORD), verify=False)
                if kvstore_status.status_code != 200:
                    raise ValueError("Can't get KVStore status for SHC member %s" %
                                     peer_mgmt_uri)
                else:
                    rKvstoreJson = kvstore_status.json()
                    kvstorestatusInfo = rKvstoreJson['entry'][0]['content']['current']['status']
                if kvstorestatusInfo != 'ready':
                    time.sleep(60)
            if kvstorestatusInfo != 'ready':
                raise ValueError("KVStore status is still not ready")
            uriResult = urlparse(peer_mgmt_uri)
            splunkcommand = argList.directory_of_splunk_home + "/bin/splunk stop"
            sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand]
            logger.info("stop splunk %s", sshcommand)
            sshprocess = subprocess.Popen(sshcommand,
                                          shell=False,
                                          stdout=subprocess.PIPE,
                                          stderr=subprocess.PIPE)
            sshresult, ssherror = sshprocess.communicate()
            if sshprocess.returncode:
                raise ValueError("Error during stopping splunk: %s" % ssherror)
            # check if we need to back up the existing installation
            if argList.backup_directory:
                backupcommand = "cp -rf " + argList.directory_of_splunk_home + " " + argList.backup_directory
                sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, backupcommand]
                logger.info("back up splunk %s", sshcommand)
                sshprocess = subprocess.Popen(sshcommand,
                                              shell=False,
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE)
                sshresult, ssherror = sshprocess.communicate()
                if sshprocess.returncode:
                    raise ValueError("Error during backing up splunk: %s" % ssherror)
installcommand = "tar -zxvf " + argList.new_splunk_package + " -C " + os.path.dirname(argList.directory_of_splunk_home)
sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, installcommand]
logger.info("upgrade splunk %s", sshcommand)
sshprocess = subprocess.Popen(sshcommand,
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
sshresult, ssherror = sshprocess.communicate()
if sshprocess.returncode:
raise ValueError("Error during upgrading splunk: %s" % ssherror)
splunkcommand = argList.directory_of_splunk_home + "/bin/splunk start --accept-license --answer-yes"
sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand]
logger.info("start splunk %s", sshcommand)
sshprocess = subprocess.Popen(sshcommand,
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
sshresult, ssherror = sshprocess.communicate()
if sshprocess.returncode:
raise ValueError("Error during starting splunk: %s" % ssherror)
            # turn off manual detention
            logger.info("turn off manual detention for %s", peer_mgmt_uri)
            rDetention = requests.post(
                detentionUri,
                params={'manual_detention': 'off'}, auth=(USERNAME, PASSWORD), verify=False)
            if rDetention.status_code != 200:
                raise ValueError("Error during turning off manual detention")
            # post processing after the node is upgraded
            logger.info("waiting for the shc to be stable ...")
            time.sleep(60)
            peerDictOrig.pop(candidate, None)
            peerDictPreferedCaptain.pop(candidate, None)
            # update for possible new captain
            statusUri = argList.uri_of_member + SHCLUSTER_STATUS_REST
            logger.info('calling shc status at: %s', statusUri)
            rStatus = requests.get(
                statusUri, params={'advanced': 1},
                auth=(USERNAME, PASSWORD), verify=False)
            if rStatus.status_code != 200:
                raise ValueError("Error during getting SHC status")
            rStatusJson = rStatus.json()
            # update the captain
            captainInfo = rStatusJson['entry'][0]['content']['captain']
        # check if the deployer needs to be upgraded
        if argList.deployer and distutils.util.strtobool(argList.deployer):
            configUri = captainInfo['mgmt_uri'] + SHCLUSTER_CONFIG_REST
            logger.info('getting deployer information at: %s', configUri)
            rStatus = requests.get(
                configUri, auth=(USERNAME, PASSWORD), verify=False)
            if rStatus.status_code != 200:
                raise ValueError("Error during getting deployer information")
            rStatusJson = rStatus.json()
            deployerInfo = rStatusJson['entry'][0]['content']['conf_deploy_fetch_url']
            if deployerInfo:
                uriResult = urlparse(deployerInfo)
                splunkcommand = argList.directory_of_splunk_home + "/bin/splunk stop"
                sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand]
                logger.info("stop splunk %s", sshcommand)
                sshprocess = subprocess.Popen(sshcommand,
                                              shell=False,
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE)
                sshresult, ssherror = sshprocess.communicate()
                if sshprocess.returncode:
                    raise ValueError("Error during stopping deployer: %s" % ssherror)
                installcommand = "tar -zxvf " + argList.new_splunk_package + " -C " + os.path.dirname(
                    argList.directory_of_splunk_home)
                sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, installcommand]
                logger.info("upgrade splunk %s", sshcommand)
                sshprocess = subprocess.Popen(sshcommand,
                                              shell=False,
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE)
                sshresult, ssherror = sshprocess.communicate()
                if sshprocess.returncode:
                    raise ValueError("Error during upgrading deployer: %s" % ssherror)
                splunkcommand = argList.directory_of_splunk_home + "/bin/splunk start --accept-license --answer-yes"
                sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand]
                logger.info("start splunk %s", sshcommand)
                sshprocess = subprocess.Popen(sshcommand,
                                              shell=False,
                                              stdout=subprocess.PIPE,
                                              stderr=subprocess.PIPE)
                sshresult, ssherror = sshprocess.communicate()
                if sshprocess.returncode:
                    raise ValueError("Error during starting deployer: %s" % ssherror)
    except ValueError as err:
        logger.error(err.args)
        sys.exit(err.args)
    finally:
        finalizeUri = argList.uri_of_member + UPGRADE_FINALIZE_REST
        logger.info('finalize the shc upgrade %s', finalizeUri)
        rFinalize = requests.post(
            finalizeUri,
            auth=(USERNAME, PASSWORD), verify=False)
    print('SHC is upgraded successfully')
    logger.info('SHC is upgraded successfully')
    sys.exit(0)
