''' This is an example script for performing rolling upgrade in searchhead cluster and should not be applied to a production instance without editing to suit your environment and testing extensively. Usage of this script: python shc_upgrade_template.py -u uri_of_member -d directory_of_splunk_home -t timeout_before_shutdown -n new_splunk_package -r remote_ssh_user -s new_splunk_version_number --deployer yes/no --auth user:password There are some preconditions to run this script: 1. The upgrade is happening between post-NightLight (including NightLight) to a higher version. 2. All the bits/binaries that are needed during the upgrading should have been put into place on the machines running Splunks. 3. The user running this script should have set up the keyless ssh login onto the machines running Splunks. Workflow of this script: 1. check SHC status through REST "/services/shcluster/status" 2. if SHC status is not healthy, exit the script. otherwise 3. put SHC in upgrade state through REST "/services/shcluster/captain/control/default/upgrade-init" 4. for each node in the SHC 4.1 put the node in manual detention through REST "/services/shcluster/member/control/control/set_manual_detention" 4.2 check the existing search jobs' status through REST "/services/shcluster/member/info" 4.3 if there is no existing historical search jobs, or the timeout (configurable, and default to 180 seconds) expires, start the upgrade of the node 4.3.1 stop the node by "splunk stop" 4.3.2 back up the existing splunk installation (optional) 4.3.3 untar the new splunk package 4.3.4 start splunk by "splunk start"? 4.4 turn off the manual detention through REST "/services/shcluster/member/control/control/set_manual_detention" 5. finalize the upgrade through REST "/services/shcluster/captain/control/default/upgrade-finalize ''' import logging as logger import sys import os import requests import time import argparse import subprocess if sys.version_info >= (3, 0): import urllib.parse urlparse = urllib.parse.urlparse else: from urlparse import urlparse from distutils.version import StrictVersion import distutils.util def log_status_exit(shc_logger, status, message): shc_logger.error(message) if status == 401: shc_logger.error("Authentication failure: must pass valid credentials with request.") else: if status == 500: shc_logger.error("Internal server error.") sys.exit(message) if __name__ == '__main__': # default settings USERNAME="admin" PASSWORD="changme" SSHUSER="root" # rest api used SHCLUSTER_STATUS_REST = "/services/shcluster/status?output_mode=json" UPGRADE_INIT_REST = "/services/shcluster/captain/control/default/upgrade-init?output_mode=json" UPGRADE_FINALIZE_REST = "/services/shcluster/captain/control/default/upgrade-finalize?output_mode=json" MANUAL_DETENTION_REST = "/services/shcluster/member/control/control/set_manual_detention?output_mode=json" MEMBER_INFO_REST = "/services/shcluster/member/info?output_mode=json" SHCLUSTER_CONFIG_REST = "/services/shcluster/config?output_mode=json" KVSTORE_STATUS_REST = "/services/kvstore/status?output_mode=json" TIMEOUT = 180 TIMEOUT_INTERVAL = 5 SHC_UPGRADE_BASE_VERSION = "7.1.0" #config the logger logger.basicConfig(filename='shc_upgrade.log', level=logger.INFO) example_text = '''example: python shc_upgrade_template.py -u https://example.com:8089 -d /home/user/splunk -t 180 -n /opt/newsplunk.tar.gz -r splunk -s 7.2.2 --auth admin:changed ''' parser = argparse.ArgumentParser(description='SHC upgrade script', epilog=example_text, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('-u', '--uri_of_member', required=True, action="store", type=str, help="Specify the mgmt_uri of any member in SHC") parser.add_argument('-d', '--directory_of_splunk_home', required=True, action="store", type=str, help="Specify the directory of splunk home") parser.add_argument('-n', '--new_splunk_package', required=True, action="store", type=str, help="Specify the full path for the new splunk package") parser.add_argument('-t', '--timeout_before_shutdown', action="store", type=int, help="Specify the timeout in seconds this script uses before shutting down splunk. If -1 is given, the script will wait for all non-realtime searches to be completed before shutting down splunk") parser.add_argument('-r', '--remote_ssh_user', action="store", type=str, help="Specify the user name used to access remote machines through ssh running SHC") parser.add_argument('-s', '--splunk_new_version', required=True, action="store", type=str, help="Specify the version of the new splunk package") parser.add_argument('-b', '--backup_directory', action="store", type=str, help="Specify the backup directory if user wants to back up existing splunk before the upgrade happens") parser.add_argument('--deployer', action="store", type=str, help="Specify if the deployer needs to be upgraded") parser.add_argument('-a', '--auth', action="store", type=str, help="Specify the username and password for the splunk account") argList = parser.parse_args() # check for username and password if argList.auth: newauth = argList.auth.split(':') if len(newauth) != 2: logger.error("Expected argument in 'username:password' format") sys.exit("Expected argument in 'username:password' format") USERNAME = newauth[0] PASSWORD = newauth[1] #check ssh login name if argList.remote_ssh_user: SSHUSER = argList.remote_ssh_user # get shc status statusUri = argList.uri_of_member + SHCLUSTER_STATUS_REST logger.info('calling shc status at: %s', statusUri) rStatus = requests.get( statusUri, params = {'advanced' : 1}, auth=(USERNAME, PASSWORD), verify=False) if rStatus.status_code != 200: message = "Error during getting SHC status" log_status_exit(logger, rStatus.status_code, message) rStatusJson = rStatus.json() # check shc status captainInfo = {} peerDictOrig = {} cluster_master_version = None try: captainInfo = rStatusJson['entry'][0]['content']['captain'] if not captainInfo["dynamic_captain"]: raise ValueError("SHC does not have a dynamic captain" "please fix this before proceeding with rolling upgrade") if not captainInfo["stable_captain"]: raise ValueError("SHC does not have a stable captain " "please fix this before proceeding with rolling upgrade") if not captainInfo["service_ready_flag"]: raise ValueError("SHC captain is not ready to provide service " "please fix this before proceeding with rolling upgrade") if captainInfo["rolling_restart_flag"]: raise ValueError("SHC is in rolling restart " "please fix this before proceeding with rolling upgrade") if captainInfo["rolling_upgrade_flag"]: raise ValueError("SHC is already in rolling upgrade , the reason for the failure " "may be an already existing rolling upgrade is going on, please wait for it to " "finish or may be the script failed in between, so finalize the script " "please fix this issue before proceeding with rolling upgrade") if captainInfo["max_failures_to_keep_majority"] <= 0: raise ValueError("max_failures_to_keep_majority should be larger than 0 ." "Run show shcluster-status to know which search head does not have the status Up." "Please fix this before proceeding with rolling upgrade") # version checking if StrictVersion(argList.splunk_new_version) <= StrictVersion(SHC_UPGRADE_BASE_VERSION): raise ValueError("the new splunk version number should be larger than %s" % (SHC_UPGRADE_BASE_VERSION)) cluster_master = rStatusJson['entry'][0]['content']['cluster_master'] if cluster_master: for master in cluster_master: version = cluster_master[master]['splunk_version'] if cluster_master_version is not None: if StrictVersion(version) < StrictVersion(cluster_master_version): cluster_master_version = version else: cluster_master_version = version if StrictVersion(cluster_master_version) < StrictVersion(argList.splunk_new_version): raise ValueError("cluster_master version %s is lower than the new SHC version %s" % (cluster_master_version, argList.splunk_new_version)) # gather the nodes that are needed to be upgraded peerDictOrig = rStatusJson['entry'][0]['content']['peers'] if len(peerDictOrig) == 0: raise ValueError("SHC has no members") delete_list = [] for peer in peerDictOrig: if peerDictOrig[peer]['out_of_sync_node']: raise ValueError("SHC member %s out_of_sync_node is true" % peerDictOrig[peer]['mgmt_uri']) kvstore_status_uri = peerDictOrig[peer]['mgmt_uri'] + KVSTORE_STATUS_REST kvstore_status = requests.get(kvstore_status_uri, auth=(USERNAME, PASSWORD), verify=False) if kvstore_status.status_code != 200: raise ValueError("Can't get KVStore status for SHC member %s" % peerDictOrig[peer]['mgmt_uri']) if kvstore_status.json()['entry'][0]['content']['current']['status'] != "ready": raise ValueError("KVStore on SHC member %s is not ready, please fix " "this before proceeding with rolling upgrade" % peerDictOrig[peer]['mgmt_uri']) if "splunk_version" in peerDictOrig[peer]: if StrictVersion(peerDictOrig[peer]["splunk_version"]) >= StrictVersion(argList.splunk_new_version): delete_list.append(peer) else: raise ValueError("SHC member %s version number is less than %s" % (peerDictOrig[peer]['mgmt_uri'], SHC_UPGRADE_BASE_VERSION)) for peer in delete_list: peerDictOrig.pop(peer, None) except ValueError as err: logger.error(err.args) sys.exit(err.args) peerDictPreferedCaptain = {} for peer in peerDictOrig: if peerDictOrig[peer]["preferred_captain"]: peerDictPreferedCaptain[peer] = peerDictOrig[peer] logger.info('The complete member list in shc: %s', peerDictOrig) logger.info('The list of members who have preferred_captain set: %s', peerDictPreferedCaptain) # signal the start of upgrade logger.info("Starting upgrade of the search head cluster") initUri = argList.uri_of_member + UPGRADE_INIT_REST logger.info("initialize the start of upgrade: %s", initUri) rInit = requests.post( initUri, auth=(USERNAME, PASSWORD), verify=False) if rInit.status_code != 200: message = "Error during upgrade-init" logger.error(message) sys.exit(message) # default timeout is 180 seconds, user can override it with "-t timeout_before_shutdown" if argList.timeout_before_shutdown: TIMEOUT = argList.timeout_before_shutdown first = True try: while len(peerDictOrig): # get one peer, avoid captain candidate = "" selected = False # try to pick a perfered captain for peer in peerDictPreferedCaptain: if peerDictPreferedCaptain[peer]["mgmt_uri"] == captainInfo["mgmt_uri"]: continue candidate = peer selected = True break if not selected: for peer in peerDictOrig: if peerDictOrig[peer]["mgmt_uri"] == captainInfo["mgmt_uri"]: continue candidate = peer selected = True break if not selected: errorMessage= ("Upgrade script can't pick a member to upgrade while there are still some upgrade candidates available.\n" "This usually happens when the candidate is holding the captaincy, not transferring the captaincy to an upgraded member.\n" "The root reason might be that the captain is already running a higher version of Splunk, or the SHC is in an unhealthy state.") raise ValueError(errorMessage) logger.info("selected member %s to upgrade", peerDictOrig[candidate]["label"]) peer_mgmt_uri = peerDictOrig[candidate]['mgmt_uri'] detentionUri = peer_mgmt_uri + MANUAL_DETENTION_REST logger.info("set %s to manual detention", peer_mgmt_uri) rDetention = requests.post( detentionUri, params={'manual_detention': 'on'}, auth=(USERNAME, PASSWORD), verify=False) if rDetention.status_code != 200: raise ValueError("Error during setting manual detention") infoUri = peer_mgmt_uri + MEMBER_INFO_REST timeOut = TIMEOUT while True: # query status of the node logger.info("get member information from %s", infoUri) rInfo = requests.get( infoUri, auth=(USERNAME, PASSWORD), verify=False) if rInfo.status_code != 200: raise ValueError("Error during getting the member information") rInfoJson = rInfo.json() status = rInfoJson['entry'][0]['content']['status'] active_historical_search_count = rInfoJson['entry'][0]['content']['active_historical_search_count'] if status == 'ManualDetention' and active_historical_search_count == 0: break time.sleep(TIMEOUT_INTERVAL) timeOut = timeOut - TIMEOUT_INTERVAL if TIMEOUT != -1 and timeOut < 0: break #Check kvstore status start = time.time() kvstorestatusInfo = '' while kvstorestatusInfo !='ready' and time.time() - start < 240: kvstore_status = requests.get(kvstore_status_uri, auth=(USERNAME, PASSWORD), verify=False) if kvstore_status.status_code != 200: raise ValueError("Can't get KVStore status for SHC member %s" % peerDictOrig[peer]['mgmt_uri']) else: rKvstoreJson = kvstore_status.json() kvstorestatusInfo = rKvstoreJson['entry'][0]['content']['current']['status'] if (kvstorestatusInfo != 'ready'): time.sleep(60) if kvstorestatusInfo !='ready': raise ValueError("KVStore status is still not ready") uriResult = urlparse(peer_mgmt_uri) splunkcommand=argList.directory_of_splunk_home + "/bin/splunk stop" sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand] logger.info("stop splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during stopping splunk: %s" % ssherror) # check if we need to back up the existing installation if argList.backup_directory: backupcommand = "cp -rf " + argList.directory_of_splunk_home + " " + argList.backup_directory sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, backupcommand] logger.info("back up splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during backing up splunk: %s" % ssherror) installcommand = "tar -zxvf " + argList.new_splunk_package + " -C " + os.path.dirname(argList.directory_of_splunk_home) sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, installcommand] logger.info("upgrade splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during upgrading splunk: %s" % ssherror) splunkcommand = argList.directory_of_splunk_home + "/bin/splunk start --accept-license --answer-yes" sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand] logger.info("start splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during starting splunk: %s" % ssherror) # turn off manual detention logger.info("turn off manual detention for %s", peer_mgmt_uri) rDetention = requests.post( detentionUri, params={'manual_detention': 'off'}, auth=(USERNAME, PASSWORD), verify=False) if rDetention.status_code != 200: raise ValueError("Error during turning off manual detention") # post processing after the node is upgraded logger.info("waiting for the shc to be stable ...") time.sleep(60) peerDictOrig.pop(candidate, None) peerDictPreferedCaptain.pop(candidate, None) # update for possible new captain statusUri = argList.uri_of_member + SHCLUSTER_STATUS_REST logger.info('calling shc status at: %s', statusUri) rStatus = requests.get( statusUri, params={'advanced': 1}, auth=(USERNAME, PASSWORD), verify=False) if rStatus.status_code != 200: raise ValueError("Error during getting SHC status") rStatusJson = rStatus.json() # update the captain captainInfo = rStatusJson['entry'][0]['content']['captain'] # check if deployer needs to be upgraded if argList.deployer and distutils.util.strtobool(argList.deployer): configUri = captainInfo['mgmt_uri'] + SHCLUSTER_CONFIG_REST logger.info('getting deployer information at: %s', configUri) rStatus = requests.get( configUri, auth = (USERNAME, PASSWORD), verify = False) if rStatus.status_code != 200: raise ValueError("Error during getting deployer information") rStatusJson = rStatus.json() deployerInfo = rStatusJson['entry'][0]['content']['conf_deploy_fetch_url'] if deployerInfo: uriResult = urlparse(deployerInfo) splunkcommand = argList.directory_of_splunk_home + "/bin/splunk stop" sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand] logger.info("stop splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during stopping deployer: %s" % ssherror) installcommand = "tar -zxvf " + argList.new_splunk_package + " -C " + os.path.dirname( argList.directory_of_splunk_home) sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, installcommand] logger.info("upgrade splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during upgrading deployer: %s" % ssherror) splunkcommand = argList.directory_of_splunk_home + "/bin/splunk start --accept-license --answer-yes" sshcommand = ["ssh", "-l", SSHUSER, uriResult.hostname, splunkcommand] logger.info("start splunk %s", sshcommand) sshprocess = subprocess.Popen(sshcommand, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) sshresult, ssherror = sshprocess.communicate() if sshprocess.returncode: raise ValueError("Error during starting deployer: %s" % ssherror) except ValueError as err: logger.error(err.args) sys.exit(err.args) finally: finalizeUri = argList.uri_of_member + UPGRADE_FINALIZE_REST logger.info('finalize the shc upgrade %s', finalizeUri) rFinalize = requests.post( finalizeUri, auth=(USERNAME, PASSWORD), verify=False) print('SHC is upgraded successfully') logger.info('SHC is upgraded successfully') sys.exit(0)