Splunk_Docker/files/splunk-etc/apps/splunk_instrumentation/bin/instrumentation.py

# scripted inputs entry point

import os
import sys
import argparse
import datetime
import splunk_instrumentation.datetime_util as datetime_util
from time import sleep

# Command-line parsing must happen before splunk_instrumentation.constants is
# imported.
parser = argparse.ArgumentParser()
# --scheme and --validate-arguments are accepted, but their values are not
# read anywhere else in this file.
parser.add_argument('--scheme', action='store_true')
parser.add_argument('-v', '--validate-arguments', action='store_true')
parser.add_argument('--no-collect', action='store_true', help='will not collect and index data')
parser.add_argument('--no-send', action='store_true', help='will not query _telemetry and send data')
parser.add_argument('-m', '--mode', default="INPUT", help='is required if not running from splunkd modular inputs')
parser.add_argument('--test-schema')
parser.add_argument('--log-level')
parser.add_argument('--username')
parser.add_argument('--password')
parser.add_argument('--execution-id')
parser.add_argument('--quickdraw-url', help='used to override the quickdraw-url')
# NOTE: --run-unscheduled takes a value and is only tested for truthiness
# later on, so any non-empty value (even "false") enables it.
parser.add_argument('--run-unscheduled', help='Run even if not scheduled', default=False)
parser.add_argument('--default-quickdraw', help='used to override the quickdraw-url response')
parser.add_argument('--start-date', help='first date to query, in YYYY-MM-DD format (defaults to yesterday)')
parser.add_argument('--stop-date', help='last date to query, in YYYY-MM-DD format (inclusive) (defaults to yesterday)')
parser.add_argument('--batch-num', help='batch number')
args = parser.parse_args()


# Configuration is done through environment variables; convert the command-line arguments into environment variables.

# Export every option that was supplied (i.e. is truthy) as an environment
# variable. os.environ only accepts string values, so the boolean store_true
# flags are converted with str(); assigning True directly raises TypeError.
if args.mode:
    os.environ['INST_MODE'] = args.mode
if args.no_collect:
    os.environ['INST_NO_COLLECT'] = str(args.no_collect)
if args.no_send:
    os.environ['INST_NO_SEND'] = str(args.no_send)
if args.test_schema:
    os.environ['INST_TEST_SCHEMA'] = args.test_schema
if args.log_level:
    os.environ['INST_DEBUG_LEVEL'] = args.log_level
if args.execution_id:
    os.environ['INST_EXECUTION_ID'] = args.execution_id
if args.quickdraw_url:
    os.environ['QUICKDRAW_URL'] = args.quickdraw_url
if args.default_quickdraw:
    os.environ['DEFAULT_QUICKDRAW'] = args.default_quickdraw
if args.username:
    os.environ['SPLUNK_USERNAME'] = args.username
if args.password:
    os.environ['SPLUNK_PASSWORD'] = args.password
if args.run_unscheduled:
    # str() is a no-op for the string argparse produces; it only guards
    # against a non-string value reaching os.environ.
    os.environ['RUN_UNSCHEDULE'] = str(args.run_unscheduled)


# Routine to get the value of an input token
def get_key():
    '''
    Reads all of stdin and stores it, with trailing whitespace removed,
    in the INST_TOKEN environment variable.
    '''
    # stdin carries nothing but the token
    token = sys.stdin.read().rstrip()
    os.environ['INST_TOKEN'] = token

# Default SPLUNK_DB to <SPLUNK_HOME>/var/lib/splunk when it is unset or empty.
if not os.environ.get("SPLUNK_DB"):
    splunk_home = os.environ.get('SPLUNK_HOME')
    if splunk_home is None:
        # Fail with an explicit message instead of an opaque TypeError.
        raise EnvironmentError('SPLUNK_HOME must be set when SPLUNK_DB is not set')
    # 'var' is joined as its own path component so that a SPLUNK_HOME without
    # a trailing separator does not produce a path like "/opt/splunkvar/...".
    os.environ['SPLUNK_DB'] = os.path.join(splunk_home, 'var', 'lib', 'splunk')

# the default mode is INPUT and is what scripted inputs uses and implies
# there is a token passed in to stdin.
if os.environ['INST_MODE'] == "INPUT":
    get_key()

# These imports include splunk_instrumentation.constants, which must be imported after the environment variables are set.
from splunk_instrumentation.constants import SPLUNKRC, INST_PRE_EXECUTE_SLEEP, SPLUNKD_URI, BATCHES_PER_HOUR, BATCHES_MAX_SIZE   # noqa: E402
from splunk_instrumentation.service_bundle import ServiceBundle  # noqa: E402
from splunk_instrumentation.splunkd import Splunkd  # noqa: E402
from splunk_instrumentation.input import run_input  # noqa: E402
from splunk_instrumentation.report import report  # noqa: E402


def normalize_date_range_params(args, report_start_date):
    '''
    Resolves the date range used for data collection, in place on args.
    The start date is taken from args.start_date, else from report_start_date,
    else it is yesterday. The stop date is taken from args.stop_date, else it
    is yesterday. Supplied values are converted with datetime_util.str_to_date.
    :param args: Parsed CLI arguments; start_date and stop_date are overwritten
    :param report_start_date: reportStartDate specified in telemetry.conf
    :return: None
    '''
    fallback = datetime.date.today() - datetime.timedelta(days=1)

    # Precedence for the start date: CLI value, then reportStartDate, then yesterday.
    if args.start_date:
        args.start_date = datetime_util.str_to_date(args.start_date)
    elif report_start_date:
        args.start_date = datetime_util.str_to_date(report_start_date)
    else:
        args.start_date = fallback

    if args.stop_date:
        args.stop_date = datetime_util.str_to_date(args.stop_date)
    else:
        args.stop_date = fallback


def validate_date_range(args):
    '''
    Exits the script with status 0 when the stop date precedes the start date.
    :param args: Arguments whose start_date and stop_date have already been normalized
    :return: None when the range is valid; otherwise the process exits
    '''
    # SPL-153360 This can happen when the user has gone from no opt-in to some opt-in
    # on the same day of the scheduled collection, before the script has run. This is
    # due to the TelemetryHandler.cpp file, which detects the switch from no opt-in to
    # some opt-in and sets the reportStartDate to today.
    #
    # When the script finally runs, it has a default stop date of yesterday, but
    # reportStartDate sets the lower bound, which is today in that case. We do not
    # want to generate alarming error messages, so just log the occurrence and exit
    # gracefully.
    if args.stop_date < args.start_date:
        report.report('collection-canceled', {
            'reason': 'Start date is after stop date. No data to collect.',
            'start_date': args.start_date,
            'stop_date': args.stop_date
        })
        # sys.exit rather than the exit() builtin, which is injected by the
        # site module and intended for interactive use.
        sys.exit(0)


def _parse_scheduled_hour(scheduled_hour):
    '''Returns scheduledHour as an int, or None if it is missing or not an integer.'''
    try:
        return int(scheduled_hour)
    except (TypeError, ValueError):
        return None


def should_input_run(telemetry_conf_service, batch_num):
    '''
    Compares current time with the scheduledDay and scheduledHour
    to determine whether Input should execute or not
    :param telemetry_conf_service: Service for telemetry.conf
    :param batch_num: Batch number of this invocation, or None to run all batches
    :return: True if current time matched scheduling in telemetry.conf
    '''
    scheduled_day = telemetry_conf_service.content.get('scheduledDay')
    scheduled_hour = telemetry_conf_service.content.get('scheduledHour')
    # Compare day and hour to time now
    now = datetime.datetime.now()

    # We execute all saved searches in batches [0, BATCHES_MAX_SIZE) over two hours:
    # batches [0, BATCHES_PER_HOUR) run at scheduledHour (offset 0) and batches
    # [BATCHES_PER_HOUR, BATCHES_MAX_SIZE) run one hour later (offset 1).
    # When no batch number is provided, all batches run at scheduledHour.
    if batch_num is None:
        hour_offset = 0
    else:
        hour_offset = batch_num // BATCHES_PER_HOUR

    should_run = False
    if hour_offset in (0, 1):
        # Shift "now" back by the offset and compare the result against the
        # schedule. Unlike "scheduledHour + 1 == now.hour", this also works when
        # scheduledHour is 23: the second hour then falls on hour 0 of the
        # following day, and both the hour and the weekday roll back correctly.
        window_start = now - datetime.timedelta(hours=hour_offset)
        day_matches = scheduled_day == '*' or scheduled_day == str(window_start.weekday())
        # A missing or non-integer scheduledHour never matches.
        hour_matches = _parse_scheduled_hour(scheduled_hour) == window_start.hour
        should_run = day_matches and hour_matches

    report.report('schedule-data', {
        'schedule': {
            'day': scheduled_day,
            'hour': scheduled_hour
        },
        'now': {
            'day': str(now.weekday()),
            'hour': str(now.hour)
        },
        'batchNum': str(batch_num),
        'should_run': should_run
    })
    return should_run


def process_input_params(telemetry_conf_service, args):
    '''
    Processes Input date range params and sets reportStartDate in telemetry.conf
    :param telemetry_conf_service: Service for telemetry.conf
    :param args: List of arguments passed to Scripted input; start_date and
                 stop_date are normalized in place
    :return: None (the script exits instead if the date range is empty)
    '''
    report_start_date = telemetry_conf_service.content.get('reportStartDate')
    report.report('reportStartDate', report_start_date)

    normalize_date_range_params(args, report_start_date)
    validate_date_range(args)

    # update the 'reportStartDate' before triggering input.py ONLY if
    # 1. the batch num is not provided; we execute all batches by default OR
    # 2. it is currently executing the last batch (BATCHES_MAX_SIZE - 1, since we start from 0) OR
    # 3. it is an unscheduled invocation
    batch_num = get_batch_num(args)
    if (batch_num is None) or (batch_num == BATCHES_MAX_SIZE - 1) or os.environ.get('RUN_UNSCHEDULE'):
        new_report_start_date = args.stop_date
        # isinstance (rather than an exact type comparison) so that subclasses
        # of datetime.date are formatted as YYYY-MM-DD as well.
        if isinstance(new_report_start_date, datetime.date):
            new_report_start_date = new_report_start_date.strftime('%Y-%m-%d')
        telemetry_conf_service.update({
            'reportStartDate': new_report_start_date
        })


def get_batch_num(args):
    '''
    Returns the --batch-num input argument as an int, or None when it is
    empty, missing, or the literal string "None".
    '''
    raw_value = args.batch_num
    if not raw_value or raw_value == "None":
        return None
    return int(raw_value)


# Routine to index data
def main():
    '''
    Connects to splunkd and runs the input when the invocation is unscheduled
    (RUN_UNSCHEDULE is set) or the schedule in telemetry.conf matches;
    otherwise exits with status 114.
    '''
    in_dev_mode = os.environ['INST_MODE'] == "DEV"
    if in_dev_mode:
        splunkd = Splunkd(**SPLUNKRC)
    else:
        sleep(INST_PRE_EXECUTE_SLEEP)
        splunkd = Splunkd(token=os.environ['INST_TOKEN'], server_uri=SPLUNKD_URI)

    telemetry_conf_service = ServiceBundle(splunkd).telemetry_conf_service
    batch_num = get_batch_num(args)

    # The schedule is only consulted when RUN_UNSCHEDULE is not set.
    if not os.environ.get('RUN_UNSCHEDULE') and not should_input_run(telemetry_conf_service, batch_num):
        # indicate to caller that input wasn't executed
        sys.exit(114)

    process_input_params(telemetry_conf_service, args)
    collection_params = {
        'start': args.start_date,
        'stop': args.stop_date,
        'batchNum': batch_num
    }
    run_input(collection_params)


# Script must implement these args: scheme, validate-arguments
main()

# main() returned normally, so finish with a success status.
raise SystemExit(0)