from __future__ import absolute_import
from __future__ import print_function
# Version 4.0
#
# Intersplunk provides simple access to the comm protocol between Splunk search
# operators.
#
# The intersplunk format is plain CSV, with a first-line field header.
#

from builtins import zip
from builtins import range
import csv
import sys
import copy
import re
if sys.version_info >= (3, 0):
    from io import (BytesIO, TextIOWrapper, StringIO)
else:
    from StringIO import StringIO
    BytesIO = StringIO
try:
    from future.moves.urllib import parse as urllib_parse
except ImportError:
    # Python 3 without the `future` package installed: the standard library
    # module offers the same interface.
    from urllib import parse as urllib_parse
import os

# set the maximum allowable CSV field size
#
# The default of the csv module is 128KB; upping to 10MB. See SPL-12117 for
# the background on issues surrounding field sizes.
# (this method is new in python 2.5)
csv.field_size_limit(10485760)

MV_ENABLED = True


def set_binary_mode(fileobj):
    '''
    Switch an already-open file object to binary mode on Windows.

    Does nothing on any other platform.
    '''
    # Pylint can't handle platform-dependent code.
    # pylint: disable-all
    # This works around a design error in Intersplunk where it assumes that the
    # bytes it writes to stdout will be identical to the bytes which are
    # emitted.
    # This is false on windows where \n is mapped to \r\n
    # The typical solution is to simply open the file in binary mode, but stdout
    # is already open, thus this hack
    if sys.platform == 'win32':
        import msvcrt
        msvcrt.setmode(fileobj.fileno(), os.O_BINARY)


def default_stdout_stream():
    '''
    Return the stream that results are written to by default.

    On Python 3 this is the binary buffer underneath sys.stdout; on Python 2
    it is sys.stdout itself, switched to binary mode first.
    '''
    if sys.version_info >= (3, 0):
        return sys.stdout.buffer
    set_binary_mode(sys.stdout)
    return sys.stdout


def splunkHome():
    '''
    Return the normalized path held in the SPLUNK_HOME environment variable.

    Raises KeyError when the variable is not set.
    '''
    return os.path.normpath(os.environ["SPLUNK_HOME"])


def isGetInfo(args):
    '''
    Classify a script invocation by its first argument.

    Returns (True, newargs) for "__GETINFO__" and (False, newargs) for
    "__EXECUTE__", where newargs is args with that marker removed. Any other
    invocation writes an error result and exits the process.
    '''
    if len(args) >= 2:
        mode = args[1]
        if mode == "__GETINFO__" or mode == "__EXECUTE__":
            newargs = [args[0]]
            newargs.extend(args[2:])
            return (mode == "__GETINFO__", newargs)

    # invalid invocation, exit and return error message immediately
    generateErrorResults("Unexpected first argument to script, expected '__GETINFO__' or '__EXECUTE__'.")
    sys.exit()


def parseError(msg):
    '''
    Write msg as an error result and exit the process.
    '''
    generateErrorResults(msg)
    sys.exit()


def outputInfo(streaming, generating, retevs, reqsop, preop, timeorder=False, clear_req_fields=False, req_fields = None):
    '''
    Write a single result row describing this command, then exit the process.

    Each truthy flag flips the matching entry of the row from its default:
    streaming, generating, retevs ('retainsevents'), reqsop ('requires_preop')
    and clear_req_fields ('clear_required_fields'). preop is passed through as
    'streaming_preop'. req_fields is emitted as 'required_fields' only when it
    is non-empty.
    '''
    infodict = {
        'streaming_preop': preop,
        'streaming': '0',
        'generating': '0',
        'retainsevents': '0',
        'requires_preop': '0',
        'generates_timeorder': '0',
        'overrides_timeorder': '1',
        'clear_required_fields': '0',
    }
    if streaming:
        infodict['streaming'] = '1'
    # timeorder sets 'generates_timeorder' for a generating command and clears
    # 'overrides_timeorder' for a non-generating one.
    if generating:
        infodict['generating'] = '1'
        if timeorder:
            infodict['generates_timeorder'] = '1'
    else:
        if timeorder:
            infodict['overrides_timeorder'] = '0'
    if retevs:
        infodict['retainsevents'] = '1'
    if reqsop:
        infodict['requires_preop'] = '1'
    if clear_req_fields:
        infodict['clear_required_fields'] = '1'
    if req_fields is not None and len(req_fields) > 0:
        infodict['required_fields'] = req_fields
    outputResults([infodict], mvdelim=',')
    sys.exit()


def getEncodedMV(vals):
    '''
    Encode a sequence of strings as one multivalue string.

    For multivalues, values are wrapped in '$' and separated using ';'.
    Literal '$' values are represented with '$$'.
    '''
    return ';'.join('$' + val.replace('$', '$$') + '$' for val in vals)


def decodeMV(s, vals):
    '''
    Decode a multivalue string produced by getEncodedMV().

    Decoded values are appended to vals. Returns False for an empty string or
    when a character other than ';' appears between '$'-wrapped values (vals
    may already hold the values decoded so far); returns True otherwise.
    A final value with no closing '$' is not appended.
    '''
    if len(s) == 0:
        return False

    tok = ""
    inval = False
    i = 0
    while i < len(s):
        ch = s[i]
        if not inval:
            if ch == '$':
                inval = True
            elif ch != ';':
                return False
        elif ch == '$' and i + 1 < len(s) and s[i + 1] == '$':
            # '$$' inside a value is an escaped literal '$'
            tok += '$'
            i += 1
        elif ch == '$':
            inval = False
            vals.append(tok)
            tok = ""
        else:
            tok += ch
        i += 1
    return True


def addMessage(messages, msg, key):
    '''
    Append msg to the list stored under key in messages, creating it if needed.
    '''
    if key not in messages:
        messages[key] = []
    messages[key].append(msg)


def addInfoMessage(messages, msg):
    '''Record msg under the "info_message" key.'''
    addMessage(messages, msg, "info_message")


def addWarnMessage(messages, msg):
    '''Record msg under the "warn_message" key.'''
    addMessage(messages, msg, "warn_message")


def addErrorMessage(messages, msg):
    '''Record msg under the "error_message" key.'''
    addMessage(messages, msg, "error_message")


def _write_csv(outputfile, fieldnames, rows):
    '''
    Write a header row plus rows (dicts) as CSV to outputfile.

    Keys that are not in fieldnames are ignored. On Python 3 outputfile is a
    binary stream and the text is encoded as UTF-8.
    '''
    is_py3 = sys.version_info >= (3, 0)
    stream = outputfile
    if is_py3:
        # newline='' stops the text layer from rewriting '\n' as os.linesep,
        # which would corrupt the csv module's own '\r\n' terminators and any
        # newline embedded in a field on Windows.
        stream = TextIOWrapper(outputfile, encoding = 'utf-8', newline = '', write_through = True)
    try:
        dw = csv.DictWriter(stream, fieldnames, extrasaction='ignore')
        dw.writerow(dict(zip(fieldnames, fieldnames)))
        dw.writerows(rows)
    finally:
        if is_py3:
            # Don't close the underlying file, even when writing failed:
            # a wrapper that is still attached closes it when collected.
            stream.detach()


def outputResults(results, messages = None, fields = None, mvdelim = '\n', outputfile = None):
    '''
    Outputs the contents of a result set to STDOUT in Intersplunk format,
    for consumption by the next search processor.

    results    -- sequence of dicts, or None to write only the messages.
                  NOTE: the dicts are modified in place: every list value is
                  replaced by its mvdelim-joined string and an encoded
                  '__mv_<key>' entry is added next to it.
    messages   -- optional dict mapping a level name to a list of messages,
                  written as "level=message" lines followed by a blank line.
    fields     -- optional explicit column list; by default the columns are
                  all keys of all results in first-seen order.
    mvdelim    -- separator used to flatten list values.
    outputfile -- binary stream to write to (defaults to stdout).
    '''
    if outputfile is None:
        outputfile = default_stdout_stream()

    if messages is not None:
        # message header is everything before the first empty line, similar to the input
        # header format. also key = value, with stripping of whitespace
        for level, level_msgs in messages.items():
            for msg in level_msgs:
                line = "%s=%s\n" % (level, msg)
                if sys.version_info >= (3, 0):
                    line = line.encode()
                outputfile.write(line)
        outputfile.write(b"\n")

    if results is None:
        return

    seen = set()
    ordered_keys = []

    # Check each entry to see if it is a list (multivalued).
    # If so, set the multivalued key to the proper encoding and
    # replace the list with a delimiter separated string of the values.
    for result in results:
        # Iterate over a snapshot of the keys: '__mv_' keys are added below.
        for key in list(result.keys()):
            if isinstance(result[key], list):
                result['__mv_' + key] = getEncodedMV(result[key])
                result[key] = mvdelim.join(result[key])
        for key in result:
            if key not in seen:
                seen.add(key)
                ordered_keys.append(key)

    header = ordered_keys if fields is None else fields
    _write_csv(outputfile, header, results)


def outputStreamResults(results, version = "4.3", header = None, mvdelim = '\n', outputfile = None):
    '''
    Write results as one length-prefixed chunk.

    The chunk starts with the line "splunk <version>,<header length>,<body
    length>" and is followed by the optional header and then the body, each
    rendered by outputResults(). Lengths are counted in bytes.
    '''
    if outputfile is None:
        outputfile = default_stdout_stream()

    header_str = b""
    if header is not None:
        header_io = BytesIO()
        outputResults(header, None, None, mvdelim, header_io)
        header_str = header_io.getvalue()
        header_io.close()

    body_io = BytesIO()
    outputResults(results, None, None, mvdelim, body_io)
    body_str = body_io.getvalue()
    body_io.close()

    if sys.version_info >= (3, 0):
        version = version.encode()
    outputfile.write(b"splunk %s,%d,%d\n" % (version, len(header_str), len(body_str)))
    if len(header_str) > 0:
        outputfile.write(header_str)
    if len(body_str) > 0:
        outputfile.write(body_str)


def generateErrorResults(errorStr):
    '''
    Writes errorStr to STDOUT as a one-row result set whose only field is
    "ERROR".
    '''
    _write_csv(default_stdout_stream(), ["ERROR"], [{"ERROR": errorStr}])
    # return [{"ERROR": errorStr}]
    return None  # legacy calls tried to use this value.

def _read_settings_header(input_buf, settings):
    '''
    Read "attr:val" lines from input_buf into settings, up to a blank line.

    Values are URL-unquoted. A line without a colon continues the value of
    the preceding attribute (joined with a newline); such a line is skipped
    when no attribute has been read yet.
    '''
    last_attr = None
    while True:
        line = input_buf.readline()
        # Drop the line terminator only when there is one, so that a final
        # line without a newline keeps its last character.
        if line.endswith('\n'):
            line = line[:-1]
        if len(line) == 0:
            break

        colon = line.find(':')
        if colon < 0:
            if last_attr:
                settings[last_attr] = settings[last_attr] + '\n' + urllib_parse.unquote(line)
            # A continuation line is never an attribute of its own.
            continue

        # extract it and set value in settings
        last_attr = line[:colon]
        settings[last_attr] = urllib_parse.unquote(line[colon+1:])


def readResults(input_buf = None, settings = None, has_header = True):
    '''
    Converts an Intersplunk-formatted file object into a list of ordered
    dicts, one per event.

    input_buf  -- text file object to read (defaults to stdin).
    settings   -- optional dict that receives the "attr:val" header values.
    has_header -- whether a settings header precedes the CSV data.

    When multivalue support is enabled, a field 'foo' that has a matching
    '__mv_foo' column is replaced by the decoded list of values (or by the
    single value when there is only one) and the '__mv_foo' entry is
    removed. If the '__mv_foo' column cannot be decoded both entries are
    left untouched.
    '''
    if input_buf is None:
        if sys.version_info >= (3, 0):
            input_buf = TextIOWrapper(sys.stdin.buffer, encoding='utf-8')
        else:
            input_buf = sys.stdin

    if settings is None:
        settings = {}  # dummy

    if has_header:
        # until we get a blank line, read "attr:val" lines, setting the values in 'settings'
        _read_settings_header(input_buf, settings)

    # need to maintain field order; resolve the dict type once rather than
    # importing for every row.
    try:
        from splunk.util import OrderedDict as ordered_dict
    except ImportError:
        from collections import OrderedDict as ordered_dict

    results = []
    header = []
    mv_fields = []
    first = True
    for line in csv.reader(input_buf):
        if first:
            header = line
            first = False
            # Check which fields are multivalued (for a field 'foo', '__mv_foo' also exists)
            if MV_ENABLED:
                header_names = set(header)
                mv_fields = [field for field in header if "__mv_" + field in header_names]
            continue

        result = ordered_dict()
        for i, val in enumerate(line):
            result[header[i]] = val

        for key in mv_fields:
            mv_key = "__mv_" + key
            if key in result and mv_key in result:
                # Expand the value of __mv_[key] to a list, store it in key, and delete __mv_[key]
                vals = []
                if decodeMV(result[mv_key], vals):
                    result[key] = vals[0] if len(vals) == 1 else vals
                    del result[mv_key]
        results.append(result)

    return results


def getOrganizedResults(input_str = None):
    '''
    Converts an Intersplunk-formatted file object into a dict representation
    of the contained events, and returns a tuple of:

        (results, dummyresults, settings)

    "dummyresults" is always an empty list. "settings" holds whatever
    "attr:val" header lines readResults() found before the CSV data.
    NOTE(review): the original note said settings is always empty since the
    change to csv stopped sending the searchinfo, and that it has not been
    updated to store the auth token -- confirm against the caller.
    '''
    settings = {}
    dummyresults = []
    results = readResults(input_str, settings)
    return results, dummyresults, settings


def rawresultsToString(results):
    '''
    Extracts the raw event data ("_raw") from a result set and returns all
    of them as a single newline-delimited string. Results without a "_raw"
    field are skipped.
    '''
    # TODO: is this method still being used?
    return "\n".join(result["_raw"] for result in results if "_raw" in result)


def win32_utf8_argv():
    """Uses shell32.GetCommandLineArgvW to get sys.argv as a list of UTF-8
    strings.

    Versions 2.5 and older of Python don't support Unicode in sys.argv on
    Windows, with the underlying Windows API instead replacing multi-byte
    characters with '?'.

    On Python 3 sys.argv is returned unchanged.

    Returns None on failure.

    Example usage:

    >>> def main(argv=None):
    ...    if argv is None:
    ...        argv = win32_utf8_argv() or sys.argv
    ...
    """
    if sys.version_info >= (3, 0):
        return sys.argv

    try:
        from ctypes import POINTER, byref, cdll, c_int, windll
        from ctypes.wintypes import LPCWSTR, LPWSTR

        GetCommandLineW = cdll.kernel32.GetCommandLineW
        GetCommandLineW.argtypes = []
        GetCommandLineW.restype = LPCWSTR

        CommandLineToArgvW = windll.shell32.CommandLineToArgvW
        CommandLineToArgvW.argtypes = [LPCWSTR, POINTER(c_int)]
        CommandLineToArgvW.restype = POINTER(LPWSTR)

        cmd = GetCommandLineW()
        argc = c_int(0)
        argv = CommandLineToArgvW(cmd, byref(argc))
        if argc.value > 0:
            # Remove Python executable if present
            if argc.value - len(sys.argv) == 1:
                start = 1
            else:
                start = 0
            return [argv[i].encode('utf-8') for i in range(start, argc.value)]
    except Exception:
        # Deliberate best effort (e.g. not on Windows): the caller falls
        # back to sys.argv when None is returned.
        pass
    return None


def getKeywordNewlineSafe(arg, argname):
    '''
    Split a single "argname=value" argument without touching newlines in
    the value.

    Surrounding double quotes are removed first. Returns
    [(argname, '=', value)] when arg starts with argname (case-sensitive),
    otherwise an empty list. The separator may be '=', '==' or '::'.
    '''
    argnamelen = len(argname)
    if arg.startswith('"') and arg.endswith('"'):
        arg = arg[1:-1]

    if not arg.startswith(argname):
        return []

    # pick off just the search string and construct the list
    # technically we could have gotten '::' or '==' and not just '='
    if arg.startswith("%s::" % argname) or arg.startswith("%s==" % argname):
        val = arg[argnamelen+2:]
    else:
        val = arg[argnamelen+1:]
    return [(argname, '=', val)]


# from sys.argv, get key=value args as well as other plain keyword args (e.g. "file")
# decode the values if charset is provided
def getKeywordsAndOptions(charset=None):
    '''
    Split the command-line arguments (without the script name) into plain
    keywords and key/value options.

    Returns (keywords, kvs): a list of the arguments that hold no key=value
    pair, and a dict of every key=value (also '==' and '::') pair found.
    Byte-string values and keywords are decoded with charset when given.
    '''
    def needs_decoding(obj):
        if sys.version_info >= (3, 0):
            return isinstance(obj, bytes)
        return isinstance(obj, str)

    keywords = []
    kvs = {}

    # SPL-30670 - handle unicode args specially in windows
    argv = win32_utf8_argv() or sys.argv

    # ssquery could have newlines within the search, don't lose them - SPL-65995
    # message and footer could have newlines within them as well
    newline_safe_names = ('ssquery', 'message', 'footer')

    # for each arg, skipping the script name
    for arg in argv[1:]:
        lowered = arg.lower()
        matches = None
        for name in newline_safe_names:
            if re.match("\"?%s(::|={1,2})" % name, lowered):
                matches = getKeywordNewlineSafe(arg, name)
                break

        if matches is None:
            # handle case where arg is surrounded by quotes
            # remove outer quotes and accept attr=<anything>
            if arg.startswith('"') and arg.endswith('"'):
                arg = arg[1:-1]
                matches = re.findall(r'(?:^|\s+)([a-zA-Z0-9_-]+)\s*(::|==|=)\s*(.*)', arg)
            else:
                matches = re.findall(r'(?:^|\s+)([a-zA-Z0-9_-]+)\s*(::|==|=)\s*((?:[^"\s]+)|(?:"[^"]*"))', arg)

        if len(matches) == 0:
            if charset is not None and needs_decoding(arg):
                arg = arg.decode(charset)
            keywords.append(arg)
            continue

        # for each k=v match
        for attr, eq, val in matches:
            if charset is not None and needs_decoding(val):
                val = val.decode(charset)
            kvs[attr] = val

    return keywords, kvs