#!/usr/bin/env python
# This work contains confidential material of Splunk Inc. Its use or disclosure in
# whole or in part without the express written permission of Splunk Inc. is prohibited.
#
# Search command that anonymizes search results: private/unknown terms are
# replaced with same-shaped random substitutes, while dates/times, public
# dictionary terms and protected attributes are preserved.
import os
import re
import random
import glob
import urllib
import sys
import splunk.Intersplunk
import splunk.clilib.cli_common as comm
from builtins import chr, range
from splunk.mining.DateParser import _validateDate, _validateTime
from splunk.clilib.bundle_paths import make_splunkhome_path

# Matches runs of non-alphanumeric characters (used to tokenize text).
WORD_REGEX = re.compile(r'[^a-zA-Z0-9]+')
# Same separator, but with a capturing group so split() keeps the separators
# and the original text can be reassembled exactly.
WORD_SPLIT = re.compile(r'([^a-zA-Z0-9]+)')


def _generateReplacement(term, nameterms):
    """Return an anonymized replacement for 'term'.

    If 'term' looks like a word and a same-length replacement name is
    available in 'nameterms' (a map of length -> list of names), one of
    those names is consumed and returned.  Otherwise each character is
    replaced in place: a digit with a random digit no larger than the
    original (so values such as IP address octets do not grow), a letter
    with a random letter of the same case, anything else left unchanged.
    """
    replacement = ""
    if looksLikeWord(term):
        # get list of names with the same length as the term
        names = nameterms.get(len(term), None)
        if names is not None:
            nameCount = len(names)
            if nameCount > 0:
                index = random.randint(1, nameCount)
                replacement = names[index-1]
                # consume the name so it is not reused for a different term
                del names[index-1]
                return replacement
    for ch in term:
        if ch.isdigit():
            # return a new number that is randomly less than the given value, so that ip addresses, and codes
            # are not higher than the value given. otherwise we wil get ip addresses like 554.785.455.545.
            # this assumes that if given a number, a number lower than it will be equally valid
            maxVal = int(ch)
            newch = str(random.randint(0, maxVal))
        elif ch.isalpha():
            if ch.islower():
                newch = chr(random.randint(97, 122))   # random 'a'-'z'
            else:
                newch = chr(random.randint(65, 90))    # random 'A'-'Z'
        else:
            newch = ch
        replacement += newch
    return replacement


def lengthLists(terms):
    """Group the keys of 'terms' by length: {len(key): [key, ...]}."""
    result = dict()
    for key in terms.keys():
        addToMapList(result, len(key), key)
    return result


############################# DATEFINDER

def findAllDatesAndTimes(text, timeInfoTuplet):
    """Return the (start, end) spans of all validated date and time matches
    in 'text', using the compiled patterns in 'timeInfoTuplet'."""
    # NOTE(review): this global declaration looks vestigial -- none of these
    # names is bound or read in this module.
    global today, _MIN_YEAR, _MAX_YEAR
    timeExpressions = timeInfoTuplet[0]
    dateExpressions = timeInfoTuplet[1]
    matches = getAllMatches(text, dateExpressions, _validateDate)
    matches.extend(getAllMatches(text, timeExpressions, _validateTime))
    return matches


def getAllMatches(text, expressions, validator):
    """Return spans of matches of any compiled 'expressions' in 'text'
    whose named groups pass 'validator'."""
    matches = list()
    for expression in expressions:
        for match in expression.finditer(text):
            values = match.groupdict()
            if validator(values):
                matches.append(match.span())
    return matches


# return true if position is between any start-end in list of regions
def inRegions(position, regions):
    for region in regions:
        start = region[0]
        end = region[1]
        if start <= position <= end:
            return True
    return False


def compilePatterns(formats):
    """Compile each regex string in 'formats' case-insensitively."""
    compiledList = list()
    for format in formats:
        compiledList.append(re.compile(format, re.I))
    return compiledList


def getTimeInfoTuplet(timestampconfilename):
    """Load and compile the timestamp configuration file.

    The file must resolve to a path under $SPLUNK_HOME/etc/anonymizer
    (path-traversal attempts are rejected).  Returns
    [compiledTimePatterns, compiledDatePatterns, minYear, maxYear].
    """
    root = os.path.realpath(make_splunkhome_path(['etc', 'anonymizer']))
    if not os.path.isabs(root):
        root = os.path.abspath(root)
    timestampconfilename = os.path.realpath(os.path.normpath(timestampconfilename))
    if not os.path.isabs(timestampconfilename):
        timestampconfilename = os.path.abspath(timestampconfilename)
    # confine the config file to the anonymizer directory
    if root != os.path.commonprefix([root, timestampconfilename]):
        print('*** File is not inside proper directory %s should be in %s'%(timestampconfilename, root))
        raise Exception('*** File is not inside proper directory %s should be in %s'%(timestampconfilename, root))
    text = readText(timestampconfilename)
    text = text.replace('\\n', '\n').replace('\n\n', '\n')
    results = {}
    # SECURITY NOTE: the config file is executed as python code. This is only
    # tolerable because the path check above restricts it to splunk's own
    # etc/anonymizer directory and builtins are disabled; do not relax either.
    exec(text, {"__builtins__":None}, results)
    compiledTimePatterns = compilePatterns(results['timePatterns'])
    compiledDatePatterns = compilePatterns(results['datePatterns'])
    timeInfoTuplet = [compiledTimePatterns, compiledDatePatterns, results['minYear'], results['maxYear']]
    return timeInfoTuplet

############################# DATEFINDER

################################### BEGIN COPIED FROM DCUTILS.PY

def addToMapList(map, key, value):
    """Append 'value' (if not already present) to the list at map[key],
    creating the list if needed; return that list."""
    if key in map:
        l = map[key]
    else:
        l = list()
        map[key] = l
    safeAppend(l, value)
    return l


def fileWords(filename, lowercase):
    """Tokenize 'filename' line by line; return a {term: count} map.
    If 'lowercase' is true, terms are lowercased first."""
    terms = dict()
    with open(filename) as f:
        count = 1
        while (True):
            line = f.readline()
            if (lowercase):
                line = line.lower()
            if len(line) == 0:
                break
            tokenize(line, False, terms)
            ##Is it possible to do previews from a search script?
            #if count % 100000 == 0:
            #    print('\t%u processed...' % count)
            count += 1
    return terms


def readText(filename):
    """Return the full contents of 'filename'."""
    with open(filename) as f:
        text = f.read()
    return text


MAX_SEGMENT = 1024


def findBreak(start, segSize, text):
    """Return the index in 'text' at which to end a segment of at most
    'segSize' characters starting at 'start', preferring a non-alphanumeric
    character within the last 100 positions of the segment."""
    segEnd = start + segSize - 1
    # BUGFIX: this was 'segEnd > len(text)', which let segEnd == len(text)
    # slip through and raised IndexError on text[end] below (e.g. for a
    # text whose length is exactly start + segSize - 1).
    if segEnd >= len(text):
        return len(text)-1
    for end in range(segEnd, max(start+1, segEnd-100), -1):
        if not text[end].isalnum():
            return end
    # failed to find break by going back 100 chars. give up and break at will.
    return segEnd


# returns maps of terms and phrases to their count
def tokenize(text, wordsOnlyP, vector=None):
    """Split 'text' into tokens and count each into 'vector' (returned).

    The text is processed in segments of at most MAX_SEGMENT characters,
    preferring to break segments at non-alphanumeric boundaries.  When
    'wordsOnlyP' is true, only tokens passing looksLikeWord() are counted.
    """
    # BUGFIX: the default used to be the mutable 'vector = dict()', which is
    # created once and shared across all calls that omit the argument; use
    # a None sentinel instead.
    if vector is None:
        vector = dict()
    segCount = int((len(text) + MAX_SEGMENT-1) / MAX_SEGMENT)
    segStart = 0
    for seg in range(0, segCount):
        segEnd = findBreak(segStart, MAX_SEGMENT, text)
        segText = text[segStart:segEnd+1]
        tokens = WORD_REGEX.split(segText)
        for token in tokens:
            if len(token) == 0:
                continue
            if not wordsOnlyP or looksLikeWord(token):
                incCount(vector, token, 1)
        segStart = segEnd+1
    return vector


def looksLikeWord(token):
    """True if 'token' is more than two alphabetic characters in a
    word-like case pattern (all lower, all upper, or one capital)."""
    upper = lower = 0
    for c in token:
        if not c.isalpha():
            return False
        if c.isupper():
            upper += 1
        else:
            lower += 1
    return len(token) > 2 and (upper == 0 or lower == 0 or upper == 1)


def incCount(map, val, count):
    """Add 'count' to map[val], creating the entry if needed."""
    if val in map:
        map[val] += count
    else:
        map[val] = count


def safeAppend(list, val):
    """Append 'val' to 'list' only if it is not already present."""
    if val not in list:
        list.append(val)

################################### END COPIED FROM DCUTILS.PY


def isInt(token):
    """True if 'token' parses as an integer and starts with a digit."""
    if len(token) > 0 and token[0].isdigit():
        try:
            int(token)
            return True
        except ValueError:  # was a bare 'except:', which hid unrelated errors
            pass
    return False


def caseSame(caseSource, textSource):
    """Recase 'textSource' character-by-character to match 'caseSource'.

    Assumes len(textSource) >= len(caseSource); replacements generated by
    _generateReplacement() always have the same length as the original.
    """
    result = ""
    for i in range(0, len(caseSource)):
        casech = caseSource[i]
        textch = textSource[i]
        if casech.isupper():
            textch = textch.upper()
        elif casech.islower():
            textch = textch.lower()
        result += textch
    return result


def scrubValue(result, val, isRaw, allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet):
    """Anonymize one attribute value 'val' of event 'result'.

    Tokens that are private (or unknown to the public dictionaries) are
    replaced consistently via the shared 'replacements' cache.  Tokens
    inside date/time regions (only computed for _raw) and tokens matching
    an attribute name of the event are preserved.
    """
    regions = []
    if isRaw:
        regions = findAllDatesAndTimes(val, timeInfoTuplet)
    position = 0
    tokens = WORD_SPLIT.split(val)
    newtokens = list()
    for token in tokens:
        lower = token.lower()
        newtoken = token
        incCount(allterms, token, 1)
        inDateRegion = inRegions(position, regions)
        # if term is name of not an attribute and not in a date region.
        # double check for numbers of public terms because date regions sometimes
        # have extraineous text if the regex matches contains a noise term or end of expression match
        if (result.get(lower, None) is None) and not (inDateRegion and (isInt(token) or (lower in publicTerms and lower not in privateTerms))):
            # if we haven't already made a replacement for this term and it's a private term or not a public term
            if lower not in replacements and (lower in privateTerms or lower not in publicTerms):
                replacements[lower] = newtoken = _generateReplacement(token, nameTerms) # make a replacement term
            newtoken = replacements.get(lower, token)
            newtoken = caseSame(token, newtoken)
        position += len(token)
        newtokens.append(newtoken)
    return ''.join(newtokens)


def scrub(results, publictermsfilename, privatefilename, nametermsfilename, dictionaryfilename, timestampconfigfilename):
    """Anonymize every scrubbable attribute of every result, in place."""
    replacements = dict()   # lowercased term -> replacement text, shared across all values
    privateTerms = fileWords(privatefilename, True)
    publicTerms = fileWords(dictionaryfilename, True)
    userpublicTerms = fileWords(publictermsfilename, True)
    nameTerms = lengthLists(fileWords(nametermsfilename, True))
    # add user public terms to default publicterms
    for t in userpublicTerms:
        publicTerms[t] = userpublicTerms[t]
    # add named entities to default publicterms
    protectedKeys = set(["eventtype", "linecount", "punct", "sourcetype", "timeendpos", "timestartpos"])
    timeInfoTuplet = getTimeInfoTuplet(timestampconfigfilename)
    allterms = dict()
    # for each result
    for r in results:
        # for each attribute
        for key, val in r.items():
            # only scrub attributes if doesn't start with '_' (except _raw) and if not a protected attribute and doesn't start with date_
            if (not key.startswith("_") or key == "_raw") and not key in protectedKeys and not key.startswith("date_"):
                r[key] = scrubValue(r, val, key == "_raw", allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet)


def locate_anonymize_file(filename, app_dir, err_collection):
    """Resolve 'filename' to a path in the app's or the global anonymizer
    directory; return the path, or None after appending a message to
    'err_collection'.  Bare filenames only -- path components are rejected."""
    # paths aren't accepted
    if "/" in filename or "\\" in filename or ".." in filename:
        msg = ("Pathnames are not accepted for any of the filename arguments. " +
               "The file specifier '%s' is not permitted.")
        err_collection.append(msg % filename)
        return None
    anonymize_dir = 'anonymizer'
    if app_dir:
        app_file_path = os.path.join(app_dir, anonymize_dir, filename)
        if os.path.isfile(app_file_path):
            return app_file_path
    global_file_path = make_splunkhome_path(['etc', anonymize_dir, filename])
    if os.path.isfile(global_file_path):
        return global_file_path
    # we couldn't find the file, so..
    msg = "The filename '%s' could not be found in the " % filename
    if app_dir:
        msg += "app or "
    msg += "the global directory. Checked "
    if app_dir:
        msg += "'%s' and " % app_file_path
    msg += "'%s', but did not locate the file." % global_file_path
    err_collection.append(msg)
    return None


if __name__ == '__main__':
    try:
        results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()
        argc = len(sys.argv)
        argv = sys.argv

        # if this is nonempty later, we'll write it out as error
        err_results = []

        # DEFAULT CONFIG FILE NAMES
        publictermsfilename = "public-terms.txt"
        privatetermsfilename = "private-terms.txt"
        nametermsfilename = "names.txt"
        dictionaryfilename = "dictionary.txt"
        timestampconfigfilename = 'anonymizer-time.ini'

        # GET ARGS
        keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions()
        # argvals = splunk.dcutils.getArgValues()

        # ALLOW ARGS TO OVERRIDE DEFAULTS
        publictermsfilename = argvals.get("public-terms", publictermsfilename)
        privatetermsfilename = argvals.get("private-terms", privatetermsfilename)
        nametermsfilename = argvals.get("name-terms", nametermsfilename)
        dictionaryfilename = argvals.get("dictionary", dictionaryfilename)
        timestampconfigfilename = argvals.get("time-config", timestampconfigfilename)

        # locate the files
        app = argvals.get("namespace")

        # first find the app, if it exists
        app_dir = None
        if app:
            if "/" in app or "\\" in app or ".." in app:
                msg = "Error: namespace name may not include the '/' '\\' or '..' sequences"
                err_results.append(msg)
            else:
                app_dir = make_splunkhome_path(['etc', 'apps', app])
                if not os.path.isdir(app_dir):
                    app_dir = make_splunkhome_path(['etc', comm.getAppDir(), app])
                    if not os.path.isdir(app_dir):
                        msg = "Error: could not find specified app '%s' on disk" % app
                        err_results.append(msg)
                        app_dir = None

        # now find each file in either the app or the global dir
        publicterms_path = locate_anonymize_file(publictermsfilename, app_dir, err_results)
        privateterms_path = locate_anonymize_file(privatetermsfilename, app_dir, err_results)
        nameterms_path = locate_anonymize_file(nametermsfilename, app_dir, err_results)
        dictionary_path = locate_anonymize_file(dictionaryfilename, app_dir, err_results)
        timestampconfig_path = locate_anonymize_file(timestampconfigfilename, app_dir, err_results)

        if not err_results:
            scrub(results, publicterms_path, privateterms_path, nameterms_path, dictionary_path, timestampconfig_path)
    except Exception:  # top-level boundary: report the traceback as a search error
        # (was a bare 'except:', which also swallowed SystemExit/KeyboardInterrupt)
        import traceback
        stack = traceback.format_exc()
        results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
        err_results = []

    # pass back explicitly determined errors
    if err_results:
        results = splunk.Intersplunk.generateErrorResults("\n".join(err_results))

    splunk.Intersplunk.outputResults( results )