You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
383 lines
13 KiB
383 lines
13 KiB
#!/usr/bin/env python
|
|
# This work contains confidential material of Splunk Inc. Its use or disclosure in
|
|
#whole or in part without the express written permission of Splunk Inc. is prohibited.
|
|
|
|
import os
|
|
import re
|
|
import random
|
|
import glob
|
|
import urllib
|
|
import sys
|
|
import splunk.Intersplunk
|
|
import splunk.clilib.cli_common as comm
|
|
|
|
from builtins import chr, range
|
|
from splunk.mining.DateParser import _validateDate, _validateTime
|
|
from splunk.clilib.bundle_paths import make_splunkhome_path
|
|
|
|
|
|
# Matches a run of characters that are not ASCII letters or digits.
# tokenize() splits on it, so the separators themselves are discarded.
WORD_REGEX = re.compile(r'[^a-zA-Z0-9]+')
|
|
# Same separator pattern, but wrapped in a capturing group so that re.split
# also returns the separators; scrubValue() relies on this to rebuild the
# value by joining the (possibly replaced) tokens.
WORD_SPLIT = re.compile(r'([^a-zA-Z0-9]+)')
|
|
|
|
def _generateReplacement(term, nameterms):
    """Build a random stand-in for ``term``.

    If ``term`` looks like a word (see ``looksLikeWord``) and ``nameterms``
    still holds names filed under ``len(term)``, one of them is picked at
    random, removed from that list so it is not handed out again, and
    returned.

    Otherwise the replacement is built character by character:
      * a digit becomes a random digit between 0 and itself,
      * a lowercase letter becomes a random letter a-z,
      * any other letter becomes a random letter A-Z,
      * everything else is copied unchanged.

    Args:
        term: the token to replace.
        nameterms: dict mapping a length to a list of candidate names;
            the chosen name is deleted from its list (the dict is mutated).

    Returns:
        The replacement string.
    """
    if looksLikeWord(term):
        # candidate names filed under the same length as the term
        names = nameterms.get(len(term))
        if names:
            index = random.randint(1, len(names)) - 1
            # pop() both fetches the name and retires it from the pool
            return names.pop(index)

    pieces = []
    for ch in term:
        if ch.isdigit():
            # Return a new number that is randomly less than or equal to the
            # given value, so that ip addresses and codes are not higher than
            # the value given; otherwise we would get ip addresses like
            # 554.785.455.545. This assumes that a number lower than the
            # given one is equally valid.
            try:
                maxVal = int(ch)
            except ValueError:
                # Some characters (e.g. superscript digits) satisfy
                # isdigit() but are rejected by int(); keep them as-is
                # instead of crashing.
                newch = ch
            else:
                newch = str(random.randint(0, maxVal))
        elif ch.isalpha():
            if ch.islower():
                newch = chr(random.randint(97, 122))
            else:
                newch = chr(random.randint(65, 90))
        else:
            newch = ch
        pieces.append(newch)
    return "".join(pieces)
|
|
|
|
def lengthLists(terms):
    """Group the keys of ``terms`` by their length.

    Args:
        terms: mapping whose keys are the strings to group; the values
            are ignored.

    Returns:
        dict mapping each key length to the list of keys of that length,
        in the mapping's iteration order.
    """
    result = dict()
    for key in terms:
        # Keys of a mapping are already unique, so a plain append is enough;
        # scanning the bucket for duplicates on every insert would make the
        # whole grouping quadratic.
        result.setdefault(len(key), []).append(key)
    return result
|
|
|
|
|
|
############################# DATEFINDER
|
|
|
|
def findAllDatesAndTimes(text, timeInfoTuplet):
    """Return the spans of every valid date and time expression in ``text``.

    Args:
        text: the string to search.
        timeInfoTuplet: sequence whose element 0 holds the compiled time
            patterns and whose element 1 holds the compiled date patterns.

    Returns:
        list of (start, end) spans as collected by ``getAllMatches``:
        the date matches first, followed by the time matches.
    """
    # (The former ``global today, _MIN_YEAR, _MAX_YEAR`` declaration named
    # variables this function never reads or writes, so it was dropped.)
    timeExpressions = timeInfoTuplet[0]
    dateExpressions = timeInfoTuplet[1]
    matches = getAllMatches(text, dateExpressions, _validateDate)
    matches.extend(getAllMatches(text, timeExpressions, _validateTime))
    return matches
|
|
|
|
|
|
def getAllMatches(text, expressions, validator):
    """Collect the spans of all validated regex matches in ``text``.

    Args:
        text: the string to search.
        expressions: iterable of compiled regular expressions; each is
            applied to the whole text with ``finditer``.
        validator: callable receiving a match's ``groupdict()``; the match
            is kept only when it returns a true value.

    Returns:
        list of ``match.span()`` tuples, grouped by expression in the order
        the expressions were given.
    """
    matches = list()
    for expression in expressions:
        for match in expression.finditer(text):
            if validator(match.groupdict()):
                matches.append(match.span())
    return matches
|
|
|
|
# return true if position is between any start-end in list of regions
|
|
def inRegions(position, regions):
    """Tell whether ``position`` lies inside any of ``regions``.

    Each region is indexed as ``region[0]`` (start) and ``region[1]`` (end);
    both bounds are treated as inclusive.
    """
    return any(region[0] <= position <= region[1] for region in regions)
|
|
|
|
def compilePatterns(formats):
    """Compile every pattern string in ``formats`` case-insensitively.

    Returns a list of compiled regular expressions in the same order.
    """
    return [re.compile(pattern, re.I) for pattern in formats]
|
|
|
|
def getTimeInfoTuplet(timestampconfilename):
    """Load the timestamp configuration file and compile its patterns.

    The file is only accepted when its resolved path lies inside the
    directory returned by ``make_splunkhome_path(['etc', 'anonymizer'])``.
    Its content is executed as Python statements and is expected to define
    ``timePatterns``, ``datePatterns``, ``minYear`` and ``maxYear``.

    Args:
        timestampconfilename: path of the configuration file.

    Returns:
        list ``[compiledTimePatterns, compiledDatePatterns, minYear, maxYear]``.

    Raises:
        Exception: when the file is outside the anonymizer directory.
        KeyError: when the configuration does not define one of the
            expected names.
    """
    root = os.path.realpath(make_splunkhome_path(['etc', 'anonymizer']))
    if not os.path.isabs(root):
        root = os.path.abspath(root)

    timestampconfilename = os.path.realpath(os.path.normpath(timestampconfilename))
    if not os.path.isabs(timestampconfilename):
        timestampconfilename = os.path.abspath(timestampconfilename)

    # Compare on whole path components. os.path.commonprefix works character
    # by character, so a sibling such as ".../anonymizer-other/x" shares the
    # prefix ".../anonymizer" and would wrongly pass a prefix comparison.
    normRoot = os.path.normcase(root)
    normFile = os.path.normcase(timestampconfilename)
    insideRoot = (normFile == normRoot or
                  normFile.startswith(normRoot.rstrip(os.sep) + os.sep))
    if not insideRoot:
        msg = '*** File is not inside proper directory %s should be in %s' % (timestampconfilename, root)
        print(msg)
        raise Exception(msg)

    text = readText(timestampconfilename)
    text = text.replace('\\n', '\n').replace('\n\n', '\n')
    results = {}
    # NOTE(review): exec() runs the file content as code. Clearing
    # __builtins__ is not a sandbox; the directory check above is the only
    # real guard, so this file must stay writable by trusted admins only.
    exec(text, {"__builtins__": None}, results)
    compiledTimePatterns = compilePatterns(results['timePatterns'])
    compiledDatePatterns = compilePatterns(results['datePatterns'])
    timeInfoTuplet = [compiledTimePatterns, compiledDatePatterns, results['minYear'], results['maxYear']]
    return timeInfoTuplet
|
|
|
|
############################# DATEFINDER
|
|
|
|
################################### BEGIN COPIED FROM DCUTILS.PY
|
|
|
|
def addToMapList(map, key, value):
    """Add ``value`` to the list stored under ``key``, skipping duplicates.

    The list is created (empty) the first time ``key`` is seen.

    Returns:
        The list stored under ``key``.
    """
    bucket = map.setdefault(key, list())
    if value not in bucket:
        bucket.append(value)
    return bucket
|
|
|
|
|
|
def fileWords(filename, lowercase):
    """Count the tokens found in a text file.

    Args:
        filename: path of the file to read.
        lowercase: when true, every line is lower-cased before tokenizing.

    Returns:
        dict filled by ``tokenize`` (called with wordsOnlyP=False, so every
        non-empty token is counted, not only word-like ones).
    """
    terms = dict()
    with open(filename) as f:
        # Iterating the file yields the same lines as repeated readline()
        # calls and stops at end of file on its own.
        for line in f:
            if lowercase:
                line = line.lower()
            tokenize(line, False, terms)
    return terms
|
|
|
|
def readText(filename):
    """Return the whole content of ``filename`` as a single string."""
    with open(filename) as handle:
        return handle.read()
|
|
|
|
# Maximum number of characters handed to the tokenizer regex at once.
MAX_SEGMENT = 1024


def findBreak(start, segSize, text):
    """Choose the index of the last character of the segment at ``start``.

    The segment nominally ends ``segSize`` characters after ``start``. To
    avoid cutting a word in two, the end is moved back (by at most 100
    characters, and never as far as ``start + 1``) to the nearest
    non-alphanumeric character.

    Args:
        start: index of the first character of the segment.
        segSize: nominal segment length.
        text: the full text being segmented.

    Returns:
        Index of the segment's last character; ``len(text) - 1`` when the
        nominal end lies beyond the text.
    """
    segEnd = start + segSize - 1
    # ">=" rather than ">": segEnd == len(text) is already one past the last
    # valid index, and probing text[segEnd] below would raise IndexError.
    if segEnd >= len(text):
        return len(text) - 1
    for end in range(segEnd, max(start + 1, segEnd - 100), -1):
        if not text[end].isalnum():
            return end
    # failed to find a break by going back 100 chars. give up and break at will.
    return segEnd
|
|
|
|
# returns maps of terms and phrases to their count
|
|
def tokenize(text, wordsOnlyP, vector=None):
    """Count the tokens of ``text`` into ``vector``.

    The text is processed in segments of at most MAX_SEGMENT characters
    (segment ends are chosen by ``findBreak``); each segment is split on
    WORD_REGEX and every non-empty token is counted.

    Args:
        text: the string to tokenize.
        wordsOnlyP: when true, only tokens accepted by ``looksLikeWord``
            are counted.
        vector: dict to accumulate counts into; a fresh dict is created
            when omitted.

    Returns:
        ``vector``, mapping each token to its number of occurrences.
    """
    # A dict() default would be created once and shared by every call that
    # omits the argument, leaking counts from one call into the next.
    if vector is None:
        vector = dict()

    textLen = len(text)
    segStart = 0
    # Loop until the text is exhausted. Segments may end earlier than
    # MAX_SEGMENT, so a segment count computed up front can stop before
    # the tail of the text has been seen.
    while segStart < textLen:
        if textLen - segStart <= MAX_SEGMENT:
            # the remainder fits in one segment: take it whole
            segEnd = textLen - 1
        else:
            segEnd = findBreak(segStart, MAX_SEGMENT, text)
        for token in WORD_REGEX.split(text[segStart:segEnd + 1]):
            if not token:
                continue
            if not wordsOnlyP or looksLikeWord(token):
                incCount(vector, token, 1)
        segStart = segEnd + 1
    return vector
|
|
|
|
|
|
def looksLikeWord(token):
    """Tell whether ``token`` has the shape of an ordinary word.

    A word is longer than two characters, purely alphabetic, and is either
    all one case or contains exactly one uppercase letter.
    """
    if not all(c.isalpha() for c in token):
        return False
    upper = sum(1 for c in token if c.isupper())
    # every alphabetic character that is not uppercase counts as lowercase
    lower = len(token) - upper
    if len(token) <= 2:
        return False
    return upper in (0, 1) or lower == 0
|
|
|
|
def incCount(map, val, count):
    """Add ``count`` to the tally kept for ``val`` in ``map``.

    A missing entry is created with ``count`` as its initial value.
    """
    if val not in map:
        map[val] = count
        return
    map[val] += count
|
|
|
|
|
|
def safeAppend(list, val):
    """Append ``val`` to ``list`` unless it is already present."""
    if val in list:
        return
    list.append(val)
|
|
|
|
################################### END COPIED FROM DCUTILS.PY
|
|
|
|
def isInt(token):
    """Tell whether ``token`` starts with a digit and parses as an integer.

    Tokens with a leading sign or leading whitespace are rejected by the
    first-character check even though ``int()`` would accept them.
    """
    if len(token) > 0 and token[0].isdigit():
        try:
            int(token)
        except (TypeError, ValueError):
            # Not a number after all. Only conversion failures are caught,
            # so unrelated errors and interrupts are no longer swallowed.
            return False
        return True
    return False
|
|
|
|
def caseSame(caseSource, textSource):
    """Return ``textSource`` re-cased to follow ``caseSource``.

    For every index of ``caseSource``: an uppercase character there
    uppercases the character of ``textSource`` at the same index, a
    lowercase one lowercases it, and anything else leaves it as-is. The
    result covers only the indices of ``caseSource``; a shorter
    ``textSource`` raises IndexError.
    """
    pieces = []
    for i, casech in enumerate(caseSource):
        textch = textSource[i]
        if casech.isupper():
            pieces.append(textch.upper())
        elif casech.islower():
            pieces.append(textch.lower())
        else:
            pieces.append(textch)
    return "".join(pieces)
|
|
|
|
|
|
def scrubValue(result, val, isRaw, allterms, replacements, publicTerms, privateTerms, nameTerms, timeInfoTuplet):
    """Anonymize one field value token by token.

    ``val`` is split on WORD_SPLIT (separators are kept as tokens) and each
    token is either kept or swapped for its replacement; the pieces are
    then joined back together.

    Args:
        result: the result the value belongs to; a token whose lowercase
            form is a key of ``result`` with a non-None value is kept.
        val: the string to scrub.
        isRaw: when true, date/time regions are located in ``val`` and
            integers and public terms starting inside them are kept.
        allterms: dict updated with the count of every token seen.
        replacements: dict mapping a lowercase token to its replacement;
            new replacements are stored here so a term is always replaced
            the same way.
        publicTerms: terms that may stay readable.
        privateTerms: terms that must be replaced even if public.
        nameTerms: name pool passed on to ``_generateReplacement``.
        timeInfoTuplet: compiled time/date patterns, used when ``isRaw``.

    Returns:
        The scrubbed string.
    """
    # date/time spans are only searched for in the raw event text
    regions = findAllDatesAndTimes(val, timeInfoTuplet) if isRaw else []

    position = 0
    newtokens = list()
    for token in re.split(WORD_SPLIT, val):
        lower = token.lower()
        incCount(allterms, token, 1)

        isPublic = lower in publicTerms and lower not in privateTerms
        # Inside a date region only numbers and public terms are protected.
        # The double check is needed because date regions sometimes carry
        # extraneous text when the regex match contains a noise term or an
        # end-of-expression match.
        keptAsDate = inRegions(position, regions) and (isInt(token) or isPublic)

        newtoken = token
        # only touch the term if it is not the name of an attribute of the
        # result and is not protected by a date region
        if result.get(lower) is None and not keptAsDate:
            # make a replacement if we have none yet and the term is
            # private or not public
            if lower not in replacements and not isPublic:
                replacements[lower] = _generateReplacement(token, nameTerms)
            newtoken = caseSame(token, replacements.get(lower, token))

        position += len(token)
        newtokens.append(newtoken)
    return ''.join(newtokens)
|
|
|
|
def scrub(results, publictermsfilename, privatefilename, nametermsfilename, dictionaryfilename, timestampconfigfilename):
    """Anonymize the eligible fields of every result, in place.

    Term lists are loaded (lower-cased) from the given files, then each
    field value is rewritten through ``scrubValue``. Fields are skipped
    when they are protected, start with ``date_``, or start with ``_``
    (``_raw`` excepted). Nothing is returned; ``results`` is modified.
    """
    replacements = dict()
    privateTerms = fileWords(privatefilename, True)
    publicTerms = fileWords(dictionaryfilename, True)
    # user-supplied public terms are merged into the default dictionary
    publicTerms.update(fileWords(publictermsfilename, True))
    nameTerms = lengthLists(fileWords(nametermsfilename, True))

    protectedKeys = {"eventtype", "linecount", "punct", "sourcetype", "timeendpos", "timestartpos"}

    timeInfoTuplet = getTimeInfoTuplet(timestampconfigfilename)
    allterms = dict()

    for r in results:
        for key, val in r.items():
            if key in protectedKeys or key.startswith("date_"):
                continue
            # internal fields are left alone, except for the raw event text
            if key.startswith("_") and key != "_raw":
                continue
            r[key] = scrubValue(r, val, key == "_raw", allterms, replacements,
                                publicTerms, privateTerms, nameTerms, timeInfoTuplet)
|
|
|
|
def locate_anonymize_file(filename, app_dir, err_collection):
    """Resolve a bare anonymizer file name to an existing path.

    The app's ``anonymizer`` directory (when ``app_dir`` is given) is tried
    first, then the path built by
    ``make_splunkhome_path(['etc', 'anonymizer', filename])``.

    Args:
        filename: bare file name; anything containing '/', '\\' or '..'
            is rejected.
        app_dir: directory of the app to look in first, or a false value.
        err_collection: list receiving a message when the name is rejected
            or the file cannot be found.

    Returns:
        The path of the file, or None after appending an error message.
    """
    # paths aren't accepted
    if any(fragment in filename for fragment in ("/", "\\", "..")):
        template = ("Pathnames are not accepted for any of the filename arguments. " +
                    "The file specifier '%s' is not permitted.")
        err_collection.append(template % filename)
        return None

    anonymize_dir = 'anonymizer'
    app_file_path = None
    if app_dir:
        app_file_path = os.path.join(app_dir, anonymize_dir, filename)
        if os.path.isfile(app_file_path):
            return app_file_path

    global_file_path = make_splunkhome_path(['etc', anonymize_dir, filename])
    if os.path.isfile(global_file_path):
        return global_file_path

    # we couldn't find the file, so report every place that was checked
    parts = ["The filename '%s' could not be found in the " % filename]
    if app_dir:
        parts.append("app or ")
    parts.append("the global directory. Checked ")
    if app_dir:
        parts.append("'%s' and " % app_file_path)
    parts.append("'%s', but did not locate the file." % global_file_path)
    err_collection.append("".join(parts))
    return None
|
|
|
|
if __name__ == '__main__':
    # Entry point of the search command: read the incoming results, resolve
    # the configuration files, scrub the results and write them back. Any
    # problem is reported to Splunk as error results rather than a crash.
    try:
        results, dummyresults, settings = splunk.Intersplunk.getOrganizedResults()

        # if this is nonempty later, we'll write it out as error
        err_results = []

        # DEFAULT CONFIG FILE NAMES
        publictermsfilename = "public-terms.txt"
        privatetermsfilename = "private-terms.txt"
        nametermsfilename = "names.txt"
        dictionaryfilename = "dictionary.txt"
        timestampconfigfilename = 'anonymizer-time.ini'

        # GET ARGS
        keywords, argvals = splunk.Intersplunk.getKeywordsAndOptions()

        # ALLOW ARGS TO OVERRIDE DEFAULTS
        publictermsfilename = argvals.get("public-terms", publictermsfilename)
        privatetermsfilename = argvals.get("private-terms", privatetermsfilename)
        nametermsfilename = argvals.get("name-terms", nametermsfilename)
        dictionaryfilename = argvals.get("dictionary", dictionaryfilename)
        timestampconfigfilename = argvals.get("time-config", timestampconfigfilename)

        # locate the files
        app = argvals.get("namespace")

        # first find the app, if it exists
        app_dir = None
        if app:
            if "/" in app or "\\" in app or ".." in app:
                msg = "Error: namespace name may not include the '/' '\\' or '..' sequences"
                err_results.append(msg)
            else:
                app_dir = make_splunkhome_path(['etc', 'apps', app])
                if not os.path.isdir(app_dir):
                    # fall back to the app directory name reported by the CLI library
                    app_dir = make_splunkhome_path(['etc', comm.getAppDir(), app])
                    if not os.path.isdir(app_dir):
                        msg = "Error: could not find specified app '%s' on disk" % app
                        err_results.append(msg)
                        app_dir = None

        # now find each file in either the app or the global dir
        publicterms_path = locate_anonymize_file(publictermsfilename,
                                                 app_dir, err_results)
        privateterms_path = locate_anonymize_file(privatetermsfilename,
                                                  app_dir, err_results)
        nameterms_path = locate_anonymize_file(nametermsfilename,
                                               app_dir, err_results)
        dictionary_path = locate_anonymize_file(dictionaryfilename,
                                                app_dir, err_results)
        timestampconfig_path = locate_anonymize_file(timestampconfigfilename,
                                                     app_dir, err_results)

        # only scrub when every file was located without error
        if not err_results:
            scrub(results, publicterms_path, privateterms_path,
                  nameterms_path, dictionary_path, timestampconfig_path)

    except Exception:
        # Top-level boundary: turn any failure into error results. Catching
        # Exception (not a bare except) lets SystemExit and
        # KeyboardInterrupt propagate instead of being reported as errors.
        import traceback
        stack = traceback.format_exc()
        results = splunk.Intersplunk.generateErrorResults("Error : Traceback: " + str(stack))
        # the traceback already is the output; drop any collected messages
        err_results = []

    # pass back explicitly determined errors
    if err_results:
        results = splunk.Intersplunk.generateErrorResults("\n".join(err_results))
    splunk.Intersplunk.outputResults(results)
|