#!/usr/bin/env python
# This work contains trade secrets and confidential material of Splunk Inc.,
# and its use or disclosure in whole or in part without the express written
# permission of Splunk Inc. is prohibited.

import os, re, random, glob
from functools import cmp_to_key
from splunk.mining.DateParser import _validateDate, _validateTime

# set of values that imply an attribute-value pair
# 'regex' : (attribute [, value]) -- if no value is specified, the matched text itself is used as the value
valuesMap = {
    'macintosh' : ['os'],
    'windows' : ['os'],
    'linux' : ['os'],
    'netscape' : ['browser'],
    'mozilla' : ['browser', 'firefox'],
    'firefox' : ['browser'],
    r'\Wie\W ' : ['browser', 'ie'],
    'php' : ['language'],
    'java' : ['language'],
    'python' : ['language'],
    r'c\+\+' : ['language'],
    r'\wperl\w' : ['language', 'perl'],
}
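
# Illustrative examples (not from the original source): with the mapping above,
# extractKeywords() records the keyword 'linux' in a log line as os=linux (the
# matched text is the value), while 'mozilla' is recorded as browser=firefox
# (an explicit value is given for that pattern).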

# common regexes used
#start = '(?:^|[~`!@#$%&*()\.,?/;:\'\"]\s)\s*'
start = r'(?:^|[~`!@#$%&*()\.,?/;:\'\"])\s*'
ending = r'(?:$|[ ~`!@#$%&*()\.,?/;:\'\"])'

regexMap = {
    ## ips (63.215.194.99)
    'ip' : r'(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
    ## email address
    'email' : r'(?:^|\s|\()(?P<email>[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4})',
    ## url
    'url' : r'(?P<url>(ftp|http|https|gopher|mailto|news|nntp|telnet|wais|file|prospero|aim|webcal):(([A-Za-z0-9$_.+!*(),;/?:@&~=-])|%[A-Fa-f0-9]{2})+(#([a-zA-Z0-9][a-zA-Z0-9$_.+!*(),;/?:@&~=%-]*))?)',
    'java exception class' : r'\sat (?P<class>[\w\.$_-]+)\(',
    ## attr=value (space or ; separated)
    ## attr:value (whitespace)
    #'nv1' : start + '(?P<attr>[a-z]\w+)=(?P<value>(?:\w?[@!:\.+-_]?\w)+)[\s,;.]', #[^\s=;,>)\]}])*)(?!=)' + '(?:$|[ ~`!@#$%&*()\.,?/;\'\"])',
    #'nv1' : start + '(?P<attr>[a-z]\w+)=(?P<value>(?:\w+))[\s,;.]', #[^\s=;,>)\]}])*)(?!=)' + '(?:$|[ ~`!@#$%&*()\.,?/;\'\"])',
    'nv2' : start + r'(?P<attr>[a-z]\w+):(?P<value>(?:\w[^\s:;,>)\]}])+)(?!:)' + ending,
    # punct|start words : number words punct|end
    'nv3a' : start + r'(?P<attr>[a-z][\w_ -]+)\s*[:]\s*(?P<value>[0-9\.-]*[0-9]\s*[a-z][\w_ ]+)(?!:)', # // + ending,
    'nv3b' : start + r'(?P<attr>[a-z][\w_ -]+)\s*[=]\s*(?P<value>[0-9\.-]*[0-9]\s*[a-z][\w_ ]+)(?!=)', # // + ending,
    # punct|start words : number punct|end
    'nv4' : start + r'(?P<attr>[a-z][\w_ -]+)\s*:\s*(?P<value>[0-9\.-]*[0-9])(?!:)' + ending,
    'nv5' : start + r'(?P<attr>[a-z][\w_ -]+)\s*=\s*(?P<value>[0-9+:\.-]*[0-9])', #(?!=)' + ending,
    # to=<sdfsdfsdfds>
    'nv6a' : start + r'(?P<attr>[a-z][\w_ -]+)=\<(?P<value>.+)\>', # + ending
    'nv6b' : start + r'(?P<attr>[a-z][\w_ -]+)=\((?P<value>.+)\)', # + ending
    'nv6c' : start + r'(?P<attr>[a-z][\w_ -]+)=\[(?P<value>.+)\]', # + ending
    # default: word=word
    'nv7' : start + r'(?P<attr>[a-z][\w_ -]+)=(?P<value>[\w_-]+)', # + ending
}

# precompile every pattern once at import time so extraction never recompiles per call
compiledRegexMap = dict()
for thisType in regexMap.keys():
    compiledRegexMap[thisType] = re.compile(regexMap[thisType], re.I)

compiledValuesMap = dict()
for thisRegex in valuesMap.keys():
    compiledValuesMap[thisRegex] = re.compile(thisRegex, re.I)

def extractValues(text):
    result = dict()
    extractKeywords(result, text)
    # print("compiledRegexMap %s" % compiledRegexMap)
    for atype in compiledRegexMap.keys():
        expression = compiledRegexMap[atype]
        matches = expression.findall(text)
        if matches:
            #print("Matches: %s %s" % (atype, matches))
            if len(matches[0]) == 2:  ## attr/value if the regex has two groups
                for attr, val in matches:
                    #sys.stdout.write("%s = %s ," % (attr, val))
                    #result[attr] = val
                    addToMapSet(result, attr, val)
            else:
                for val in matches:
                    if type(val) != str:
                        val = val[0]
                    #print('MATCHES: %s VAL: %s' % (matches, val))
                    addToMapSet(result, atype, val)
    return result
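
# Illustrative usage (a sketch, not from the original source). Exact output
# depends on which patterns fire, but a line like the following would yield
# roughly:
#   extractValues('user=bob logged in from 10.1.2.3 on linux')
#   -> {'os': {'linux'}, 'user': {'bob'}, 'ip': {'10.1.2.3'}}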

def extractKeywords(result, text):
    for regex in valuesMap.keys():
        expression = compiledValuesMap[regex]
        matches = expression.findall(text)
        if matches:
            values = valuesMap[regex]
            for val in matches:
                if len(values) == 1:
                    addToMapSet(result, values[0], val)
                else:
                    addToMapSet(result, values[0], values[1])

def addToMapSet(map, key, value):
    if key in map:
        s = map[key]
    else:
        s = set()
        map[key] = s

    doomed = list()
    for item in s:
        # if an existing value is a substring of the new value, mark it for deletion
        if item in value:
            doomed.append(item)
        # if the value to add is a substring of an existing value, ignore it
        if value in item:
            return s
    for gone in doomed:
        s.remove(gone)
    s.add(value)
    return s
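
# Illustrative behavior (not from the original source): values that are
# substrings of other values for the same key are collapsed, e.g.
#   m = dict(); addToMapSet(m, 'host', 'foo'); addToMapSet(m, 'host', 'foobar')
#   -> m == {'host': {'foobar'}}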


WATERMARK = "SPLUNK"
SPLUNK_ENTITY = "SPLUNK-COM"
WORD_REGEX = re.compile(r'[^a-zA-Z0-9]+')
WORD_SPLIT = re.compile(r'([^a-zA-Z0-9]+)')
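
# Note: WORD_REGEX drops the non-alphanumeric separators, while WORD_SPLIT's
# capturing group keeps them, so ''.join(WORD_SPLIT.split(line)) reconstructs
# the original line; the scrubber relies on this to rewrite tokens in place.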

def _generateReplacement(term, nameterms):
    replacement = ""
    if looksLikeWord(term):
        # get the list of names with the same length as the term
        names = nameterms.get(len(term), None)
        if names is not None:
            nameCount = len(names)
            if nameCount > 0:
                index = random.randint(1, nameCount)
                replacement = names[index-1]
                del names[index-1]
                return replacement

    for ch in term:
        if ch.isdigit():
            # return a new number that is randomly less than the given value, so that ip addresses and codes
            # are not higher than the value given. otherwise we will get ip addresses like 554.785.455.545.
            # this assumes that if given a number, a number lower than it will be equally valid
            maxVal = int(ch)
            newch = str(random.randint(0, maxVal))
        elif ch.isalpha():
            if ch.islower():
                newch = chr(random.randint(97, 122))  # random lowercase ascii letter
            else:
                newch = chr(random.randint(65, 90))   # random uppercase ascii letter
        else:
            newch = ch
        replacement += newch
    return replacement
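
# Illustrative behavior (a sketch; actual output is random): a word term is
# swapped for a random same-length name when one is available (the name is then
# removed so it is not reused); otherwise each digit d becomes a random digit in
# [0, d] and each letter becomes a random letter of the same case, so
# '10.3.7.9' might become '10.2.4.5' and 'Host42' might become 'Qbrw31'.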

def allAlpha(token):
    for c in token:
        if not c.isalpha():
            return False
    return True

def lengthLists(terms):
    result = dict()
    for key in terms.keys():
        addToMapList(result, len(key), key)
    return result


def watermark(terms, replacements, mark):
    # replace exactly one word term of the same length as the mark, so the
    # scrubbed output carries an identifiable watermark
    marklen = len(mark)
    for term in terms:
        if len(term) == marklen and looksLikeWord(term) and term in replacements:
            replacements[term] = mark
            break

def parentDirectory(filename):
    try:
        return filename[ : filename.rindex(os.sep)]
    except ValueError:
        return "."

def fileNameNoDirectory(filename):
    try:
        return filename[ filename.rindex(os.sep) + 1 :]
    except ValueError:
        return filename

############################# DATEFINDER

def findAllDatesAndTimes(text, timeInfoTuplet):
    global today, _MIN_YEAR, _MAX_YEAR

    timeExpressions = timeInfoTuplet[0]
    dateExpressions = timeInfoTuplet[1]
    matches = getAllMatches(text, dateExpressions, _validateDate)
    matches.extend(getAllMatches(text, timeExpressions, _validateTime))
    return matches


def getAllMatches(text, expressions, validator):
    index = -1
    matches = list()
    for expression in expressions:
        index += 1
        for match in expression.finditer(text):
            values = match.groupdict()
            isvalid = validator(values)
            if isvalid:
                #print("MATCHED: %s" % match.group())
                matches.append(match.span())
    # DOING ALL EXPRESSIONS FOR OPTIMIZATION DOES NOTHING.
    # # DC: WE HAVE A VALID MATCH, AND IT WASN'T THE FIRST EXPRESSION,
    # # MAKE THIS PATTERN THE FIRST ONE TRIED FROM NOW ON
    # if index > 0:  # optimize search
    #     expressions.insert(0, expressions.pop(index))
    return matches

# return true if position is between any start-end in the list of regions
def inRegions(position, regions):
    for region in regions:
        start = region[0]
        end = region[1]
        if start <= position <= end:
            return True
    return False
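
# Illustrative check (not from the original source):
#   inRegions(5, [(0, 3), (4, 9)]) -> True; inRegions(12, [(0, 3), (4, 9)]) -> False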

def compilePatterns(formats):
    compiledList = list()
    for format in formats:
        #print(str(format))
        compiledList.append(re.compile(format, re.I))
    return compiledList

def getTimeInfoTuplet(timestampconfigfilename):
    text = readText(timestampconfigfilename)
    text = text.replace('\\n', '\n').replace('\n\n', '\n')
    our_locals = dict()
    exec(text, globals(), our_locals)
    timePatterns = our_locals.get('timePatterns', None)
    datePatterns = our_locals.get('datePatterns', None)
    minYear = our_locals.get('minYear', None)
    maxYear = our_locals.get('maxYear', None)
    compiledTimePatterns = compilePatterns(timePatterns)
    compiledDatePatterns = compilePatterns(datePatterns)
    timeInfoTuplet = [compiledTimePatterns, compiledDatePatterns, minYear, maxYear]
    return timeInfoTuplet
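
# Sketch of the expected config shape (assumed, based on the names read above):
# the timestamp config file is exec'd as Python and should define, e.g.:
#   timePatterns = [r'(?P<hours>\d{1,2}):(?P<minutes>\d{2}):(?P<seconds>\d{2})']
#   datePatterns = [r'(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})']
#   minYear = 1900
#   maxYear = 2100
# The example patterns and year bounds here are hypothetical.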

############################# DATEFINDER

################################### BEGIN COPIED FROM DCUTILS.PY

def addToMapList(map, key, value):
    if key in map:
        l = map[key]
    else:
        l = list()
        map[key] = l
    safeAppend(l, value)
    return l


def fileWords(filename, lowercase):
    terms = dict()
    try:
        f = open(filename, 'r')
        count = 1
        while True:
            line = f.readline()
            if lowercase:
                line = line.lower()
            if len(line) == 0:
                break
            tokenize(line, False, terms)
            if count % 100000 == 0:
                print('\t%u processed...' % count)
            count += 1
        f.close()
    except Exception as e:
        print('*** Error reading file %s and getting terms: %s' % (filename, e))
    return terms


def readText(filename):
    try:
        f = open(filename, 'r')
        text = f.read()
        f.close()
        return text
    except Exception as e:
        print('*** Error reading file %s: %s' % (filename, e))
    return ""

def writeText(filename, text):
    try:
        f = open(filename, 'w')
        f.write(text)
        f.close()
    except Exception as e:
        print('*** Error writing file %s: %s' % (filename, e))

MAX_SEGMENT = 1024

def findBreak(start, segSize, text):
    segEnd = start + segSize - 1
    if segEnd >= len(text):
        return len(text) - 1
    for end in range(segEnd, max(start+1, segEnd-100), -1):
        if not text[end].isalnum():
            return end
    # failed to find a break by going back 100 chars. give up and break arbitrarily.
    return segEnd

# returns a map of terms and phrases to their counts
def tokenize(text, wordsOnlyP, vector=None):
    if vector is None:  # avoid a shared mutable default argument
        vector = dict()
    segCount = int((len(text) + MAX_SEGMENT-1) / MAX_SEGMENT)
    segStart = 0

    for seg in range(0, segCount):
        segEnd = findBreak(segStart, MAX_SEGMENT, text)
        segText = text[segStart:segEnd+1]
        tokens = WORD_REGEX.split(segText)
        for token in tokens:
            if len(token) == 0:
                continue
            if not wordsOnlyP or looksLikeWord(token):
                incCount(vector, token, 1)
        segStart = segEnd+1
    return vector
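
# Illustrative usage (not from the original source): counts every run of
# alphanumerics, e.g.
#   tokenize('user=bob user=ann', False) -> {'user': 2, 'bob': 1, 'ann': 1}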


def looksLikeWord(token):
    upper = lower = 0
    for c in token:
        if not c.isalpha():
            return False
        if c.isupper():
            upper += 1
        else:
            lower += 1
    return len(token) > 2 and (upper == 0 or lower == 0 or upper == 1)
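
# Illustrative results (not from the original source): a "word" is 3+ letters
# that are all lower, all upper, or capitalized, so:
#   looksLikeWord('error')  -> True    looksLikeWord('Error') -> True
#   looksLikeWord('CPU')    -> True    looksLikeWord('eth0')  -> False
#   looksLikeWord('ReadMe') -> False   looksLikeWord('ok')    -> False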

def incCount(map, val, count):
    if val in map:
        map[val] += count
    else:
        map[val] = count


def safeAppend(list, val):
    if val not in list:
        list.append(val)

################################### END COPIED FROM DCUTILS.PY

def suggestOtherPrivateTerms(scrubeefilename, privateTerms, publicTerms):
    import synonyms
    recommendedAlready = set()
    # for each private term
    for term in privateTerms:
        # find synonyms like it
        suggestions = synonyms.learnTerms(scrubeefilename, [term], 100, 100)
        if suggestions is not None:
            keepers = set()
            # for each synonym
            for sug in suggestions:
                # if it's a public term, there is a danger it is really private.
                # non-public terms are not dangerous, as they will automatically be scrubbed.
                # ...also check that we haven't already recommended it
                if sug in publicTerms and sug not in recommendedAlready and sug not in privateTerms:
                    keepers.add(sug)  # keep it
                    recommendedAlready.add(sug)

            if len(keepers) >= 1:
                prettyKeepers = ', '.join(keepers)
                print('You\'ve specified (%s) as a private term. You might want to also consider:\n\t%s' % (term, prettyKeepers))


# returns terms that occur between min and max times.
def getBestTerms(terms, minCount=0, maxCount=99999999999):
    tokensAndCounts = list(terms.items())
    tokensAndCounts.sort(key=cmp_to_key(lambda x, y: y[1] - x[1]))  # sort by count, descending
    result = list()
    for i in range(0, len(terms)):
        count = tokensAndCounts[i][1]
        if minCount <= count <= maxCount:
            result.append(tokensAndCounts[i][0])
    return result
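
# Illustrative usage (not from the original source):
#   getBestTerms({'a': 5, 'b': 1, 'c': 3}, minCount=2) -> ['a', 'c']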

def suggestTermsByFreq(terms, privateTerms, publicTerms):
    nonuniqueTerms = getBestTerms(terms, 2)
    privateresult = list()
    publicresult = list()
    for term in nonuniqueTerms:
        if looksLikeWord(term):
            lower = term.lower()
            if lower not in privateTerms and lower in publicTerms and lower not in privateresult:
                privateresult.append(lower)
            if lower not in publicTerms and lower not in privateTerms and lower not in publicresult:
                publicresult.append(lower)
    return privateresult, publicresult

def isInt(token):
    if len(token) > 0 and token[0].isdigit():
        try:
            int(token)
            return True
        except ValueError:
            pass
    return False

def caseSame(caseSource, textSource):
    result = ""
    for i in range(0, len(caseSource)):
        casech = caseSource[i]
        textch = textSource[i]
        if casech.isupper():
            textch = textch.upper()
        elif casech.islower():
            textch = textch.lower()
        result += textch
    return result
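
# Illustrative usage (not from the original source): copies the case pattern of
# the first argument onto the second, e.g. caseSame('AbC', 'xyz') -> 'XyZ'.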


def getNamedEntities(logfiles):
    print('Getting named entities')
    names = set()
    for logfile in logfiles:
        try:
            f = open(logfile, 'r')
            count = 1
            print('\tProcessing %s' % logfile)
            while True:
                line = f.readline()
                if len(line) == 0:
                    break
                if count > 100000:
                    print('\tStopping named entity extractor after %u lines.' % count)
                    break
                if '=' in line:  # this check speeds things up, but potentially loses some name-value pairs
                    nes = extractValues(line)
                    for n in nes.keys():
                        names.add(n)
                count += 1
            f.close()
        except Exception as e:
            print('*** Problem with named entity extraction on file: %s\nSkipping %s ...' % (e, logfile))
    return names

def scrub(logpath, publictermsfilename, privatefilename, nametermsfilename, dictionaryfilename, timestampconfigfilename, corporateEntity):

    try:

        replacements = dict()
        # load private terms
        privateTerms = fileWords(privatefilename, True)
        # load default public terms
        publicTerms = fileWords(dictionaryfilename, True)
        # load user-specific public terms
        userpublicTerms = fileWords(publictermsfilename, True)
        # load personal name terms
        nameTerms = lengthLists(fileWords(nametermsfilename, True))
        # add user public terms to the default public terms
        for t in userpublicTerms:
            publicTerms[t] = userpublicTerms[t]

        logfiles = glob.glob(logpath)
        if len(logfiles) == 0:
            print('Unable to find any files with specification %s' % logpath)
            return -1
        print('Processing files: %s' % logfiles)

        namedEntities = getNamedEntities(logfiles)
        print('Adding named entities to list of public terms: %s' % namedEntities)
        # add named entities to the default public terms
        for t in namedEntities:
            publicTerms[t] = 10000


        allterms = dict()
        # FOR EACH FILE TO PROCESS, BUILD UP THE REPLACEMENT MAPPING
        for logfile in logfiles:
            print('\tProcessing %s for terms.' % logfile)
            terms = fileWords(logfile, False)
            if terms is None:
                continue
            #text = readText(logfile)
            #terms = tokenize(text, False)
            #wordterms = tokenize(text, True)
            print('\tCalculating replacements for %u terms.' % len(terms))
            for term in terms:
                # add term and count to allterms
                incCount(allterms, term, terms[term])

                lower = term.lower()
                # if we haven't already made a replacement for this term, and it's a private term or not a public term
                if lower not in replacements and (lower in privateTerms or lower not in publicTerms):
                    replacements[lower] = _generateReplacement(term, nameTerms)  # make a replacement term

        if corporateEntity is not None:
            corporateLower = corporateEntity.lower()
            publicTerms[corporateLower] = 10000
            publicTerms[WATERMARK] = 10000
            if corporateLower in replacements:
                del replacements[corporateLower]

        watermark(allterms, replacements, WATERMARK)
        timeInfoTuplet = getTimeInfoTuplet(timestampconfigfilename)

        directory = parentDirectory(logpath)
        mappingfilename = directory + os.sep + 'INFO-mapping.txt'
        replacetext = "Replacement Mappings\n--------------------\n"
        for item in replacements.items():
            replacetext += item[0] + " --> " + item[1] + "\n"
        writeText(mappingfilename, replacetext)
        print('===================================================')
        print('Wrote dictionary scrubbed terms with replacements to \"' + mappingfilename + '\"')

        #print('===================================================')
        #suggestOtherPrivateTerms(scrubeefilename, privateTerms, publicTerms)
        privateSuggestions, publicSuggestions = suggestTermsByFreq(allterms, privateTerms, publicTerms)
        suggestionsfilename = directory + os.sep + 'INFO-suggestions.txt'
        suggestText = "Terms to consider making private (currently not scrubbed):\n\n" + str(privateSuggestions) + "\n\n\nTerms to consider making public (currently scrubbed):\n\n" + str(publicSuggestions) + "\n"
        writeText(suggestionsfilename, str(suggestText))
        print('Wrote suggestions for dictionary to \"' + suggestionsfilename + '\"')
        print('===================================================')

        for logfile in logfiles:
            anonfilename = directory + os.sep + "ANON-" + fileNameNoDirectory(logfile)
            print("Writing out %s" % anonfilename)
            count = 1
            try:
                fout = open(anonfilename, 'w')
                fin = open(logfile, 'r')
                while True:
                    if count % 100000 == 0:
                        print('\t%u processed...' % count)
                    line = fin.readline()
                    if len(line) == 0:
                        break
                    line = line.rstrip('\n')  # strip the trailing newline

                    regions = findAllDatesAndTimes(line, timeInfoTuplet)
                    position = 0
                    tokens = WORD_SPLIT.split(line)
                    newtokens = list()
                    for token in tokens:
                        lower = token.lower()
                        inDateRegion = inRegions(position, regions)
                        # IF WE'RE IN A DATE REGION AND IT'S A NUMBER OR A PUBLIC WORD, KEEP IT.
                        # WE NEED TO DOUBLE-CHECK FOR NUMBERS OR PUBLIC TERMS BECAUSE DATE REGIONS SOMETIMES
                        # HAVE EXTRANEOUS TEXT IF THE REGEX MATCH CONTAINS A NOISE TERM OR END-OF-EXPRESSION MATCH
                        if inDateRegion and (isInt(token) or (lower in publicTerms and lower not in privateTerms)):
                            #print("leaving: %s alone as it's part of a date." % token)
                            newtoken = token
                        else:
                            newtoken = replacements.get(lower, token)
                            newtoken = caseSame(token, newtoken)

                        position += len(token)
                        newtokens.append(newtoken)
                    newline = ''.join(newtokens)
                    if corporateEntity is not None:
                        newline = newline.replace(corporateEntity, SPLUNK_ENTITY)

                    fout.write(newline)
                    fout.write('\n')
                    count += 1
                fin.close()
                fout.close()
            except Exception as e:
                print('*** Scrubber error: %s\nSkipping %s ...' % (e, logfile))
                #import traceback
                #traceback.print_exc()
        print("Done.")
    except Exception as e:
        print('*** Scrubber error: %s' % e)
        import traceback
        traceback.print_exc()

__source = "-source"
__publicTerms = "-public-terms"
__privateTerms = "-private-terms"
__nameTerms = "-name-terms"
__timestampConfig = "-timestamp-config"
__dictionary = "-dictionary"
__corpEntity = "-corp-entity"
__fileSpecification = "<file specification>"


if __name__ == '__main__':
    import sys
    argc = len(sys.argv)
    argv = sys.argv

    scrubeefilename = None
    from splunk.clilib.bundle_paths import make_splunkhome_path
    root = make_splunkhome_path(['etc', 'anonymizer'])
    publictermsfilename = os.path.join(root, "public-terms.txt")
    privatetermsfilename = os.path.join(root, "private-terms.txt")
    nametermsfilename = os.path.join(root, "names.txt")
    dictionaryfilename = os.path.join(root, "dictionary.txt")
    timestampconfigfilename = os.path.join(root, 'anonymizer-time.ini')
    corporateEntity = None

    i = 1
    while i < argc-1:
        if argv[i] == __source:
            scrubeefilename = argv[i+1]
        elif argv[i] == __publicTerms:
            publictermsfilename = argv[i+1]
        elif argv[i] == __privateTerms:
            privatetermsfilename = argv[i+1]
        elif argv[i] == __nameTerms:
            nametermsfilename = argv[i+1]
        elif argv[i] == __dictionary:
            dictionaryfilename = argv[i+1]
        elif argv[i] == __timestampConfig:
            timestampconfigfilename = argv[i+1]
        elif argv[i] == __corpEntity:
            corporateEntity = argv[i+1]
        else:
            # unrecognized token: back up one so the net advance below is a single position
            i = i - 1

        i = i + 2


    if scrubeefilename:
        scrub(scrubeefilename, publictermsfilename, privatetermsfilename, nametermsfilename, dictionaryfilename, timestampconfigfilename, corporateEntity)
    else:
        print('Simple Usage \n')
        print('\tsplunk anonymize file -source <filespecification> [additional arguments]\n')
        print('\t...for example...\n')
        print('\tsplunk anonymize file -source \'/home/myname/logs/*.log\'\n')

        print('\nAdditional optional arguments:')
        print('\t%s %s' % (__publicTerms, __fileSpecification))
        print('\t%s %s' % (__privateTerms, __fileSpecification))
        print('\t%s %s' % (__nameTerms, __fileSpecification))
        print('\t%s %s' % (__timestampConfig, __fileSpecification))
        print('\t%s %s' % (__dictionary, __fileSpecification))
        print('\t%s %s' % (__corpEntity, "<string>"))